author | Alex Gorrod <alexg@wiredtiger.com> | 2014-12-03 13:36:04 +1100
committer | Alex Gorrod <alexg@wiredtiger.com> | 2014-12-03 13:36:04 +1100
commit | 0512f27b669b8b1a1f4d9f78d38708efd8cdcd87 (patch)
tree | eaf8cfd6effc5823cd6dd275a22394fb01a4f5f9 /src/reconcile
parent | 2bd317c731c00ca7b82663014e5662517e344ba4 (diff)
download | mongo-0512f27b669b8b1a1f4d9f78d38708efd8cdcd87.tar.gz
Move eviction and reconciliation files into their own subdirs.
Diffstat (limited to 'src/reconcile')
-rw-r--r-- | src/reconcile/rec_track.c | 904
-rw-r--r-- | src/reconcile/rec_write.c | 5528
2 files changed, 6432 insertions, 0 deletions
diff --git a/src/reconcile/rec_track.c b/src/reconcile/rec_track.c new file mode 100644 index 00000000000..92282393a23 --- /dev/null +++ b/src/reconcile/rec_track.c @@ -0,0 +1,904 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Estimated memory cost for a structure on the overflow lists, the size of + * the structure plus two pointers (assume the average skip list depth is 2). + */ +#define WT_OVFL_SIZE(s) \ + (sizeof(s) + 2 * sizeof(void *)) + +/* + * __ovfl_track_init -- + * Initialize the overflow tracking structure. + */ +static int +__ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + return (__wt_calloc_def(session, 1, &page->modify->ovfl_track)); +} + +/* + * __ovfl_discard_verbose -- + * Dump information about a discard overflow record. + */ +static int +__ovfl_discard_verbose( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag) +{ + WT_CELL_UNPACK *unpack, _unpack; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 512, &tmp)); + + unpack = &_unpack; + __wt_cell_unpack(cell, unpack); + + WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, + "discard: %s%s%p %s", + tag == NULL ? "" : tag, + tag == NULL ? "" : ": ", + page, + __wt_addr_string(session, unpack->data, unpack->size, tmp))); + +err: __wt_scr_free(&tmp); + return (ret); +} + +#if 0 +/* + * __ovfl_discard_dump -- + * Debugging information. + */ +static void +__ovfl_discard_dump(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_CELL **cellp; + WT_OVFL_TRACK *track; + size_t i; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return; + + track = page->modify->ovfl_track; + for (i = 0, cellp = track->discard; + i < track->discard_entries; ++i, ++cellp) + (void)__ovfl_discard_verbose(session, page, *cellp, "dump"); +} +#endif + +/* + * __ovfl_discard_wrapup -- + * Resolve the page's overflow discard list after a page is written. + */ +static int +__ovfl_discard_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_CELL **cellp; + WT_DECL_RET; + WT_OVFL_TRACK *track; + uint32_t i; + + track = page->modify->ovfl_track; + for (i = 0, cellp = track->discard; + i < track->discard_entries; ++i, ++cellp) { + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_discard_verbose( + session, page, *cellp, "free")); + + /* Discard each cell's overflow item. */ + WT_RET(__wt_ovfl_discard(session, *cellp)); + } + + __wt_free(session, track->discard); + track->discard_entries = track->discard_allocated = 0; + + return (ret); +} + +/* + * __ovfl_discard_wrapup_err -- + * Resolve the page's overflow discard list after an error occurs. + */ +static int +__ovfl_discard_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TRACK *track; + + track = page->modify->ovfl_track; + + __wt_free(session, track->discard); + track->discard_entries = track->discard_allocated = 0; + + return (0); +} + +/* + * __wt_ovfl_discard_add -- + * Add a new entry to the page's list of overflow records that have been + * discarded. 
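The WT_OVFL_SIZE macro above is only an accounting estimate: each list entry is charged for its structure plus two pointers, on the stated assumption that skip-list towers average two levels. A minimal stand-alone illustration of that arithmetic, using a hypothetical entry type rather than the real WT_OVFL_REUSE:

```c
#include <stdio.h>

/* Hypothetical skip-list entry, standing in for WT_OVFL_REUSE. */
struct entry {
	unsigned char addr_size;
	unsigned int value_size;
	struct entry *next[];		/* Variable-depth tower */
};

/* Mirror of WT_OVFL_SIZE: structure size plus two pointers. */
#define	OVFL_SIZE(s)	(sizeof(s) + 2 * sizeof(void *))

int
main(void)
{
	/* On an LP64 system: 8 (struct, padded) + 2 * 8 = 24 bytes. */
	printf("estimated cost: %zu bytes\n", OVFL_SIZE(struct entry));
	return (0);
}
```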
+ */ +int +__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) +{ + WT_OVFL_TRACK *track; + + if (page->modify->ovfl_track == NULL) + WT_RET(__ovfl_track_init(session, page)); + + track = page->modify->ovfl_track; + WT_RET(__wt_realloc_def(session, &track->discard_allocated, + track->discard_entries + 1, &track->discard)); + track->discard[track->discard_entries++] = cell; + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_discard_verbose(session, page, cell, "add")); + + return (0); +} + +/* + * __wt_ovfl_discard_free -- + * Free the page's list of discarded overflow record addresses. + */ +void +__wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TRACK *track; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return; + + track = page->modify->ovfl_track; + + __wt_free(session, track->discard); + track->discard_entries = track->discard_allocated = 0; +} + +/* + * __ovfl_reuse_verbose -- + * Dump information about a reuse overflow record. + */ +static int +__ovfl_reuse_verbose(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_OVFL_REUSE *reuse, const char *tag) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 64, &tmp)); + + WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, + "reuse: %s%s%p %s (%s%s%s) {%.*s}", + tag == NULL ? "" : tag, + tag == NULL ? "" : ": ", + page, + __wt_addr_string( + session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size, tmp), + F_ISSET(reuse, WT_OVFL_REUSE_INUSE) ? "inuse" : "", + F_ISSET(reuse, WT_OVFL_REUSE_INUSE) && + F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? ", " : "", + F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? "just-added" : "", + WT_MIN(reuse->value_size, 40), (char *)WT_OVFL_REUSE_VALUE(reuse))); + +err: __wt_scr_free(&tmp); + return (ret); +} + +#if 0 +/* + * __ovfl_reuse_dump -- + * Debugging information. + */ +static void +__ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_REUSE **head, *reuse; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return; + head = page->modify->ovfl_track->ovfl_reuse; + + for (reuse = head[0]; reuse != NULL; reuse = reuse->next[0]) + (void)__ovfl_reuse_verbose(session, page, reuse, "dump"); +} +#endif + +/* + * __ovfl_reuse_skip_search -- + * Return the first, not in-use, matching value in the overflow reuse list. + */ +static WT_OVFL_REUSE * +__ovfl_reuse_skip_search( + WT_OVFL_REUSE **head, const void *value, size_t value_size) +{ + WT_OVFL_REUSE **e, *next; + size_t len; + int cmp, i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + --i; + --e; + continue; + } + + /* + * Values are not unique, and it's possible to have long lists + * of identical overflow items. (We've seen it in benchmarks.) + * Move through a list of identical items at the current level + * as long as the next one is in-use, otherwise, drop down a + * level. When at the bottom level, return items if reusable, + * else NULL. + */ + len = WT_MIN((*e)->value_size, value_size); + cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len); + if (cmp == 0 && (*e)->value_size == value_size) { + if (i == 0) + return (F_ISSET(*e, + WT_OVFL_REUSE_INUSE) ? 
NULL : *e); + if ((next = (*e)->next[i]) == NULL || + !F_ISSET(next, WT_OVFL_REUSE_INUSE) || + next->value_size != len || memcmp( + WT_OVFL_REUSE_VALUE(next), value, len) != 0) { + --i; /* Drop down a level */ + --e; + } else /* Keep going at this level */ + e = &(*e)->next[i]; + continue; + } + + /* + * If the skiplist value is larger than the search value, or + * they compare equally and the skiplist value is longer than + * the search value, drop down a level, otherwise continue on + * this level. + */ + if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) { + --i; /* Drop down a level */ + --e; + } else /* Keep going at this level */ + e = &(*e)->next[i]; + } + return (NULL); +} + +/* + * __ovfl_reuse_skip_search_stack -- + * Search an overflow reuse skiplist, returning an insert/remove stack. + */ +static void +__ovfl_reuse_skip_search_stack(WT_OVFL_REUSE **head, + WT_OVFL_REUSE ***stack, const void *value, size_t value_size) +{ + WT_OVFL_REUSE **e; + size_t len; + int cmp, i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + stack[i--] = e--; + continue; + } + + /* + * If the skiplist value is larger than the search value, or + * they compare equally and the skiplist value is longer than + * the search value, drop down a level, otherwise continue on + * this level. + */ + len = WT_MIN((*e)->value_size, value_size); + cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len); + if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) + stack[i--] = e--; /* Drop down a level */ + else + e = &(*e)->next[i]; /* Keep going at this level */ + } +} + +/* + * __ovfl_reuse_wrapup -- + * Resolve the page's overflow reuse list after a page is written. + */ +static int +__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BM *bm; + WT_OVFL_REUSE **e, **head, *reuse; + size_t incr, decr; + int i; + + bm = S2BT(session)->bm; + head = page->modify->ovfl_track->ovfl_reuse; + + /* + * Discard any overflow records that aren't in-use, freeing underlying + * blocks. + * + * First, walk the overflow reuse lists (except for the lowest one), + * fixing up skiplist links. + */ + for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) + for (e = &head[i]; *e != NULL;) { + if (F_ISSET(*e, WT_OVFL_REUSE_INUSE)) { + e = &(*e)->next[i]; + continue; + } + *e = (*e)->next[i]; + } + + /* + * Second, discard any overflow record without an in-use flag, clear + * the flags for the next run. + * + * As part of the pass through the lowest level, figure out how much + * space we added/subtracted from the page, and update its footprint. + * We don't get it exactly correct because we don't know the depth of + * the skiplist here, but it's close enough, and figuring out the + * memory footprint change in the reconciliation wrapup code means + * fewer atomic updates and less code overall. 
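Both reuse-list searches above share one traversal shape: start at the highest skip level, move right while the current entry still sorts before the search value, and otherwise step down a level by decrementing the same pointer-to-pointer. A self-contained sketch of that pattern over integer keys; the names and types are illustrative, not WiredTiger's:

```c
#include <stdio.h>
#include <stddef.h>

#define	MAXDEPTH	4

struct node {
	int key;
	struct node *next[MAXDEPTH];	/* Tower: next[i] links level i */
};

/* Find a key: go right while smaller, drop a level on NULL or overshoot. */
static struct node *
skip_search(struct node **head, int key)
{
	struct node **e;
	int i;

	for (i = MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL || (*e)->key > key) {
			--i;			/* Drop down a level */
			--e;
		} else if ((*e)->key == key)
			return (*e);
		else
			e = &(*e)->next[i];	/* Keep going at this level */
	}
	return (NULL);
}

int
main(void)
{
	struct node a = { 5, { NULL } }, b = { 9, { NULL } };
	struct node *head[MAXDEPTH] = { &a, &a, NULL, NULL };

	a.next[0] = &b;			/* 5 -> 9 on the bottom level */

	printf("found %d\n", skip_search(head, 9)->key);
	return (0);
}
```

The `--e` step is the same trick the original relies on: whether `e` points into the head array or into a node's tower, the next-lower level's slot is simply the previous array element.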
+ */ + incr = decr = 0; + for (e = &head[0]; (reuse = *e) != NULL;) { + if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { + if (F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) + incr += WT_OVFL_SIZE(WT_OVFL_REUSE) + + reuse->addr_size + reuse->value_size; + + F_CLR(reuse, + WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED); + e = &(*e)->next[0]; + continue; + } + *e = (*e)->next[0]; + + WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)); + decr += WT_OVFL_SIZE(WT_OVFL_REUSE) + + reuse->addr_size + reuse->value_size; + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET( + __ovfl_reuse_verbose(session, page, reuse, "free")); + WT_RET(bm->free( + bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); + __wt_free(session, reuse); + } + + if (incr > decr) + __wt_cache_page_inmem_incr(session, page, incr - decr); + if (decr > incr) + __wt_cache_page_inmem_decr(session, page, decr - incr); + return (0); +} + +/* + * __ovfl_reuse_wrapup_err -- + * Resolve the page's overflow reuse list after an error occurs. + */ +static int +__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BM *bm; + WT_DECL_RET; + WT_OVFL_REUSE **e, **head, *reuse; + int i; + + bm = S2BT(session)->bm; + head = page->modify->ovfl_track->ovfl_reuse; + + /* + * Discard any overflow records that were just added, freeing underlying + * blocks. + * + * First, walk the overflow reuse lists (except for the lowest one), + * fixing up skiplist links. + */ + for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) + for (e = &head[i]; *e != NULL;) { + if (!F_ISSET(*e, WT_OVFL_REUSE_JUST_ADDED)) { + e = &(*e)->next[i]; + continue; + } + *e = (*e)->next[i]; + } + + /* + * Second, discard any overflow record with a just-added flag, clear the + * flags for the next run. + */ + for (e = &head[0]; (reuse = *e) != NULL;) { + if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) { + F_CLR(reuse, WT_OVFL_REUSE_INUSE); + e = &(*e)->next[0]; + continue; + } + *e = (*e)->next[0]; + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET( + __ovfl_reuse_verbose(session, page, reuse, "free")); + WT_TRET(bm->free( + bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); + __wt_free(session, reuse); + } + return (0); +} + +/* + * __wt_ovfl_reuse_search -- + * Search the page's list of overflow records for a match. + */ +int +__wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, + uint8_t **addrp, size_t *addr_sizep, + const void *value, size_t value_size) +{ + WT_OVFL_REUSE **head, *reuse; + + *addrp = NULL; + *addr_sizep = 0; + + if (page->modify->ovfl_track == NULL) + return (0); + + head = page->modify->ovfl_track->ovfl_reuse; + + /* + * The search function returns the first matching record in the list + * which does not have the in-use flag set, or NULL. + */ + if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL) + return (0); + + *addrp = WT_OVFL_REUSE_ADDR(reuse); + *addr_sizep = reuse->addr_size; + F_SET(reuse, WT_OVFL_REUSE_INUSE); + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim")); + return (1); +} + +/* + * __wt_ovfl_reuse_add -- + * Add a new entry to the page's list of overflow records tracked for + * reuse. 
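The paired wrapup routines above are mirror images driven by the two list flags: after a successful write, anything not marked in-use is dead and both flags are cleared for the next run; after a failed write, only entries the failed run added are unwound. A reduced sketch of that flag discipline over a plain singly-linked list (hypothetical types, with the flag roles kept):

```c
#include <stdlib.h>

#define	INUSE		0x1	/* Selected by the current write */
#define	JUST_ADDED	0x2	/* Created during the current write */

struct rec { unsigned flags; struct rec *next; };

/* On success: free entries no write selected, clear flags on survivors. */
static void
wrapup(struct rec **e)
{
	struct rec *r;

	while ((r = *e) != NULL)
		if (r->flags & INUSE) {
			r->flags &= ~(INUSE | JUST_ADDED);
			e = &r->next;
		} else {
			*e = r->next;
			free(r);
		}
}

/* On error: unwind only the entries the failed write created. */
static void
wrapup_err(struct rec **e)
{
	struct rec *r;

	while ((r = *e) != NULL)
		if (r->flags & JUST_ADDED) {
			*e = r->next;
			free(r);
		} else {
			r->flags &= ~INUSE;
			e = &r->next;
		}
}

int
main(void)
{
	struct rec *head = NULL, *r = calloc(1, sizeof(*r));

	if (r == NULL)
		return (1);
	r->flags = INUSE | JUST_ADDED;
	r->next = head;
	head = r;
	wrapup(&head);		/* r survives with both flags cleared */
	wrapup_err(&head);	/* No-op: nothing just-added remains */
	free(head);
	return (0);
}
```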
+ */ +int +__wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, + const uint8_t *addr, size_t addr_size, + const void *value, size_t value_size) +{ + WT_OVFL_REUSE **head, *reuse, **stack[WT_SKIP_MAXDEPTH]; + size_t size; + u_int i, skipdepth; + uint8_t *p; + + if (page->modify->ovfl_track == NULL) + WT_RET(__ovfl_track_init(session, page)); + + head = page->modify->ovfl_track->ovfl_reuse; + + /* Choose a skiplist depth for this insert. */ + skipdepth = __wt_skip_choose_depth(session); + + /* + * Allocate the WT_OVFL_REUSE structure, next pointers for the skip + * list, room for the address and value, then copy everything into + * place. + * + * To minimize the WT_OVFL_REUSE structure size, the address offset + * and size are single bytes: that's safe because the address follows + * the structure (which can't be more than about 100B), and address + * cookies are limited to 255B. + */ + size = sizeof(WT_OVFL_REUSE) + + skipdepth * sizeof(WT_OVFL_REUSE *) + addr_size + value_size; + WT_RET(__wt_calloc(session, 1, size, &reuse)); + p = (uint8_t *)reuse + + sizeof(WT_OVFL_REUSE) + skipdepth * sizeof(WT_OVFL_REUSE *); + reuse->addr_offset = (uint8_t)WT_PTRDIFF(p, reuse); + reuse->addr_size = (uint8_t)addr_size; + memcpy(p, addr, addr_size); + p += addr_size; + reuse->value_offset = WT_PTRDIFF32(p, reuse); + reuse->value_size = WT_STORE_SIZE(value_size); + memcpy(p, value, value_size); + F_SET(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED); + + /* Insert the new entry into the skiplist. */ + __ovfl_reuse_skip_search_stack(head, stack, value, value_size); + for (i = 0; i < skipdepth; ++i) { + reuse->next[i] = *stack[i]; + *stack[i] = reuse; + } + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_reuse_verbose(session, page, reuse, "add")); + + return (0); +} + +/* + * __wt_ovfl_reuse_free -- + * Free the page's list of overflow records tracked for reuse. + */ +void +__wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_REUSE *reuse; + WT_PAGE_MODIFY *mod; + void *next; + + mod = page->modify; + if (mod == NULL || mod->ovfl_track == NULL) + return; + + for (reuse = mod->ovfl_track->ovfl_reuse[0]; + reuse != NULL; reuse = next) { + next = reuse->next[0]; + __wt_free(session, reuse); + } +} + +/* + * __ovfl_txnc_verbose -- + * Dump information about a transaction-cached overflow record. + */ +static int +__ovfl_txnc_verbose(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 64, &tmp)); + + WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, + "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}", + tag == NULL ? "" : tag, + tag == NULL ? "" : ": ", + page, + __wt_addr_string( + session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp), + txnc->current, + WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc))); + +err: __wt_scr_free(&tmp); + return (ret); +} + +#if 0 +/* + * __ovfl_txnc_dump -- + * Debugging information. + */ +static void +__ovfl_txnc_dump(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TXNC **head, *txnc; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return; + head = page->modify->ovfl_track->ovfl_txnc; + + for (txnc = head[0]; txnc != NULL; txnc = txnc->next[0]) + (void)__ovfl_txnc_verbose(session, page, txnc, "dump"); +} +#endif + +/* + * __ovfl_txnc_skip_search -- + * Return the first matching addr in the overflow transaction-cache list. 
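__wt_ovfl_reuse_add above packs the fixed header, the skip-list tower, the address cookie and the value into a single allocation, recording byte offsets instead of pointers, which keeps entries compact and trivially freeable. A minimal sketch of that layout trick under generic names; unlike the original, the offsets here are plain unsigned values rather than single bytes:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct blob {
	unsigned addr_offset, addr_size;	/* Where the address lives */
	unsigned value_offset, value_size;	/* Where the value lives */
	struct blob *next[];			/* Variable-depth tower */
};

#define	BLOB_ADDR(b)	((unsigned char *)(b) + (b)->addr_offset)
#define	BLOB_VALUE(b)	((unsigned char *)(b) + (b)->value_offset)

static struct blob *
blob_new(unsigned depth,
    const void *addr, unsigned addr_size, const void *value, unsigned value_size)
{
	struct blob *b;
	unsigned char *p;
	size_t size;

	/* One allocation: header, tower, address bytes, value bytes. */
	size = sizeof(struct blob) +
	    depth * sizeof(struct blob *) + addr_size + value_size;
	if ((b = calloc(1, size)) == NULL)
		return (NULL);

	p = (unsigned char *)b +
	    sizeof(struct blob) + depth * sizeof(struct blob *);
	b->addr_offset = (unsigned)(p - (unsigned char *)b);
	b->addr_size = addr_size;
	memcpy(p, addr, addr_size);
	p += addr_size;
	b->value_offset = (unsigned)(p - (unsigned char *)b);
	b->value_size = value_size;
	memcpy(p, value, value_size);
	return (b);
}

int
main(void)
{
	struct blob *b = blob_new(2, "addr", 4, "value", 5);

	if (b == NULL)
		return (1);
	printf("%.*s -> %.*s\n",
	    (int)b->addr_size, BLOB_ADDR(b), (int)b->value_size, BLOB_VALUE(b));
	free(b);
	return (0);
}
```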
+ */ +static WT_OVFL_TXNC * +__ovfl_txnc_skip_search(WT_OVFL_TXNC **head, const void *addr, size_t addr_size) +{ + WT_OVFL_TXNC **e; + size_t len; + int cmp, i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + --i; + --e; + continue; + } + + /* + * Return any exact matches: we don't care in what search level + * we found a match. + */ + len = WT_MIN((*e)->addr_size, addr_size); + cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len); + if (cmp == 0 && (*e)->addr_size == addr_size) + return (*e); + + /* + * If the skiplist address is larger than the search address, or + * they compare equally and the skiplist address is longer than + * the search address, drop down a level, otherwise continue on + * this level. + */ + if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) { + --i; /* Drop down a level */ + --e; + } else /* Keep going at this level */ + e = &(*e)->next[i]; + } + return (NULL); +} + +/* + * __ovfl_txnc_skip_search_stack -- + * Search an overflow transaction-cache skiplist, returning an + * insert/remove stack. + */ +static void +__ovfl_txnc_skip_search_stack(WT_OVFL_TXNC **head, + WT_OVFL_TXNC ***stack, const void *addr, size_t addr_size) +{ + WT_OVFL_TXNC **e; + size_t len; + int cmp, i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + stack[i--] = e--; + continue; + } + + /* + * If the skiplist addr is larger than the search addr, or + * they compare equally and the skiplist addr is longer than + * the search addr, drop down a level, otherwise continue on + * this level. + */ + len = WT_MIN((*e)->addr_size, addr_size); + cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len); + if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) + stack[i--] = e--; /* Drop down a level */ + else + e = &(*e)->next[i]; /* Keep going at this level */ + } +} + +/* + * __ovfl_txnc_wrapup -- + * Resolve the page's transaction-cache list. + */ +static int +__ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TXNC **e, **head, *txnc; + size_t decr; + int i; + + head = page->modify->ovfl_track->ovfl_txnc; + + /* + * Discard any transaction-cache records with transaction IDs earlier + * than any in the system. + * + * First, walk the overflow transaction-cache skip lists (except for + * the lowest level), fixing up links. + */ + for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) + for (e = &head[i]; *e != NULL;) { + if (!__wt_txn_visible_all(session, (*e)->current)) { + e = &(*e)->next[i]; + continue; + } + *e = (*e)->next[i]; + } + + /* Second, discard any no longer needed transaction-cache records. */ + decr = 0; + for (e = &head[0]; (txnc = *e) != NULL;) { + if (!__wt_txn_visible_all(session, txnc->current)) { + e = &(*e)->next[0]; + continue; + } + *e = (*e)->next[0]; + + decr += WT_OVFL_SIZE(WT_OVFL_TXNC) + + txnc->addr_size + txnc->value_size; + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET( + __ovfl_txnc_verbose(session, page, txnc, "free")); + __wt_free(session, txnc); + } + + if (decr != 0) + __wt_cache_page_inmem_decr(session, page, decr); + return (0); +} + +/* + * __wt_ovfl_txnc_search -- + * Search the page's list of transaction-cache overflow records for a + * match. 
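The `_stack` search variants return, for every skip level, the address of the pointer a new entry must splice into; the add paths then link the node level by level in one pass. A self-contained sketch of that two-phase insert, with a coin-flip depth chooser standing in for __wt_skip_choose_depth:

```c
#include <stdlib.h>

#define	MAXDEPTH	4

struct node {
	int key;
	struct node *next[];		/* Flexible tower, sized at allocation */
};

/* Fill stack[i] with the insert position for each skip-list level. */
static void
skip_search_stack(struct node **head, struct node ***stack, int key)
{
	struct node **e;
	int i;

	for (i = MAXDEPTH - 1, e = &head[i]; i >= 0;)
		if (*e == NULL || (*e)->key >= key)
			stack[i--] = e--;	/* Drop down a level */
		else
			e = &(*e)->next[i];	/* Keep going at this level */
}

static struct node *
skip_insert(struct node **head, int key)
{
	struct node **stack[MAXDEPTH], *n;
	int depth, i;

	/* Flip coins for the tower depth, a stand-in for the real chooser. */
	for (depth = 1; depth < MAXDEPTH && (rand() & 1); ++depth)
		;
	if ((n = calloc(1,
	    sizeof(*n) + (size_t)depth * sizeof(n->next[0]))) == NULL)
		return (NULL);
	n->key = key;

	skip_search_stack(head, stack, key);
	for (i = 0; i < depth; ++i) {	/* Splice into each level */
		n->next[i] = *stack[i];
		*stack[i] = n;
	}
	return (n);
}

int
main(void)
{
	struct node *head[MAXDEPTH] = { 0 };

	skip_insert(head, 7);
	skip_insert(head, 3);		/* Lands before 7 on shared levels */
	return (0);
}
```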
+ */ +int +__wt_ovfl_txnc_search( + WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store) +{ + WT_OVFL_TXNC **head, *txnc; + + if (page->modify->ovfl_track == NULL) + return (WT_NOTFOUND); + + head = page->modify->ovfl_track->ovfl_txnc; + + if ((txnc = __ovfl_txnc_skip_search(head, addr, addr_size)) == NULL) + return (WT_NOTFOUND); + + store->data = WT_OVFL_TXNC_VALUE(txnc); + store->size = txnc->value_size; + return (0); +} + +/* + * __wt_ovfl_txnc_add -- + * Add a new entry to the page's list of transaction-cached overflow + * records. + */ +int +__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, + const uint8_t *addr, size_t addr_size, + const void *value, size_t value_size) +{ + WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc; + size_t size; + u_int i, skipdepth; + uint8_t *p; + + if (page->modify->ovfl_track == NULL) + WT_RET(__ovfl_track_init(session, page)); + + head = page->modify->ovfl_track->ovfl_txnc; + + /* Choose a skiplist depth for this insert. */ + skipdepth = __wt_skip_choose_depth(session); + + /* + * Allocate the WT_OVFL_TXNC structure, next pointers for the skip + * list, room for the address and value, then copy everything into + * place. + * + * To minimize the WT_OVFL_TXNC structure size, the address offset + * and size are single bytes: that's safe because the address follows + * the structure (which can't be more than about 100B), and address + * cookies are limited to 255B. + */ + size = sizeof(WT_OVFL_TXNC) + + skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size; + WT_RET(__wt_calloc(session, 1, size, &txnc)); + p = (uint8_t *)txnc + + sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *); + txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc); + txnc->addr_size = (uint8_t)addr_size; + memcpy(p, addr, addr_size); + p += addr_size; + txnc->value_offset = WT_PTRDIFF32(p, txnc); + txnc->value_size = WT_STORE_SIZE(value_size); + memcpy(p, value, value_size); + txnc->current = __wt_txn_new_id(session); + + __wt_cache_page_inmem_incr(session, page, + WT_OVFL_SIZE(WT_OVFL_TXNC) + addr_size + value_size); + + /* Insert the new entry into the skiplist. */ + __ovfl_txnc_skip_search_stack(head, stack, addr, addr_size); + for (i = 0; i < skipdepth; ++i) { + txnc->next[i] = *stack[i]; + *stack[i] = txnc; + } + + if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) + WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add")); + + return (0); +} + +/* + * __wt_ovfl_txnc_free -- + * Free the page's list of transaction-cached overflow records. + */ +void +__wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_OVFL_TXNC *txnc; + WT_PAGE_MODIFY *mod; + void *next; + + mod = page->modify; + if (mod == NULL || mod->ovfl_track == NULL) + return; + + for (txnc = mod->ovfl_track->ovfl_txnc[0]; + txnc != NULL; txnc = next) { + next = txnc->next[0]; + __wt_free(session, txnc); + } +} + +/* + * __wt_ovfl_track_wrapup -- + * Resolve the page's overflow tracking on reconciliation success. 
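__ovfl_txnc_wrapup a little earlier keeps a cached overflow copy only while some running transaction might still read it, pruning entries once their transaction ID becomes visible to everyone. A reduced sketch of that pruning rule; the visible_all() predicate below is a trivial stand-in for __wt_txn_visible_all, which in WiredTiger consults global transaction state:

```c
#include <stdint.h>
#include <stdlib.h>

struct txnc { uint64_t txnid; struct txnc *next; };

/* Stand-in visibility check: anything below the oldest running
 * transaction ID is visible to all. */
static int
visible_all(uint64_t oldest_running, uint64_t txnid)
{
	return (txnid < oldest_running);
}

/* Discard cached copies no running transaction can still read. */
static void
txnc_prune(struct txnc **e, uint64_t oldest_running)
{
	struct txnc *t;

	while ((t = *e) != NULL)
		if (visible_all(oldest_running, t->txnid)) {
			*e = t->next;	/* Unlink, then free */
			free(t);
		} else
			e = &t->next;
}

int
main(void)
{
	struct txnc *head = calloc(1, sizeof(*head));

	if (head == NULL)
		return (1);
	head->txnid = 10;
	txnc_prune(&head, 100);		/* 10 < 100: entry is freed */
	return (head == NULL ? 0 : 1);
}
```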
+ */ +int +__wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_DECL_RET; + WT_OVFL_TRACK *track; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return (0); + + track = page->modify->ovfl_track; + if (track->discard != NULL) + WT_RET(__ovfl_discard_wrapup(session, page)); + + if (track->ovfl_reuse[0] != NULL) + WT_RET(__ovfl_reuse_wrapup(session, page)); + + if (track->ovfl_txnc[0] != NULL) { + WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock)); + ret = __ovfl_txnc_wrapup(session, page); + WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock)); + } + return (0); +} + +/* + * __wt_ovfl_track_wrapup_err -- + * Resolve the page's overflow tracking on reconciliation error. + */ +int +__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_DECL_RET; + WT_OVFL_TRACK *track; + + if (page->modify == NULL || page->modify->ovfl_track == NULL) + return (0); + + track = page->modify->ovfl_track; + if (track->discard != NULL) + WT_RET(__ovfl_discard_wrapup_err(session, page)); + + if (track->ovfl_reuse[0] != NULL) + WT_RET(__ovfl_reuse_wrapup_err(session, page)); + + if (track->ovfl_txnc[0] != NULL) { + WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock)); + ret = __ovfl_txnc_wrapup(session, page); + WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock)); + } + return (0); +} diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c new file mode 100644 index 00000000000..c72447ae841 --- /dev/null +++ b/src/reconcile/rec_write.c @@ -0,0 +1,5528 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +struct __rec_boundary; typedef struct __rec_boundary WT_BOUNDARY; +struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY; +struct __rec_kv; typedef struct __rec_kv WT_KV; + +/* + * Reconciliation is the process of taking an in-memory page, walking each entry + * in the page, building a backing disk image in a temporary buffer representing + * that information, and writing that buffer to disk. What could be simpler? + * + * WT_RECONCILE -- + * Information tracking a single page reconciliation. + */ +typedef struct { + WT_REF *ref; /* Page being reconciled */ + WT_PAGE *page; + uint32_t flags; /* Caller's configuration */ + + WT_ITEM dsk; /* Temporary disk-image buffer */ + + /* Track whether all changes to the page are written. */ + uint64_t max_txn; + uint64_t skipped_txn; + uint32_t orig_write_gen; + + /* + * If page updates are skipped because they are as yet unresolved, or + * the page has updates we cannot discard, the page is left "dirty": + * the page cannot be discarded and a subsequent reconciliation will + * be necessary to discard the page. + */ + int leave_dirty; + + /* + * Raw compression (don't get me started, as if normal reconciliation + * wasn't bad enough). If an application wants absolute control over + * what gets written to disk, we give it a list of byte strings and it + * gives us back an image that becomes a file block. Because we don't + * know the number of items we're storing in a block until we've done + * a lot of work, we turn off most compression: dictionary, copy-cell, + * prefix and row-store internal page suffix compression are all off. 
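Both wrapup entry points above use the same error discipline seen throughout this diff: WT_RET returns immediately on failure, while WT_TRET always evaluates its call (the overflow lock must be released either way) and preserves the first nonzero error. A hedged reconstruction of the idea behind those macros, not WiredTiger's actual definitions:

```c
#include <stdio.h>

/* Return immediately on error. */
#define	RET(a) do {						\
	int __r;						\
	if ((__r = (a)) != 0)					\
		return (__r);					\
} while (0)

/* Always evaluate, but never overwrite an earlier error in "ret". */
#define	TRET(a) do {						\
	int __r;						\
	if ((__r = (a)) != 0 && ret == 0)			\
		ret = __r;					\
} while (0)

static int lock(void)   { return (0); }
static int work(void)   { return (-1); }	/* Fails */
static int unlock(void) { return (0); }

static int
guarded(void)
{
	int ret = 0;

	RET(lock());		/* Nothing to clean up yet: bail early */
	ret = work();		/* Remember the primary error... */
	TRET(unlock());		/* ...but still release the lock */
	return (ret);
}

int
main(void)
{
	printf("%d\n", guarded());	/* Prints -1 */
	return (0);
}
```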
+ */ + int raw_compression; + uint32_t raw_max_slots; /* Raw compression array sizes */ + uint32_t *raw_entries; /* Raw compression slot entries */ + uint32_t *raw_offsets; /* Raw compression slot offsets */ + uint64_t *raw_recnos; /* Raw compression recno count */ + WT_ITEM raw_destination; /* Raw compression destination buffer */ + + /* + * Track if reconciliation has seen any overflow items. If a leaf page + * with no overflow items is written, the parent page's address cell is + * set to the leaf-no-overflow type. This means we can delete the leaf + * page without reading it because we don't have to discard any overflow + * items it might reference. + * + * The test test is per-page reconciliation, that is, once we see an + * overflow item on the page, all subsequent leaf pages written for the + * page will not be leaf-no-overflow type, regardless of whether or not + * they contain overflow items. In other words, leaf-no-overflow is not + * guaranteed to be set on every page that doesn't contain an overflow + * item, only that if it is set, the page contains no overflow items. + * + * The reason is because of raw compression: there's no easy/fast way to + * figure out if the rows selected by raw compression included overflow + * items, and the optimization isn't worth another pass over the data. + */ + int ovfl_items; + + /* + * Track if reconciliation of a row-store leaf page has seen empty (zero + * length) values. We don't write out anything for empty values, so if + * there are empty values on a page, we have to make two passes over the + * page when it's read to figure out how many keys it has, expensive in + * the common case of no empty values and (entries / 2) keys. Likewise, + * a page with only empty values is another common data set, and keys on + * that page will be equal to the number of entries. In both cases, set + * a flag in the page's on-disk header. + * + * The test is per-page reconciliation as described above for the + * overflow-item test. + */ + int all_empty_value, any_empty_value; + + /* + * Reconciliation gets tricky if we have to split a page, which happens + * when the disk image we create exceeds the page type's maximum disk + * image size. + * + * First, the sizes of the page we're building. If WiredTiger is doing + * page layout, page_size is the same as page_size_max. We accumulate + * the maximum page size of raw data and when we reach that size, we + * split the page into multiple chunks, eventually compressing those + * chunks. When the application is doing page layout (raw compression + * is configured), page_size can continue to grow past page_size_max, + * and we keep accumulating raw data until the raw compression callback + * accepts it. + */ + uint32_t page_size; /* Current page size */ + uint32_t page_size_max; /* Maximum on-disk page size */ + + /* + * Second, the split size: if we're doing the page layout, split to a + * smaller-than-maximum page size when a split is required so we don't + * repeatedly split a packed page. + */ + uint32_t split_size; /* Split page size */ + + /* + * The problem with splits is we've done a lot of work by the time we + * realize we're going to have to split, we don't want to start over. + * + * To keep from having to start over when we hit the maximum page size, + * we track the page information when we approach a split boundary. + * If we eventually have to split, we walk this structure and pretend + * we were splitting all along. 
After that, we continue to append to + * this structure, and eventually walk it to create a new internal page + * that references all of our split pages. + */ + struct __rec_boundary { + /* + * The start field records location in the initial split buffer, + * that is, the first byte of the split chunk recorded before we + * decide to split a page; the offset between the first byte of + * chunk[0] and the first byte of chunk[1] is chunk[0]'s length. + * + * Once we split a page, we stop filling in the start field, as + * we're writing the split chunks as we find them. + */ + uint8_t *start; /* Split's first byte */ + + /* + * The recno and entries fields are the starting record number + * of the split chunk (for column-store splits), and the number + * of entries in the split chunk. These fields are used both + * to write the split chunk, and to create a new internal page + * to reference the split pages. + */ + uint64_t recno; /* Split's starting record */ + uint32_t entries; /* Split's entries */ + + WT_ADDR addr; /* Split's written location */ + uint32_t size; /* Split's size */ + uint32_t cksum; /* Split's checksum */ + void *dsk; /* Split's disk image */ + + /* + * When busy pages get large, we need to be able to evict them + * even when they contain unresolved updates, or updates which + * cannot be evicted because of running transactions. In such + * cases, break the page into multiple blocks, write the blocks + * that can be evicted, saving lists of updates for blocks that + * cannot be evicted, then re-instantiate the blocks that cannot + * be evicted as new, in-memory pages, restoring the updates on + * those pages. + */ + WT_UPD_SKIPPED *skip; /* Skipped updates */ + uint32_t skip_next; + size_t skip_allocated; + + /* + * The key for a row-store page; no column-store key is needed + * because the page's recno, stored in the recno field, is the + * column-store key. + */ + WT_ITEM key; /* Promoted row-store key */ + + /* + * During wrapup, after reconciling the root page, we write a + * final block as part of a checkpoint. If raw compression + * was configured, that block may have already been compressed. + */ + int already_compressed; + } *bnd; /* Saved boundaries */ + uint32_t bnd_next; /* Next boundary slot */ + uint32_t bnd_next_max; /* Maximum boundary slots used */ + size_t bnd_entries; /* Total boundary slots */ + size_t bnd_allocated; /* Bytes allocated */ + + /* + * We track the total number of page entries copied into split chunks + * so we can easily figure out how many entries in the current split + * chunk. + */ + uint32_t total_entries; /* Total entries in splits */ + + /* + * And there's state information as to where in this process we are: + * (1) tracking split boundaries because we can still fit more split + * chunks into the maximum page size, (2) tracking the maximum page + * size boundary because we can't fit any more split chunks into the + * maximum page size, (3) not performing boundary checks because it's + * either not useful with the current page size configuration, or + * because we've already been forced to split. + */ + enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ + SPLIT_MAX=1, /* Next: the maximum page boundary */ + SPLIT_TRACKING_OFF=2, /* No boundary checks */ + SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */ + bnd_state; + + /* + * We track current information about the current record number, the + * number of entries copied into the temporary buffer, where we are + * in the temporary buffer, and how much memory remains. 
Those items + * are packaged here rather than passing pointers to stack locations + * around the code. + */ + uint64_t recno; /* Current record number */ + uint32_t entries; /* Current number of entries */ + uint8_t *first_free; /* Current first free byte */ + size_t space_avail; /* Remaining space in this chunk */ + + /* + * While reviewing updates for each page, we store skipped updates here, + * and then move them to per-block areas as the blocks are defined. + */ + WT_UPD_SKIPPED *skip; /* Skipped updates */ + uint32_t skip_next; + size_t skip_allocated; + + /* + * We don't need to keep the 0th key around on internal pages, the + * search code ignores them as nothing can sort less by definition. + * There's some trickiness here, see the code for comments on how + * these fields work. + */ + int cell_zero; /* Row-store internal page 0th key */ + + /* + * WT_DICTIONARY -- + * We optionally build a dictionary of row-store values for leaf + * pages. Where two value cells are identical, only write the value + * once, the second and subsequent copies point to the original cell. + * The dictionary is fixed size, but organized in a skip-list to make + * searches faster. + */ + struct __rec_dictionary { + uint64_t hash; /* Hash value */ + void *cell; /* Matching cell */ + + u_int depth; /* Skiplist */ + WT_DICTIONARY *next[0]; + } **dictionary; /* Dictionary */ + u_int dictionary_next, dictionary_slots; /* Next, max entries */ + /* Skiplist head. */ + WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH]; + + /* + * WT_KV-- + * An on-page key/value item we're building. + */ + struct __rec_kv { + WT_ITEM buf; /* Data */ + WT_CELL cell; /* Cell and cell's length */ + size_t cell_len; + size_t len; /* Total length of cell + data */ + } k, v; /* Key/Value being built */ + + WT_ITEM *cur, _cur; /* Key/Value being built */ + WT_ITEM *last, _last; /* Last key/value built */ + + int key_pfx_compress; /* If can prefix-compress next key */ + int key_pfx_compress_conf; /* If prefix compression configured */ + int key_sfx_compress; /* If can suffix-compress next key */ + int key_sfx_compress_conf; /* If suffix compression configured */ + + int is_bulk_load; /* If it's a bulk load */ + + WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */ + + int tested_ref_state; /* Debugging information */ +} WT_RECONCILE; + +static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int); +static void __rec_cell_build_addr( + WT_RECONCILE *, const void *, size_t, u_int, uint64_t); +static int __rec_cell_build_int_key(WT_SESSION_IMPL *, + WT_RECONCILE *, const void *, size_t, int *); +static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *, + WT_RECONCILE *, const void *, size_t, int *); +static int __rec_cell_build_ovfl(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_KV *, uint8_t, uint64_t); +static int __rec_cell_build_val(WT_SESSION_IMPL *, + WT_RECONCILE *, const void *, size_t, uint64_t); +static int __rec_child_deleted( + WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *, int *); +static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_col_fix_slvg(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *); +static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_col_var(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *); +static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, + WT_SALVAGE_COOKIE *, WT_ITEM *, int, uint8_t, uint64_t); 
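The bnd_state enumeration above behaves as a one-way state machine: track split-size boundaries while chunks still fit, then only the maximum-page-size boundary, then nothing once a split has occurred, with raw compression as a separate terminal mode. A sketch of that progression as read from the comment; the transition function is an illustration, not the actual reconciliation logic:

```c
/* One-way progression of boundary tracking during reconciliation. */
enum split_state {
	SPLIT_BOUNDARY,		/* Next boundary: a split-size chunk */
	SPLIT_MAX,		/* Next boundary: the maximum page size */
	SPLIT_TRACKING_OFF,	/* Already split: no more checks */
	SPLIT_TRACKING_RAW	/* Raw compression decides the boundaries */
};

/* Advance the state when the current boundary is reached. */
static enum split_state
split_advance(enum split_state s)
{
	switch (s) {
	case SPLIT_BOUNDARY:
		/* No more split chunks fit under the maximum. */
		return (SPLIT_MAX);
	case SPLIT_MAX:
		/* Maximum page size hit: split, stop checking. */
		return (SPLIT_TRACKING_OFF);
	default:
		/* Off or raw: terminal states. */
		return (s);
	}
}

int
main(void)
{
	enum split_state s = SPLIT_BOUNDARY;

	s = split_advance(s);	/* SPLIT_MAX */
	s = split_advance(s);	/* SPLIT_TRACKING_OFF */
	return (s == SPLIT_TRACKING_OFF ? 0 : 1);
}
```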
+static int __rec_destroy_session(WT_SESSION_IMPL *); +static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); +static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_row_leaf(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *); +static int __rec_row_leaf_insert( + WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *); +static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *); +static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *); +static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_split_row_promote( + WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); +static int __rec_split_write(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int); +static int __rec_write_init(WT_SESSION_IMPL *, + WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); +static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __rec_write_wrapup_err( + WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); + +static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *); +static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int); +static int __rec_dictionary_lookup( + WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **); +static void __rec_dictionary_reset(WT_RECONCILE *); + +/* + * __wt_reconcile -- + * Reconcile an in-memory page into its on-disk format, and write it. + */ +int +__wt_reconcile(WT_SESSION_IMPL *session, + WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_RECONCILE *r; + int locked; + + conn = S2C(session); + page = ref->page; + mod = page->modify; + + /* We're shouldn't get called with a clean page, that's an error. */ + if (!__wt_page_is_modified(page)) + WT_RET_MSG(session, WT_ERROR, + "Attempt to reconcile a clean page."); + + WT_RET(__wt_verbose(session, + WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type))); + WT_STAT_FAST_CONN_INCR(session, rec_pages); + WT_STAT_FAST_DATA_INCR(session, rec_pages); + if (LF_ISSET(WT_EVICTING)) { + WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction); + WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); + } + + /* Record the most recent transaction ID we will *not* write. */ + mod->disk_snap_min = session->txn.snap_min; + + /* Initialize the reconciliation structure for each new run. */ + WT_RET(__rec_write_init( + session, ref, flags, salvage, &session->reconcile)); + r = session->reconcile; + + /* + * The compaction process looks at the page's modification information; + * if compaction is running, lock the page down. + * + * Otherwise, flip on the scanning flag: obsolete updates cannot be + * freed while reconciliation is in progress. + */ + locked = 0; + if (conn->compact_in_memory_pass) { + locked = 1; + WT_PAGE_LOCK(session, page); + } else + for (;;) { + F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); + if (ret == 0) + break; + __wt_yield(); + } + + /* Reconcile the page. 
*/ + switch (page->type) { + case WT_PAGE_COL_FIX: + if (salvage != NULL) + ret = __rec_col_fix_slvg(session, r, page, salvage); + else + ret = __rec_col_fix(session, r, page); + break; + case WT_PAGE_COL_INT: + ret = __rec_col_int(session, r, page); + break; + case WT_PAGE_COL_VAR: + ret = __rec_col_var(session, r, page, salvage); + break; + case WT_PAGE_ROW_INT: + ret = __rec_row_int(session, r, page); + break; + case WT_PAGE_ROW_LEAF: + ret = __rec_row_leaf(session, r, page, salvage); + break; + WT_ILLEGAL_VALUE_SET(session); + } + + /* Wrap up the page reconciliation. */ + if (ret == 0) + ret = __rec_write_wrapup(session, r, page); + else + WT_TRET(__rec_write_wrapup_err(session, r, page)); + + /* Release the page lock if we're holding one. */ + if (locked) + WT_PAGE_UNLOCK(session, page); + else + F_CLR_ATOMIC(page, WT_PAGE_SCANNING); + + /* + * Clean up the boundary structures: some workloads result in millions + * of these structures, and if associated with some random session that + * got roped into doing forced eviction, they won't be discarded for the + * life of the session. + */ + __rec_bnd_cleanup(session, r, 0); + + WT_RET(ret); + + /* + * Root pages are special, splits have to be done, we can't put it off + * as the parent's problem any more. + */ + if (__wt_ref_is_root(ref)) + return (__rec_root_write(session, page, flags)); + + /* + * Otherwise, mark the page's parent dirty. + * Don't mark the tree dirty: if this reconciliation is in service of a + * checkpoint, it's cleared the tree's dirty flag, and we don't want to + * set it again as part of that walk. + */ + return (__wt_page_parent_modify_set(session, ref, 1)); +} + +/* + * __rec_root_write -- + * Handle the write of a root page. + */ +static int +__rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) +{ + WT_DECL_RET; + WT_PAGE *next; + WT_PAGE_INDEX *pindex; + WT_PAGE_MODIFY *mod; + WT_REF fake_ref; + uint32_t i; + + mod = page->modify; + + /* + * If a single root page was written (either an empty page or there was + * a 1-for-1 page swap), we've written root and checkpoint, we're done. + * If the root page split, write the resulting WT_REF array. We already + * have an infrastructure for writing pages, create a fake root page and + * write it instead of adding code to write blocks based on the list of + * blocks resulting from a multiblock reconciliation. + */ + switch (F_ISSET(mod, WT_PM_REC_MASK)) { + case WT_PM_REC_EMPTY: /* Page is empty */ + case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ + return (0); + case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ + break; + WT_ILLEGAL_VALUE(session); + } + + WT_RET(__wt_verbose(session, WT_VERB_SPLIT, + "root page split -> %" PRIu32 " pages", mod->mod_multi_entries)); + + /* + * Create a new root page, initialize the array of child references, + * mark it dirty, then write it. + */ + switch (page->type) { + case WT_PAGE_COL_INT: + WT_RET(__wt_page_alloc(session, + WT_PAGE_COL_INT, 1, mod->mod_multi_entries, 1, &next)); + break; + case WT_PAGE_ROW_INT: + WT_RET(__wt_page_alloc(session, + WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, 1, &next)); + break; + WT_ILLEGAL_VALUE(session); + } + + pindex = WT_INTL_INDEX_COPY(next); + for (i = 0; i < mod->mod_multi_entries; ++i) { + WT_ERR(__wt_multi_to_ref(session, + next, &mod->mod_multi[i], &pindex->index[i], NULL)); + pindex->index[i]->home = next; + } + + /* + * We maintain a list of pages written for the root in order to free the + * backing blocks the next time the root is written. 
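When compaction isn't running, __wt_reconcile above takes a lighter-weight lock: it spins until it atomically sets the page's scanning flag, which prevents obsolete updates from being freed while reconciliation is in progress. A stand-alone sketch of that compare-and-swap flag loop; F_CAS_ATOMIC and __wt_yield are WiredTiger portability wrappers, so C11 stdatomic.h and POSIX sched_yield() stand in here:

```c
#include <sched.h>
#include <stdatomic.h>

#define	PAGE_SCANNING	0x1u

struct page { atomic_uint flags; };

/* Spin until we are the thread that sets the scanning flag. */
static void
page_scan_lock(struct page *page)
{
	unsigned old;

	for (;;) {
		old = atomic_load(&page->flags);
		if ((old & PAGE_SCANNING) == 0 &&
		    atomic_compare_exchange_weak(
		    &page->flags, &old, old | PAGE_SCANNING))
			return;
		sched_yield();		/* Equivalent of __wt_yield() */
	}
}

static void
page_scan_unlock(struct page *page)
{
	atomic_fetch_and(&page->flags, ~PAGE_SCANNING);
}

int
main(void)
{
	struct page p = { 0 };

	page_scan_lock(&p);
	/* ...reconcile: obsolete updates cannot be freed meanwhile... */
	page_scan_unlock(&p);
	return (0);
}
```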
+ */ + mod->mod_root_split = next; + + WT_ERR(__wt_page_modify_init(session, next)); + __wt_page_only_modify_set(session, next); + + /* + * Fake up a reference structure, and write the next root page. + */ + __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT); + return (__wt_reconcile(session, &fake_ref, NULL, flags)); + +err: __wt_page_out(session, &next); + return (ret); +} + +/* + * __rec_raw_compression_config -- + * Configure raw compression. + */ +static inline int +__rec_raw_compression_config( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* Check if raw compression configured. */ + if (btree->compressor == NULL || + btree->compressor->compress_raw == NULL) + return (0); + + /* Only for row-store and variable-length column-store objects. */ + if (page->type == WT_PAGE_COL_FIX) + return (0); + + /* + * Raw compression cannot support dictionary compression. (Technically, + * we could still use the raw callback on column-store variable length + * internal pages with dictionary compression configured, because + * dictionary compression only applies to column-store leaf pages, but + * that seems an unlikely use case.) + */ + if (btree->dictionary != 0) + return (0); + + /* Raw compression cannot support prefix compression. */ + if (btree->prefix_compression != 0) + return (0); + + /* + * Raw compression is also turned off during salvage: we can't allow + * pages to split during salvage, raw compression has no point if it + * can't manipulate the page size. + */ + if (salvage != NULL) + return (0); + + return (1); +} + +/* + * __rec_write_init -- + * Initialize the reconciliation structure. + */ +static int +__rec_write_init(WT_SESSION_IMPL *session, + WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep) +{ + WT_BTREE *btree; + WT_PAGE *page; + WT_RECONCILE *r; + + btree = S2BT(session); + page = ref->page; + + if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { + WT_RET(__wt_calloc_def(session, 1, &r)); + + *(WT_RECONCILE **)reconcilep = r; + session->reconcile_cleanup = __rec_destroy_session; + + /* Connect pointers/buffers. */ + r->cur = &r->_cur; + r->last = &r->_last; + + /* Disk buffers need to be aligned for writing. */ + F_SET(&r->dsk, WT_ITEM_ALIGNED); + } + + /* Remember the configuration. */ + r->ref = ref; + r->page = page; + r->flags = flags; + + /* Track if the page can be marked clean. */ + r->leave_dirty = 0; + + /* Raw compression. */ + r->raw_compression = + __rec_raw_compression_config(session, page, salvage); + r->raw_destination.flags = WT_ITEM_ALIGNED; + + /* Track overflow items. */ + r->ovfl_items = 0; + + /* Track empty values. */ + r->all_empty_value = 1; + r->any_empty_value = 0; + + /* The list of cached, skipped updates. */ + r->skip_next = 0; + + /* + * Dictionary compression only writes repeated values once. We grow + * the dictionary as necessary, always using the largest size we've + * seen. + * + * Reset the dictionary. + * + * Sanity check the size: 100 slots is the smallest dictionary we use. + */ + if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots) + WT_RET(__rec_dictionary_init(session, + r, btree->dictionary < 100 ? 100 : btree->dictionary)); + __rec_dictionary_reset(r); + + /* + * Suffix compression shortens internal page keys by discarding trailing + * bytes that aren't necessary for tree navigation. 
We don't do suffix + * compression if there is a custom collator because we don't know what + * bytes a custom collator might use. Some custom collators (for + * example, a collator implementing reverse ordering of strings), won't + * have any problem with suffix compression: if there's ever a reason to + * implement suffix compression for custom collators, we can add a + * setting to the collator, configured when the collator is added, that + * turns on suffix compression. + * + * The raw compression routines don't even consider suffix compression, + * but it doesn't hurt to confirm that. + */ + r->key_sfx_compress_conf = 0; + if (btree->collator == NULL && + btree->internal_key_truncate && !r->raw_compression) + r->key_sfx_compress_conf = 1; + + /* + * Prefix compression discards repeated prefix bytes from row-store leaf + * page keys. + */ + r->key_pfx_compress_conf = 0; + if (btree->prefix_compression && page->type == WT_PAGE_ROW_LEAF) + r->key_pfx_compress_conf = 1; + + r->salvage = salvage; + + /* Save the page's write generation before reading the page. */ + WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); + + /* + * Running transactions may update the page after we write it, so + * this is the highest ID we can be confident we will see. + */ + r->skipped_txn = S2C(session)->txn_global.last_running; + + return (0); +} + +/* + * __rec_destroy -- + * Clean up the reconciliation structure. + */ +static void +__rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) +{ + WT_RECONCILE *r; + + if ((r = *(WT_RECONCILE **)reconcilep) == NULL) + return; + *(WT_RECONCILE **)reconcilep = NULL; + + __wt_buf_free(session, &r->dsk); + + __wt_free(session, r->raw_entries); + __wt_free(session, r->raw_offsets); + __wt_free(session, r->raw_recnos); + __wt_buf_free(session, &r->raw_destination); + + __rec_bnd_cleanup(session, r, 1); + + __wt_free(session, r->skip); + + __wt_buf_free(session, &r->k.buf); + __wt_buf_free(session, &r->v.buf); + __wt_buf_free(session, &r->_cur); + __wt_buf_free(session, &r->_last); + + __rec_dictionary_free(session, r); + + __wt_free(session, r); +} + +/* + * __rec_destroy_session -- + * Clean up the reconciliation structure, session version. + */ +static int +__rec_destroy_session(WT_SESSION_IMPL *session) +{ + __rec_destroy(session, &session->reconcile); + return (0); +} + +/* + * __rec_bnd_cleanup -- + * Cleanup the boundary structure information. + */ +static void +__rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) +{ + WT_BOUNDARY *bnd; + uint32_t i, last_used; + + if (r->bnd == NULL) + return; + + /* + * Free the boundary structures' memory. In the case of normal cleanup, + * discard any memory we won't reuse in the next reconciliation; in the + * case of destruction, discard everything. + * + * During some big-page evictions we have seen boundary arrays that have + * millions of elements. That should not be a normal event, but if the + * memory is associated with a random session, it won't be discarded + * until the session is closed. If there are more than 10,000 boundary + * structure elements, destroy the boundary array and we'll start over. 
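__rec_bnd_cleanup above implements a capped-reuse policy: keep the boundary array between reconciliations to avoid reallocation, but discard it outright past a threshold so one pathological page cannot pin memory for the life of the session. A generic sketch of that policy; the types and threshold are illustrative:

```c
#include <stdlib.h>

#define	KEEP_MAX	(10 * 1000)	/* Beyond this, start over */

struct scratch {
	void **slots;
	size_t entries;		/* Slots ever allocated */
	size_t next;		/* Slots used by the last run */
};

static void
scratch_cleanup(struct scratch *s, int destroy)
{
	size_t i;

	if (s->slots == NULL)
		return;

	if (destroy || s->entries > KEEP_MAX) {
		/* Free everything; the next run reallocates from scratch. */
		for (i = 0; i < s->entries; ++i)
			free(s->slots[i]);
		free(s->slots);
		s->slots = NULL;
		s->entries = s->next = 0;
	} else
		/* Keep the array, release only the slots we used, plus one
		 * extra, mirroring the defensive "next" cleanup above. */
		for (i = 0; i < s->next + 1 && i < s->entries; ++i) {
			free(s->slots[i]);
			s->slots[i] = NULL;
		}
}

int
main(void)
{
	struct scratch s = { 0 };

	scratch_cleanup(&s, 0);		/* Safe on an empty structure */
	return (0);
}
```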
+ */ + if (destroy || r->bnd_entries > 10 * 1000) { + for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) { + __wt_free(session, bnd->addr.addr); + __wt_free(session, bnd->dsk); + __wt_free(session, bnd->skip); + __wt_buf_free(session, &bnd->key); + } + __wt_free(session, r->bnd); + r->bnd_next = 0; + r->bnd_entries = r->bnd_allocated = 0; + } else { + /* + * The boundary-next field points to the next boundary structure + * we were going to use, but there's no requirement that value + * be incremented before reconciliation updates the structure it + * points to, that is, there's no guarantee elements of the next + * boundary structure are still unchanged. Be defensive, clean + * up the "next" structure as well as the ones we know we used. + */ + last_used = r->bnd_next; + if (last_used < r->bnd_entries) + ++last_used; + for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) { + __wt_free(session, bnd->addr.addr); + __wt_free(session, bnd->dsk); + __wt_free(session, bnd->skip); + } + } +} + +/* + * __rec_skip_update_save -- + * Save a skipped WT_UPDATE list for later restoration. + */ +static int +__rec_skip_update_save( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip) +{ + WT_RET(__wt_realloc_def( + session, &r->skip_allocated, r->skip_next + 1, &r->skip)); + r->skip[r->skip_next].ins = ins; + r->skip[r->skip_next].rip = rip; + ++r->skip_next; + return (0); +} + +/* + * __rec_skip_update_move -- + * Move a skipped WT_UPDATE list from the per-page cache to a specific + * block's list. + */ +static int +__rec_skip_update_move( + WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip) +{ + WT_RET(__wt_realloc_def( + session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip)); + bnd->skip[bnd->skip_next] = *skip; + ++bnd->skip_next; + + skip->ins = NULL; + skip->rip = NULL; + return (0); +} + +/* + * __rec_txn_read -- + * Return the first visible update in a list (or NULL if none are visible), + * set a flag if any updates were skipped, track the maximum transaction ID on + * the page. + */ +static inline int +__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, + WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) +{ + WT_ITEM ovfl; + WT_PAGE *page; + WT_UPDATE *upd, *upd_list, *upd_ovfl; + size_t notused; + uint64_t max_txn, min_txn, txnid; + int skipped; + + *updp = NULL; + + page = r->page; + + /* + * If we're called with an WT_INSERT reference, use its WT_UPDATE + * list, else is an on-page row-store WT_UPDATE list. + */ + upd_list = ins == NULL ? WT_ROW_UPDATE(page, rip) : ins->upd; + skipped = 0; + + for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list; + upd != NULL; upd = upd->next) { + if ((txnid = upd->txnid) == WT_TXN_ABORTED) + continue; + + /* Track the largest/smallest transaction IDs on the list. */ + if (TXNID_LT(max_txn, txnid)) + max_txn = txnid; + if (TXNID_LT(txnid, min_txn)) + min_txn = txnid; + if (TXNID_LT(txnid, r->skipped_txn) && + !__wt_txn_visible_all(session, txnid)) + r->skipped_txn = txnid; + + /* + * Record whether any updates were skipped on the way to finding + * the first visible update. + * + * If updates were skipped before the one being written, future + * reads without intervening modifications to the page could + * see a different value; if no updates were skipped, the page + * can safely be marked clean and does not need to be + * reconciled until modified again. 
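The visibility scan in __rec_txn_read walks each update chain newest to oldest, tracks the extreme transaction IDs, notes whether any newer update was passed over, and selects the first update visible to this reader. The sketch below reduces that loop to its essentials, substituting a simple snapshot-ID comparison for WiredTiger's visibility test:

```c
#include <stdint.h>
#include <stddef.h>

#define	TXN_ABORTED	UINT64_MAX

struct upd { uint64_t txnid; struct upd *next; };

/* Stand-in for __wt_txn_visible(): visible if committed at or
 * before our snapshot. */
static int
visible(uint64_t snap, uint64_t txnid)
{
	return (txnid <= snap);
}

/* Return the first visible update; set *skippedp if any newer,
 * not-yet-visible updates were passed over on the way. */
static struct upd *
first_visible(struct upd *list, uint64_t snap,
    uint64_t *max_txnp, int *skippedp)
{
	struct upd *upd, *found;

	*max_txnp = 0;
	*skippedp = 0;
	for (found = NULL, upd = list; upd != NULL; upd = upd->next) {
		if (upd->txnid == TXN_ABORTED)
			continue;
		if (upd->txnid > *max_txnp)
			*max_txnp = upd->txnid;
		if (found == NULL) {
			if (visible(snap, upd->txnid))
				found = upd;
			else
				*skippedp = 1;	/* Page must stay dirty */
		}
	}
	return (found);
}

int
main(void)
{
	struct upd newer = { 9, NULL }, older = { 3, NULL };
	uint64_t max;
	int skipped;

	newer.next = &older;
	/* Snapshot 5 skips txn 9 and selects txn 3. */
	return (first_visible(&newer, 5, &max, &skipped) == &older &&
	    skipped && max == 9 ? 0 : 1);
}
```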
+ */ + if (*updp == NULL) { + if (__wt_txn_visible(session, txnid)) + *updp = upd; + else + skipped = 1; + } + } + + /* + * Track the maximum transaction ID in the page. We store this in the + * page at the end of reconciliation if no updates are skipped, it's + * used to avoid evicting clean pages from memory with changes required + * to satisfy a snapshot read. + */ + if (TXNID_LT(r->max_txn, max_txn)) + r->max_txn = max_txn; + + /* + * If all updates are globally visible and no updates were skipped, the + * page can be marked clean and we're done, regardless of whether we're + * evicting or checkpointing. + * + * The oldest transaction ID may have moved while we were scanning the + * page, so it is possible to skip an update but then find that by the + * end of the scan, all updates are stable. + */ + if (__wt_txn_visible_all(session, max_txn) && !skipped) + return (0); + + /* + * If some updates are not globally visible, or were skipped, the page + * cannot be marked clean. + */ + r->leave_dirty = 1; + + /* If we're not evicting, we're done, we know what we'll write. */ + if (!F_ISSET(r, WT_EVICTING)) + return (0); + + /* In some cases, there had better not be any updates we can't write. */ + if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) + WT_PANIC_RET(session, EINVAL, + "reconciliation illegally skipped an update"); + + /* + * If evicting and we aren't able to save/restore the not-yet-visible + * updates, the page can't be evicted. + */ + if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE)) + return (EBUSY); + + /* + * Evicting a page with not-yet-visible updates: save and restore the + * list of updates on a newly instantiated page. + * + * The order of the updates on the list matters so we can't move only + * the unresolved updates, we have to move the entire update list. + * + * Clear the returned update so our caller ignores the key/value pair + * in the case of an insert/append entry (everything we need is in the + * update list), and otherwise writes the original on-page key/value + * pair to which the update list applies. + */ + *updp = NULL; + + /* + * Handle the case were we don't want to write an original on-page value + * item to disk because it's been updated or removed. + * + * Here's the deal: an overflow value was updated or removed and its + * backing blocks freed. If any transaction in the system might still + * read the value, a copy was cached in page reconciliation tracking + * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction + * then chose the page and we're splitting it up in order to push parts + * of it out of memory. + * + * We could write the original on-page value item to disk... if we had + * a copy. The cache may not have a copy (a globally visible update + * would have kept a value from ever being cached), or an update that + * subsequent became globally visible could cause a cached value to be + * discarded. Either way, once there's a globally visible update, we + * may not have the value. + * + * Fortunately, if there's a globally visible update we don't care about + * the original version, so we simply ignore it, no transaction can ever + * try and read it. If there isn't a globally visible update, there had + * better be a cached value. + * + * In the latter case, we could write the value out to disk, but (1) we + * are planning on re-instantiating this page in memory, it isn't going + * to disk, and (2) the value item is eventually going to be discarded, + * that seems like a waste of a write. 
Instead, find the cached value + * and append it to the update list we're saving for later restoration. + */ + if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM && + !__wt_txn_visible_all(session, min_txn)) { + WT_RET(__wt_ovfl_txnc_search( + page, vpack->data, vpack->size, &ovfl)); + /* + * Create an update structure with an impossibly low transaction + * ID and append it to the update list we're about to save. + * Restoring that update list when this page is re-instantiated + * creates an update for the key/value pair visible to every + * running transaction in the system, ensuring the on-page value + * will be ignored. + */ + WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, ¬used)); + upd_ovfl->txnid = WT_TXN_NONE; + for (upd = upd_list; upd->next != NULL; upd = upd->next) + ; + upd->next = upd_ovfl; + } + + return (__rec_skip_update_save(session, r, ins, rip)); +} + +/* + * CHILD_RELEASE -- + * Macros to clean up during internal-page reconciliation, releasing the + * hazard pointer we're holding on child pages. + */ +#undef CHILD_RELEASE +#define CHILD_RELEASE(session, hazard, ref) do { \ + if (hazard) { \ + hazard = 0; \ + WT_TRET( \ + __wt_page_release(session, ref, WT_READ_NO_EVICT)); \ + } \ +} while (0) +#undef CHILD_RELEASE_ERR +#define CHILD_RELEASE_ERR(session, hazard, ref) do { \ + CHILD_RELEASE(session, hazard, ref); \ + WT_ERR(ret); \ +} while (0) + +/* + * __rec_child_modify -- + * Return if the internal page's child references any modifications. + */ +static int +__rec_child_modify(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_REF *ref, int *hazardp, int *statep) +{ + WT_DECL_RET; + WT_PAGE_MODIFY *mod; + + /* We may acquire a hazard pointer our caller must release. */ + *hazardp = 0; + +#define WT_CHILD_IGNORE 1 /* Deleted child: ignore */ +#define WT_CHILD_MODIFIED 2 /* Modified child */ +#define WT_CHILD_PROXY 3 /* Deleted child: proxy */ + *statep = 0; + + /* + * This function is called when walking an internal page to decide how + * to handle child pages referenced by the internal page, specifically + * if the child page is to be merged into its parent. + * + * Internal pages are reconciled for two reasons: first, when evicting + * an internal page, second by the checkpoint code when writing internal + * pages. During eviction, the subtree is locked down so all pages + * should be in the WT_REF_DISK or WT_REF_LOCKED state. During + * checkpoint, any eviction that might affect our review of an internal + * page is prohibited, however, as the subtree is not reserved for our + * exclusive use, there are other page states that must be considered. + */ + for (;; __wt_yield()) + switch (r->tested_ref_state = ref->state) { + case WT_REF_DISK: + /* On disk, not modified by definition. */ + goto done; + + case WT_REF_DELETED: + /* + * The child is in a deleted state. + * + * It's possible the state could change underneath us as + * the page is read in, and we can race between checking + * for a deleted state and looking at the transaction ID + * to see if the delete is visible to us. Lock down the + * structure. + */ + if (!WT_ATOMIC_CAS4( + ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + break; + ret = __rec_child_deleted(session, r, ref, statep); + WT_PUBLISH(ref->state, WT_REF_DELETED); + goto done; + + case WT_REF_LOCKED: + /* + * Locked. + * + * If evicting, the evicted page's subtree, including + * this child, was selected for eviction by us and the + * state is stable until we reset it, it's an in-memory + * state. 
This is the expected state for a child being + * merged into a page (where the page was selected by + * the eviction server for eviction). + */ + if (F_ISSET(r, WT_EVICTING)) + goto in_memory; + + /* + * If called during checkpoint, the child is being + * considered by the eviction server or the child is a + * fast-delete page being read. The eviction may have + * started before the checkpoint and so we must wait + * for the eviction to be resolved. I suspect we could + * handle fast-delete reads, but we can't distinguish + * between the two and fast-delete reads aren't expected + * to be common. + */ + break; + + case WT_REF_MEM: + /* + * In memory. + * + * If evicting, the evicted page's subtree, including + * this child, was selected for eviction by us and the + * state is stable until we reset it, it's an in-memory + * state. This is the expected state for a child being + * merged into a page (where the page belongs to a file + * being discarded from the cache during close). + */ + if (F_ISSET(r, WT_EVICTING)) + goto in_memory; + + /* + * If called during checkpoint, acquire a hazard pointer + * so the child isn't evicted, it's an in-memory case. + * + * This call cannot return split/restart, dirty page + * eviction is shutout during checkpoint, all splits in + * process will have completed before we walk any pages + * for checkpoint. + */ + if ((ret = __wt_page_in(session, ref, + WT_READ_CACHE | WT_READ_NO_EVICT | + WT_READ_NO_GEN | WT_READ_NO_WAIT)) == WT_NOTFOUND) { + ret = 0; + break; + } + *hazardp = 1; + goto in_memory; + + case WT_REF_READING: + /* + * Being read, not modified by definition. + * + * We should never be here during eviction, a child page + * in this state within an evicted page's subtree would + * have caused normally eviction to fail, and exclusive + * eviction shouldn't ever see pages being read. + */ + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + goto done; + + case WT_REF_SPLIT: + /* + * The page was split out from under us. + * + * We should never be here during eviction, a child page + * in this state within an evicted page's subtree would + * have caused eviction to fail. + * + * We should never be here during checkpoint, dirty page + * eviction is shutout during checkpoint, all splits in + * process will have completed before we walk any pages + * for checkpoint. + */ + WT_ASSERT(session, ref->state != WT_REF_SPLIT); + /* FALLTHROUGH */ + + WT_ILLEGAL_VALUE(session); + } + +in_memory: + /* + * In-memory states: the child is potentially modified if the page's + * modify structure has been instantiated. If the modify structure + * exists and the page has actually been modified, set that state. + * If that's not the case, we would normally use the original cell's + * disk address as our reference, but, if we're forced to instantiate + * a deleted child page and it's never modified, we end up here with + * a page that has a modify structure, no modifications, and no disk + * address. Ignore those pages, they're not modified and there is no + * reason to write the cell. + */ + mod = ref->page->modify; + if (mod != NULL && mod->flags != 0) + *statep = WT_CHILD_MODIFIED; + else if (ref->addr == NULL) { + *statep = WT_CHILD_IGNORE; + CHILD_RELEASE(session, *hazardp, ref); + } + +done: WT_HAVE_DIAGNOSTIC_YIELD; + return (ret); +} + +/* + * __rec_child_deleted -- + * Handle pages with leaf pages in the WT_REF_DELETED state. 
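+ * Depending on whether the deletion is visible and whether the leaf's + * backing blocks can still be discarded, the child is ignored entirely, + * written as a proxy cell, or (when evicting) causes reconciliation to + * give up; see the cases below.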
+ */ +static int +__rec_child_deleted( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep) +{ + WT_BM *bm; + WT_PAGE_DELETED *page_del; + size_t addr_size; + const uint8_t *addr; + + bm = S2BT(session)->bm; + page_del = ref->page_del; + + /* + * Internal pages with child leaf pages in the WT_REF_DELETED state are + * a special case during reconciliation. First, if the deletion was a + * result of a session truncate call, the deletion may not be visible to + * us. In that case, we proceed as with any change that's not visible + * during reconciliation by setting the skipped flag and ignoring the + * change for the purposes of writing the internal page. + * + * In this case, there must be an associated page-deleted structure, and + * it holds the transaction ID we care about. + */ + if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) { + /* + * In some cases, there had better not be any updates we can't + * write. + */ + if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) + WT_PANIC_RET(session, EINVAL, + "reconciliation illegally skipped an update"); + + /* If this page cannot be evicted, quit now. */ + if (F_ISSET(r, WT_EVICTING)) + return (EBUSY); + } + + /* + * The deletion is visible to us, deal with any underlying disk blocks. + * + * First, check to see if there is an address associated with this leaf: + * if there isn't, we're done, the underlying page is already gone. If + * the page still exists, check for any transactions in the system that + * might want to see the page's state before it's deleted. + * + * If any such transactions exist, we cannot discard the underlying leaf + * page to the block manager because the transaction may eventually read + * it. However, this write might be part of a checkpoint, and should we + * recover to that checkpoint, we'll need to delete the leaf page, else + * we'd leak it. The solution is to write a proxy cell on the internal + * page ensuring the leaf page is eventually discarded. + * + * If no such transactions exist, we can discard the leaf page to the + * block manager and no cell needs to be written at all. We do this + * outside of the underlying tracking routines because this action is + * permanent and irrevocable. (Clearing the address means we've lost + * track of the disk address in a permanent way. This is safe because + * there's no path to reading the leaf page again: if there's ever a + * read into this part of the name space again, the cache read function + * instantiates an entirely new page.) + */ + if (ref->addr != NULL && + (page_del == NULL || + __wt_txn_visible_all(session, page_del->txnid))) { + WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); + WT_RET(bm->free(bm, session, addr, addr_size)); + + if (__wt_off_page(ref->home, ref->addr)) { + __wt_free(session, ((WT_ADDR *)ref->addr)->addr); + __wt_free(session, ref->addr); + } + ref->addr = NULL; + } + + /* + * Minor memory cleanup: if a truncate call deleted this page and we + * were ever forced to instantiate the page in memory, we would have + * built a list of updates in the page reference in order to be able + * to abort the truncate. It's a cheap test to make that memory go + * away, we do it here because there's really nowhere else we do the + * checks. In short, if we have such a list, and the backing address + * blocks are gone, there can't be any transaction that can abort. 
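+ * (In other words, once the backing blocks have been freed above, the + * truncate can no longer be rolled back, so the saved update list can + * never be needed again.)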
+ */ + if (ref->addr == NULL && page_del != NULL) { + __wt_free(session, ref->page_del->update_list); + __wt_free(session, ref->page_del); + } + + /* + * If there's still a disk address, then we have to write a proxy + * record, otherwise, we can safely ignore this child page. + */ + *statep = ref->addr == NULL ? WT_CHILD_IGNORE : WT_CHILD_PROXY; + return (0); +} + +/* + * __rec_incr -- + * Update the memory tracking structure for a set of new entries. + */ +static inline void +__rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) +{ + /* + * The buffer code is fragile and prone to off-by-one errors -- check + * for overflow in diagnostic mode. + */ + WT_ASSERT(session, r->space_avail >= size); + WT_ASSERT(session, + WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->page_size)); + + r->entries += v; + r->space_avail -= size; + r->first_free += size; +} + +/* + * __rec_copy_incr -- + * Copy a key/value cell and buffer pair into the new image. + */ +static inline void +__rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv) +{ + size_t len; + uint8_t *p, *t; + + /* + * If there's only one chunk of data to copy (because the cell and data + * are being copied from the original disk page), the cell length won't + * be set, the WT_ITEM data/length will reference the data to be copied. + * + * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do + * the copy in-line. + */ + for (p = (uint8_t *)r->first_free, + t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len) + *p++ = *t++; + + /* The data can be quite large -- call memcpy. */ + if (kv->buf.size != 0) + memcpy(p, kv->buf.data, kv->buf.size); + + WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size); + __rec_incr(session, r, 1, kv->len); +} + +/* + * __rec_dict_replace -- + * Check for a dictionary match. + */ +static int +__rec_dict_replace( + WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_KV *val) +{ + WT_DICTIONARY *dp; + uint64_t offset; + + /* + * We optionally create a dictionary of values and only write a unique + * value once per page, using a special "copy" cell for all subsequent + * copies of the value. We have to do the cell build and resolution at + * this low level because we need physical cell offsets for the page. + * + * Sanity check: short-data cells can be smaller than dictionary-copy + * cells. If the data is already small, don't bother doing the work. + * This isn't just work avoidance: on-page cells can't grow as a result + * of writing a dictionary-copy cell, the reconciliation functions do a + * split-boundary test based on the size required by the value's cell; + * if we grow the cell after that test we'll potentially write off the + * end of the buffer's memory. + */ + if (val->buf.size <= WT_INTPACK32_MAXSIZE) + return (0); + WT_RET(__rec_dictionary_lookup(session, r, val, &dp)); + if (dp == NULL) + return (0); + + /* + * If the dictionary cell reference is not set, we're creating a new + * entry in the dictionary, update its location. + * + * If the dictionary cell reference is set, we have a matching value. + * Create a copy cell instead. + */ + if (dp->cell == NULL) + dp->cell = r->first_free; + else { + offset = WT_PTRDIFF(r->first_free, dp->cell); + val->len = val->cell_len = + __wt_cell_pack_copy(&val->cell, rle, offset); + val->buf.data = NULL; + val->buf.size = 0; + } + return (0); +} + +/* + * __rec_key_state_update -- + * Update prefix and suffix compression based on the last key. 
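+ * + * (Worked example, for illustration: with prefix compression, if the last + * key was "applesauce" and the current key is "applesoda", only the shared + * prefix length (6) and the suffix "oda" need to be written. With suffix + * compression, if the last key on the previous page is "apple" and the key + * being promoted is "banana", the single byte "b" is enough to route + * searches to the correct page.)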
+ */ +static inline void +__rec_key_state_update(WT_RECONCILE *r, int ovfl_key) +{ + WT_ITEM *a; + + /* + * If writing an overflow key onto the page, don't update the "last key" + * value, and leave the state of prefix compression alone. (If we are + * currently doing prefix compression, we have a key state which will + * continue to work, we're just skipping the key just created because + * it's an overflow key and doesn't participate in prefix compression. + * If we are not currently doing prefix compression, we can't start, an + * overflow key doesn't give us any state.) + * + * Additionally, if we wrote an overflow key onto the page, turn off the + * suffix compression of row-store internal node keys. (When we split, + * "last key" is the largest key on the previous page, and "cur key" is + * the first key on the next page, which is being promoted. In some + * cases we can discard bytes from the "cur key" that are not needed to + * distinguish between the "last key" and "cur key", compressing the + * size of keys on internal nodes. If we just built an overflow key, + * we're not going to update the "last key", making suffix compression + * impossible for the next key. Alternatively, we could remember where + * the last key was on the page, detect it's an overflow key, read it + * from disk and do suffix compression, but that's too much work for an + * unlikely event.) + * + * If we're not writing an overflow key on the page, update the last-key + * value and turn on both prefix and suffix compression. + */ + if (ovfl_key) + r->key_sfx_compress = 0; + else { + a = r->cur; + r->cur = r->last; + r->last = a; + + r->key_pfx_compress = r->key_pfx_compress_conf; + r->key_sfx_compress = r->key_sfx_compress_conf; + } +} + +/* + * Macros from fixed-length entries to/from bytes. + */ +#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \ + ((uint32_t)((((bytes) * 8) / (btree)->bitcnt))) +#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \ + ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8)) + +/* + * __rec_leaf_page_max -- + * Figure out the maximum leaf page size for the reconciliation. + */ +static inline uint32_t +__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BTREE *btree; + WT_PAGE *page; + uint32_t page_size; + + btree = S2BT(session); + page = r->page; + + page_size = 0; + switch (page->type) { + case WT_PAGE_COL_FIX: + /* + * Column-store pages can grow if there are missing records + * (that is, we lost a chunk of the range, and have to write + * deleted records). Fixed-length objects are a problem, if + * there's a big missing range, we could theoretically have to + * write large numbers of missing objects. + */ + page_size = (uint32_t)WT_ALIGN(WT_FIX_ENTRIES_TO_BYTES(btree, + r->salvage->take + r->salvage->missing), btree->allocsize); + break; + case WT_PAGE_COL_VAR: + /* + * Column-store pages can grow if there are missing records + * (that is, we lost a chunk of the range, and have to write + * deleted records). Variable-length objects aren't usually a + * problem because we can write any number of deleted records + * in a single page entry because of the RLE, we just need to + * ensure that additional entry fits. + */ + break; + case WT_PAGE_ROW_LEAF: + default: + /* + * Row-store pages can't grow, salvage never does anything + * other than reduce the size of a page read from disk. + */ + break; + } + + /* + * Default size for variable-length column-store and row-store pages + * during salvage is the maximum leaf page size. 
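+ * (Illustrative numbers: a salvage range with a million 8-bit fixed-length + * entries to take or re-create as deleted needs roughly 1MB of page space, + * however small the configured maximum leaf page is.)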
+ */ + if (page_size < btree->maxleafpage) + page_size = btree->maxleafpage; + + /* + * The page we read from the disk should be smaller than the page size + * we just calculated, check out of paranoia. + */ + if (page_size < page->dsk->mem_size) + page_size = page->dsk->mem_size; + + /* + * Salvage is the backup plan: don't let this fail. + */ + return (page_size * 2); +} + +/* + * __rec_split_bnd_init -- + * Initialize a single boundary structure. + */ +static void +__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) +{ + bnd->start = NULL; + + bnd->recno = 0; + bnd->entries = 0; + + __wt_free(session, bnd->addr.addr); + WT_CLEAR(bnd->addr); + bnd->size = 0; + bnd->cksum = 0; + __wt_free(session, bnd->dsk); + + __wt_free(session, bnd->skip); + bnd->skip_next = 0; + bnd->skip_allocated = 0; + + /* Ignore the key, we re-use that memory in each new reconciliation. */ + + bnd->already_compressed = 0; +} + +/* + * __rec_split_bnd_grow -- + * Grow the boundary array as necessary. + */ +static int +__rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + /* + * Make sure there's enough room for another boundary. The calculation + * is +2, because when filling in the current boundary's information, + * we save the start point of the next boundary (for example, a record + * number or key), in the (current + 1) slot. + * + * For the same reason, we're always initializing one ahead. + */ + WT_RET(__wt_realloc_def( + session, &r->bnd_allocated, r->bnd_next + 2, &r->bnd)); + r->bnd_entries = r->bnd_allocated / sizeof(r->bnd[0]); + + __rec_split_bnd_init(session, &r->bnd[r->bnd_next + 1]); + + return (0); +} + +/* + * __rec_split_init -- + * Initialization for the reconciliation split functions. + */ +static int +__rec_split_init(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint32_t max) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t corrected_page_size; + + btree = S2BT(session); + bm = btree->bm; + + /* + * The maximum leaf page size governs when an in-memory leaf page splits + * into multiple on-disk pages; however, salvage can't be allowed to + * split, there's no parent page yet. If we're doing salvage, override + * the caller's selection of a maximum page size, choosing a page size + * that ensures we won't split. + */ + if (r->salvage != NULL) + max = __rec_leaf_page_max(session, r); + + /* + * Set the page sizes. If we're doing the page layout, the maximum page + * size is the same as the page size. If the application is doing page + * layout (raw compression is configured), we accumulate some amount of + * additional data because we don't know how well it will compress, and + * we don't want to increment our way up to the amount of data needed by + * the application to successfully compress to the target page size. + */ + r->page_size = r->page_size_max = max; + if (r->raw_compression) + r->page_size *= 10; + + /* + * Ensure the disk image buffer is large enough for the max object, as + * corrected by the underlying block manager. + */ + corrected_page_size = r->page_size; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_init(session, &r->dsk, corrected_page_size)); + + /* + * Clear the disk page's header and block-manager space, set the page + * type (the type doesn't change, and setting it later would require + * additional code in a few different places). 
+ */ + dsk = r->dsk.mem; + memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree)); + dsk->type = page->type; + + /* + * If we have to split, we want to choose a smaller page size for the + * split pages, because otherwise we could end up splitting one large + * packed page over and over. We don't want to pick the minimum size + * either, because that penalizes an application that did a bulk load + * and subsequently inserted a few items into packed pages. Currently + * defaulted to 75%, but I have no empirical evidence that's "correct". + * + * The maximum page size may be a multiple of the split page size (for + * example, there's a maximum page size of 128KB, but because the table + * is active and we don't want to split a lot, the split size is 20KB). + * The maximum page size may NOT be an exact multiple of the split page + * size. + * + * It's lots of work to build these pages and don't want to start over + * when we reach the maximum page size (it's painful to restart after + * creating overflow items and compacted data, for example, as those + * items have already been written to disk). So, the loop calls the + * helper functions when approaching a split boundary, and we save the + * information at that point. That allows us to go back and split the + * page at the boundary points if we eventually overflow the maximum + * page size. + * + * Finally, all this doesn't matter for fixed-size column-store pages, + * raw compression, and salvage. Fixed-size column store pages can + * split under (very) rare circumstances, but they're allocated at a + * fixed page size, never anything smaller. In raw compression, the + * underlying compression routine decides when we split, so it's not + * our problem. In salvage, as noted above, we can't split at all. + */ + if (r->raw_compression || r->salvage != NULL) { + r->split_size = 0; + r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + } + else if (page->type == WT_PAGE_COL_FIX) { + r->split_size = r->page_size_max; + r->space_avail = + r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + } else { + r->split_size = __wt_split_page_size(btree, r->page_size_max); + r->space_avail = + r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + } + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); + + /* Initialize the first boundary. */ + r->bnd_next = 0; + WT_RET(__rec_split_bnd_grow(session, r)); + __rec_split_bnd_init(session, &r->bnd[0]); + r->bnd[0].recno = recno; + r->bnd[0].start = WT_PAGE_HEADER_BYTE(btree, dsk); + + /* + * If the maximum page size is the same as the split page size, either + * because of the object type or application configuration, there isn't + * any need to maintain split boundaries within a larger page. + * + * No configuration for salvage here, because salvage can't split. + */ + if (r->raw_compression) + r->bnd_state = SPLIT_TRACKING_RAW; + else if (max == r->split_size) + r->bnd_state = SPLIT_TRACKING_OFF; + else + r->bnd_state = SPLIT_BOUNDARY; + + /* Initialize the entry counters. */ + r->entries = r->total_entries = 0; + + /* Initialize the starting record number. */ + r->recno = recno; + + /* New page, compression off. */ + r->key_pfx_compress = r->key_sfx_compress = 0; + + return (0); +} + +/* + * __rec_is_checkpoint -- + * Return if we're writing a checkpoint. + */ +static int +__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd) +{ + /* + * Check to see if we're going to create a checkpoint. + * + * This function exists as a place to hang this comment. 
+ * + * Any time we write the root page of the tree without splitting we are + * creating a checkpoint (and have to tell the underlying block manager + * so it creates and writes the additional information checkpoints + * require). However, checkpoints are completely consistent, and so we + * have to resolve information about the blocks we're expecting to free + * as part of the checkpoint, before writing the checkpoint. In short, + * we don't do checkpoint writes here; clear the boundary information as + * a reminder and create the checkpoint during wrapup. + */ + if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) { + bnd->addr.addr = NULL; + bnd->addr.size = 0; + bnd->addr.type = 0; + return (1); + } + return (0); +} + +/* + * __rec_split_row_promote_cell -- + * Get a key from a cell for the purposes of promotion. + */ +static int +__rec_split_row_promote_cell( + WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, WT_ITEM *key) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *kpack, _kpack; + + btree = S2BT(session); + kpack = &_kpack; + + /* + * The cell had better have a zero-length prefix and not be a copy cell; + * the first cell on a page cannot refer an earlier cell on the page. + */ + cell = WT_PAGE_HEADER_BYTE(btree, dsk); + __wt_cell_unpack(cell, kpack); + WT_ASSERT(session, + kpack->prefix == 0 && kpack->raw != WT_CELL_VALUE_COPY); + + WT_RET(__wt_cell_data_copy(session, dsk->type, kpack, key)); + return (0); +} + +/* + * __rec_split_row_promote -- + * Key promotion for a row-store. + */ +static int +__rec_split_row_promote( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, uint8_t type) +{ + WT_BTREE *btree; + WT_DECL_ITEM(update); + WT_DECL_RET; + WT_ITEM *max; + WT_UPD_SKIPPED *skip; + size_t cnt, len, size; + uint32_t i; + const uint8_t *pa, *pb; + int cmp; + + /* + * For a column-store, the promoted key is the recno and we already have + * a copy. For a row-store, it's the first key on the page, a variable- + * length byte string, get a copy. + * + * This function is called from the split code at each split boundary, + * but that means we're not called before the first boundary, and we + * will eventually have to get the first key explicitly when splitting + * a page. + * + * For the current slot, take the last key we built, after doing suffix + * compression. The "last key we built" describes some process: before + * calling the split code, we must place the last key on the page before + * the boundary into the "last" key structure, and the first key on the + * page after the boundary into the "current" key structure, we're going + * to compare them for suffix compression. + * + * Suffix compression is a hack to shorten keys on internal pages. We + * only need enough bytes in the promoted key to ensure searches go to + * the correct page: the promoted key has to be larger than the last key + * on the leaf page preceding it, but we don't need any more bytes than + * that. In other words, we can discard any suffix bytes not required + * to distinguish between the key being promoted and the last key on the + * leaf page preceding it. This can only be done for the first level of + * internal pages, you cannot repeat suffix truncation as you split up + * the tree, it loses too much information. + * + * Note #1: if the last key on the previous page was an overflow key, + * we don't have the in-memory key against which to compare, and don't + * try to do suffix compression. 
The code for that case turns suffix + * compression off for the next key, we don't have to deal with it here. + */ + if (type != WT_PAGE_ROW_LEAF || !r->key_sfx_compress) + return (__wt_buf_set(session, key, r->cur->data, r->cur->size)); + + btree = S2BT(session); + WT_RET(__wt_scr_alloc(session, 0, &update)); + + /* + * Note #2: if we skipped updates, an update key may be larger than the + * last key stored in the previous block (probable for append-centric + * workloads). If there are skipped updates, check for one larger than + * the last key and smaller than the current key. + */ + max = r->last; + for (i = r->skip_next; i > 0; --i) { + skip = &r->skip[i - 1]; + if (skip->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, r->page, skip->rip, update, 0)); + else { + update->data = WT_INSERT_KEY(skip->ins); + update->size = WT_INSERT_KEY_SIZE(skip->ins); + } + + /* Compare against the current key, it must be less. */ + WT_ERR(__wt_compare( + session, btree->collator, update, r->cur, &cmp)); + if (cmp >= 0) + continue; + + /* Compare against the last key, it must be greater. */ + WT_ERR(__wt_compare( + session, btree->collator, update, r->last, &cmp)); + if (cmp >= 0) + max = update; + + /* + * The skipped updates are in key-sort order so the entry we're + * looking for is either the last one or the next-to-last one + * in the list. Once we've compared an entry against the last + * key on the page, we're done. + */ + break; + } + + /* + * The largest key on the last block must sort before the current key, + * so we'll either find a larger byte value in the current key, or the + * current key will be a longer key, and the interesting byte is one + * past the length of the shorter key. + */ + pa = max->data; + pb = r->cur->data; + len = WT_MIN(max->size, r->cur->size); + size = len + 1; + for (cnt = 1; len > 0; ++cnt, --len, ++pa, ++pb) + if (*pa != *pb) { + if (size != cnt) { + WT_STAT_FAST_DATA_INCRV(session, + rec_suffix_compression, size - cnt); + size = cnt; + } + break; + } + ret = __wt_buf_set(session, key, r->cur->data, size); + +err: __wt_scr_free(&update); + return (ret); +} + +/* + * __rec_split -- + * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper" + * has 3 doubled letters in a row? Sweet-tooth does, too.) + */ +static int +__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BTREE *btree; + WT_BOUNDARY *last, *next; + WT_PAGE_HEADER *dsk; + uint32_t len; + + /* + * We should never split during salvage, and we're about to drop core + * because there's no parent page. + */ + if (r->salvage != NULL) + WT_PANIC_RET(session, WT_PANIC, + "%s page too large, attempted split during salvage", + __wt_page_type_string(r->page->type)); + + /* + * Handle page-buffer size tracking; we have to do this work in every + * reconciliation loop, and I don't want to repeat the code that many + * times. + */ + btree = S2BT(session); + dsk = r->dsk.mem; + + /* Hitting a page boundary resets the dictionary, in all cases. */ + __rec_dictionary_reset(r); + + /* + * There are 3 cases we have to handle. + * + * #1 + * About to cross a split boundary: save current boundary information + * and return. + * + * #2 + * About to cross the maximum boundary: use saved boundary information + * to write all of the split pages. + * + * #3 + * About to cross a split boundary, but we've either already done the + * split thing when we approached the maximum boundary, in which + * case we write the page and keep going, or we were never tracking + * split boundaries at all. 
+ * + * Cases #1 and #2 are the hard ones: we're called when we're about to + * cross each split boundary, and we save information away so we can + * split if we have to. We're also called when we're about to cross + * the maximum page boundary: in that case, we do the actual split and + * clean up all the previous boundaries, then keep going. + */ + switch (r->bnd_state) { + case SPLIT_BOUNDARY: /* Case #1 */ + /* + * Save the information about where we are when the split would + * have happened. + */ + WT_RET(__rec_split_bnd_grow(session, r)); + last = &r->bnd[r->bnd_next++]; + next = last + 1; + + /* Set the number of entries for the just finished chunk. */ + last->entries = r->entries - r->total_entries; + r->total_entries = r->entries; + + /* Set the key for the next chunk. */ + next->recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || + dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &next->key, dsk->type)); + + /* + * Set the starting buffer address and clear the entries (the + * latter not required, but cleaner). + */ + next->start = r->first_free; + next->entries = 0; + + /* + * Set the space available to another split-size chunk, if we + * have one. If we don't have room for another split chunk, + * add whatever space remains in the maximum page size, and + * hope it's enough. + */ + len = WT_PTRDIFF32(r->first_free, dsk); + if (len + r->split_size <= r->page_size) + r->space_avail = + r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + else { + r->bnd_state = SPLIT_MAX; + r->space_avail = r->page_size - + (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); + } + break; + case SPLIT_MAX: /* Case #2 */ + /* + * It didn't all fit into a single page. + * + * Cycle through the saved split-point information, writing the + * split chunks we have tracked. + */ + WT_RET(__rec_split_fixup(session, r)); + + /* We're done saving split chunks. */ + r->bnd_state = SPLIT_TRACKING_OFF; + break; + case SPLIT_TRACKING_OFF: /* Case #3 */ + /* + * It didn't all fit, but either we've already noticed it and + * are now processing the rest of the page at the split-size + * boundaries, or the split size was the same as the page size, + * so we never bothered with saving split-point information. + */ + WT_RET(__rec_split_bnd_grow(session, r)); + last = &r->bnd[r->bnd_next++]; + next = last + 1; + + /* + * Set the key for the next chunk (before writing the block, a + * key range is needed in that code). + */ + next->recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || + dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &next->key, dsk->type)); + + /* Clear the entries (not required, but cleaner). */ + next->entries = 0; + + /* Finalize the header information and write the page. */ + dsk->recno = last->recno; + dsk->u.entries = r->entries; + dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); + WT_RET(__rec_split_write(session, r, last, &r->dsk, 0)); + + /* + * Set the caller's entry count and buffer information for the + * next chunk. We only get here if we're not splitting or have + * already split, so it's split-size chunks from here on out. + */ + r->entries = 0; + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); + r->space_avail = + r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + break; + case SPLIT_TRACKING_RAW: + WT_ILLEGAL_VALUE(session); + } + return (0); +} + +/* + * __rec_split_raw_worker -- + * Handle the raw compression page reconciliation bookkeeping. 
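+ * The walk below records a candidate split point at each key (row stores) + * or value (column stores) boundary past the first allocation-size bytes, + * then asks the compressor how many of those candidates fit in the target + * block; anything left over is carried forward into the next chunk.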
+ */ +static int +__rec_split_raw_worker( + WT_SESSION_IMPL *session, WT_RECONCILE *r, int no_more_rows) +{ + WT_BM *bm; + WT_BOUNDARY *last, *next; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_COMPRESSOR *compressor; + WT_DECL_RET; + WT_ITEM *dst, *write_ref; + WT_PAGE_HEADER *dsk, *dsk_dst; + WT_SESSION *wt_session; + size_t corrected_page_size, len, result_len; + uint64_t recno; + uint32_t entry, i, result_slots, slots; + int last_block; + uint8_t *dsk_start; + + wt_session = (WT_SESSION *)session; + btree = S2BT(session); + bm = btree->bm; + + unpack = &_unpack; + compressor = btree->compressor; + dst = &r->raw_destination; + dsk = r->dsk.mem; + + WT_RET(__rec_split_bnd_grow(session, r)); + last = &r->bnd[r->bnd_next]; + next = last + 1; + + /* + * Build arrays of offsets and cumulative counts of cells and rows in + * the page: the offset is the byte offset to the possible split-point + * (adjusted for an initial chunk that cannot be compressed), entries + * is the cumulative page entries covered by the byte offset, recnos is + * the cumulative rows covered by the byte offset. + */ + if (r->entries >= r->raw_max_slots) { + __wt_free(session, r->raw_entries); + __wt_free(session, r->raw_offsets); + __wt_free(session, r->raw_recnos); + r->raw_max_slots = 0; + + i = r->entries + 100; + WT_RET(__wt_calloc_def(session, i, &r->raw_entries)); + WT_RET(__wt_calloc_def(session, i, &r->raw_offsets)); + if (dsk->type == WT_PAGE_COL_INT || + dsk->type == WT_PAGE_COL_VAR) + WT_RET(__wt_calloc_def(session, i, &r->raw_recnos)); + r->raw_max_slots = i; + } + + /* + * We're going to walk the disk image, which requires setting the + * number of entries. + */ + dsk->u.entries = r->entries; + + /* + * We track the record number at each column-store split point, set an + * initial value. + */ + recno = 0; + if (dsk->type == WT_PAGE_COL_VAR) + recno = last->recno; + + entry = slots = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { + ++entry; + + /* + * Row-store pages can split at keys, but not at values, + * column-store pages can split at values. + */ + __wt_cell_unpack(cell, unpack); + switch (unpack->type) { + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + case WT_CELL_KEY_SHORT: + break; + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + case WT_CELL_DEL: + case WT_CELL_VALUE: + case WT_CELL_VALUE_OVFL: + case WT_CELL_VALUE_SHORT: + if (dsk->type == WT_PAGE_COL_INT) { + recno = unpack->v; + break; + } + if (dsk->type == WT_PAGE_COL_VAR) { + recno += __wt_cell_rle(unpack); + break; + } + r->raw_entries[slots] = entry; + continue; + WT_ILLEGAL_VALUE(session); + } + + /* + * We can't compress the first 64B of the block (it must be + * written without compression), and a possible split point + * may appear in that 64B; keep it simple, ignore the first + * allocation size of data, anybody splitting smaller than + * that (as calculated before compression), is doing it wrong. + */ + if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize) + r->raw_offsets[++slots] = + WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP); + + if (dsk->type == WT_PAGE_COL_INT || + dsk->type == WT_PAGE_COL_VAR) + r->raw_recnos[slots] = recno; + r->raw_entries[slots] = entry; + } + + /* + * If we haven't managed to find at least one split point, we're done, + * don't bother calling the underlying compression function. 
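+ * (For example, a page holding a single enormous key/value pair never + * produces a legal split point past the first allocation-size bytes, so + * there is nothing useful to hand the compressor.)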
+ */ + if (slots == 0) { + result_len = 0; + result_slots = 0; + goto no_slots; + } + + /* The slot at array's end is the total length of the data. */ + r->raw_offsets[++slots] = + WT_STORE_SIZE(WT_PTRDIFF(cell, dsk) - WT_BLOCK_COMPRESS_SKIP); + + /* + * Allocate a destination buffer. If there's a pre-size function, use + * it to determine the destination buffer's minimum size, otherwise the + * destination buffer is documented to be at least the maximum object + * size. + * + * The destination buffer really only needs to be large enough for the + * target block size, corrected for the requirements of the underlying + * block manager. If the target block size is 8KB, that's a multiple + * of 512B and so the underlying block manager is fine with it. But... + * we don't control what the pre_size method returns us as a required + * size, and we don't want to document the compress_raw method has to + * skip bytes in the buffer because that's confusing, so do something + * more complicated. First, find out how much space the compress_raw + * function might need, either the value returned from pre_size, or the + * maximum object size. Add the compress-skip bytes, and then correct + * that value for the underlying block manager. As a result, we have + * a destination buffer that's the right "object" size when calling the + * compress_raw method, and there are bytes in the header just for us. + */ + if (compressor->pre_size == NULL) + result_len = r->page_size_max; + else + WT_RET(compressor->pre_size(compressor, wt_session, + (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, + (size_t)r->raw_offsets[slots], &result_len)); + corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_init(session, dst, corrected_page_size)); + + /* + * Copy the header bytes into the destination buffer, then call the + * compression function. + */ + memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP); + ret = compressor->compress_raw(compressor, wt_session, + r->page_size_max, btree->split_pct, + WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, + r->raw_offsets, slots, + (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP, + result_len, no_more_rows, &result_len, &result_slots); + switch (ret) { + case EAGAIN: + /* + * The compression function wants more rows; accumulate and + * retry. + * + * Reset the resulting slots count, just in case the compression + * function modified it before giving up. + */ + result_slots = 0; + break; + case 0: + /* + * If the compression function returned zero result slots, it's + * giving up and we write the original data. (This is a pretty + * bad result: we've not done compression on a block much larger + * than the maximum page size, but once compression gives up, + * there's not much else we can do.) + * + * If the compression function returned non-zero result slots, + * we were successful and have a block to write. + */ + if (result_slots == 0) { + WT_STAT_FAST_DATA_INCR(session, compress_raw_fail); + + /* + * If there are no more rows, we can write the original + * data from the original buffer. + */ + if (no_more_rows) + break; + + /* + * Copy the original data to the destination buffer, as + * if the compression function simply copied it. Take + * all but the last row of the original data (the last + * row has to be set as the key for the next block). 
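+ * (Illustrative: if the accumulated data has ten rows with recorded split + * points, everything up to the last split point is copied out as an + * uncompressed block and the final row stays in the working buffer, where + * its key can start the next chunk.)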
+ */ + result_slots = slots - 1; + result_len = r->raw_offsets[result_slots]; + WT_RET(__wt_buf_grow( + session, dst, result_len + WT_BLOCK_COMPRESS_SKIP)); + memcpy((uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP, + (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, + result_len); + + /* + * Mark it as uncompressed so the standard compression + * function is called before the buffer is written. + */ + last->already_compressed = 0; + } else { + WT_STAT_FAST_DATA_INCR(session, compress_raw_ok); + + /* + * If there are more rows and the compression function + * consumed all of the current data, there are problems: + * First, with row-store objects, we're potentially + * skipping updates, we must have a key for the next + * block so we know with what block a skipped update is + * associated. Second, if the compression function + * compressed all of the data, we're not pushing it + * hard enough (unless we got lucky and gave it exactly + * the right amount to work with, which is unlikely). + * Handle both problems by accumulating more data any + * time we're not writing the last block and compression + * ate all of the rows. + */ + if (result_slots == slots && !no_more_rows) + result_slots = 0; + else + last->already_compressed = 1; + } + break; + default: + return (ret); + } + +no_slots: + /* + * Check for the last block we're going to write: if no more rows and + * we failed to compress anything, or we compressed everything, it's + * the last block. + */ + last_block = no_more_rows && + (result_slots == 0 || result_slots == slots); + + if (result_slots != 0) { + /* + * We have a block, finalize the header information. + */ + dst->size = result_len + WT_BLOCK_COMPRESS_SKIP; + dsk_dst = dst->mem; + dsk_dst->recno = last->recno; + dsk_dst->mem_size = + r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP; + dsk_dst->u.entries = r->raw_entries[result_slots - 1]; + + /* + * There is likely a remnant in the working buffer that didn't + * get compressed; copy it down to the start of the buffer and + * update the starting record number, free space and so on. + * !!! + * Note use of memmove, the source and destination buffers can + * overlap. + */ + len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk + + r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP); + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len); + + r->entries -= r->raw_entries[result_slots - 1]; + r->first_free = dsk_start + len; + r->space_avail = + r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); + + /* + * Set the key for the next block (before writing the block, a + * key range is needed in that code). + */ + switch (dsk->type) { + case WT_PAGE_COL_INT: + next->recno = r->raw_recnos[result_slots]; + break; + case WT_PAGE_COL_VAR: + next->recno = r->raw_recnos[result_slots - 1]; + break; + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + next->recno = 0; + if (!last_block) { + /* + * Confirm there was uncompressed data remaining + * in the buffer, we're about to read it for the + * next chunk's initial key. + */ + WT_ASSERT(session, len > 0); + WT_RET(__rec_split_row_promote_cell( + session, dsk, &next->key)); + } + break; + } + write_ref = dst; + } else if (no_more_rows) { + /* + * Compression failed and there are no more rows to accumulate, + * write the original buffer instead. 
+ */ + WT_STAT_FAST_DATA_INCR(session, compress_raw_fail); + + dsk->recno = last->recno; + dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); + dsk->u.entries = r->entries; + + r->entries = 0; + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); + r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + + write_ref = &r->dsk; + last->already_compressed = 0; + } else { + /* + * Compression failed, there are more rows to accumulate and the + * compression function wants to try again; increase the size of + * the "page" and try again after we accumulate some more rows. + */ + WT_STAT_FAST_DATA_INCR(session, compress_raw_fail_temporary); + + len = WT_PTRDIFF(r->first_free, r->dsk.mem); + corrected_page_size = r->page_size * 2; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size)); + r->page_size *= 2; + r->first_free = (uint8_t *)r->dsk.mem + len; + r->space_avail = + r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); + return (0); + } + + /* We have a block, update the boundary counter. */ + ++r->bnd_next; + + /* + * If we are writing the whole page in our first/only attempt, it might + * be a checkpoint (checkpoints are only a single page, by definition). + * Further, checkpoints aren't written here, the wrapup functions do the + * write, and they do the write from the original buffer location. If + * it's a checkpoint and the block isn't in the right buffer, copy it. + * + * If it's not a checkpoint, write the block. + */ + if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) { + if (write_ref == dst) + WT_RET(__wt_buf_set( + session, &r->dsk, dst->mem, dst->size)); + } else + WT_RET( + __rec_split_write(session, r, last, write_ref, last_block)); + return (0); +} + +/* + * __rec_raw_decompress -- + * Decompress a raw-compressed image. + */ +static int +__rec_raw_decompress( + WT_SESSION_IMPL *session, const void *image, size_t size, void *retp) +{ + WT_BTREE *btree; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_PAGE_HEADER const *dsk; + size_t result_len; + + btree = S2BT(session); + dsk = image; + + /* + * We skipped an update and we can't write a block, but unfortunately, + * the block has already been compressed. Decompress the block so we + * can subsequently re-instantiate it in memory. + */ + WT_RET(__wt_scr_alloc(session, dsk->mem_size, &tmp)); + memcpy(tmp->mem, image, WT_BLOCK_COMPRESS_SKIP); + WT_ERR(btree->compressor->decompress(btree->compressor, + &session->iface, + (uint8_t *)image + WT_BLOCK_COMPRESS_SKIP, + size - WT_BLOCK_COMPRESS_SKIP, + (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP, + dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, + &result_len)); + if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) + WT_ERR(__wt_illegal_value(session, btree->dhandle->name)); + + WT_ERR(__wt_strndup(session, tmp->data, dsk->mem_size, retp)); + WT_ASSERT(session, __wt_verify_dsk_image( + session, "[raw evict split]", tmp->data, dsk->mem_size, 0) == 0); + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __rec_split_raw -- + * Raw compression split routine. + */ +static inline int +__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + return (__rec_split_raw_worker(session, r, 0)); +} + +/* + * __rec_split_finish_std -- + * Finish processing a page, standard version. + */ +static int +__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BOUNDARY *bnd; + WT_PAGE_HEADER *dsk; + + /* Adjust the boundary information based on our split status. 
*/ + switch (r->bnd_state) { + case SPLIT_BOUNDARY: + case SPLIT_MAX: + /* + * We never split, the reconciled page fit into a maximum page + * size. Change the first boundary slot to represent the full + * page (the first boundary slot is largely correct, just update + * the number of entries). + */ + r->bnd_next = 0; + break; + case SPLIT_TRACKING_OFF: + /* + * If we have already split, or aren't tracking boundaries, put + * the remaining data in the next boundary slot. + */ + WT_RET(__rec_split_bnd_grow(session, r)); + break; + case SPLIT_TRACKING_RAW: + /* + * We were configured for raw compression, but never actually + * wrote anything. + */ + break; + WT_ILLEGAL_VALUE(session); + } + + /* + * We only arrive here with no entries to write if the page was entirely + * empty, and if the page is empty, we merge it into its parent during + * the parent's reconciliation. A page with skipped updates isn't truly + * empty, continue on. + */ + if (r->entries == 0 && r->skip_next == 0) + return (0); + + /* Set the boundary reference and increment the count. */ + bnd = &r->bnd[r->bnd_next++]; + bnd->entries = r->entries; + + /* Finalize the header information. */ + dsk = r->dsk.mem; + dsk->recno = bnd->recno; + dsk->u.entries = r->entries; + dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); + + /* If this is a checkpoint, we're done, otherwise write the page. */ + return ( + __rec_is_checkpoint(r, bnd) ? 0 : + __rec_split_write(session, r, bnd, &r->dsk, 1)); +} + +/* + * __rec_split_finish -- + * Finish processing a page. + */ +static int +__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + /* We're done reconciling - write the final page */ + if (r->raw_compression && r->entries != 0) { + while (r->entries != 0) + WT_RET(__rec_split_raw_worker(session, r, 1)); + } else + WT_RET(__rec_split_finish_std(session, r)); + + return (0); +} + +/* + * __rec_split_fixup -- + * Fix up after crossing the maximum page boundary. + */ +static int +__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_PAGE_HEADER *dsk; + uint32_t i, len; + uint8_t *dsk_start; + + /* + * When we overflow physical limits of the page, we walk the list of + * split chunks we've created and write those pages out, then update + * the caller's information. + */ + btree = S2BT(session); + + /* + * The data isn't laid out on a page boundary or nul padded; copy it to + * a clean, aligned, padded buffer before writing it. + * + * Allocate a scratch buffer to hold the new disk image. Copy the + * WT_PAGE_HEADER header onto the scratch buffer, most of the header + * information remains unchanged between the pages. + */ + WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp)); + dsk = tmp->mem; + memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE); + + /* + * For each split chunk we've created, update the disk image and copy + * it into place. + */ + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { + /* Copy the page contents to the temporary buffer. */ + len = WT_PTRDIFF32((bnd + 1)->start, bnd->start); + memcpy(dsk_start, bnd->start, len); + + /* Finalize the header information and write the page. 
*/ + dsk->recno = bnd->recno; + dsk->u.entries = bnd->entries; + dsk->mem_size = + tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len; + WT_ERR(__rec_split_write(session, r, bnd, tmp, 0)); + } + + /* + * There is probably a remnant in the working buffer that didn't get + * written; copy it down to the beginning of the working buffer, and + * update the starting record number. + * + * Confirm the remnant is no larger than the available split buffer. + * + * Fix up our caller's information. + */ + len = WT_PTRDIFF32(r->first_free, bnd->start); + if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) + WT_PANIC_ERR(session, EINVAL, + "Reconciliation remnant too large for the split buffer"); + + dsk = r->dsk.mem; + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + (void)memmove(dsk_start, bnd->start, len); + + r->entries -= r->total_entries; + r->first_free = dsk_start + len; + r->space_avail = + (r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - len; + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __rec_split_write -- + * Write a disk block out for the split helper functions. + */ +static int +__rec_split_write(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_BOUNDARY *bnd, WT_ITEM *buf, int last_block) +{ + WT_BTREE *btree; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_MULTI *multi; + WT_PAGE *page; + WT_PAGE_HEADER *dsk; + WT_PAGE_MODIFY *mod; + WT_UPD_SKIPPED *skip; + size_t addr_size; + uint32_t bnd_slot, i, j; + int cmp; + uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; + + btree = S2BT(session); + dsk = buf->mem; + page = r->page; + mod = page->modify; + + WT_RET(__wt_scr_alloc(session, 0, &key)); + + /* Set the zero-length value flag in the page header. */ + if (dsk->type == WT_PAGE_ROW_LEAF) { + F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE); + + if (r->entries != 0 && r->all_empty_value) + F_SET(dsk, WT_PAGE_EMPTY_V_ALL); + if (r->entries != 0 && !r->any_empty_value) + F_SET(dsk, WT_PAGE_EMPTY_V_NONE); + } + + /* Initialize the address (set the page type for the parent). */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + bnd->addr.type = WT_ADDR_LEAF_NO; + break; + case WT_PAGE_COL_VAR: + case WT_PAGE_ROW_LEAF: + bnd->addr.type = r->ovfl_items ? WT_ADDR_LEAF : WT_ADDR_LEAF_NO; + break; + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + bnd->addr.type = WT_ADDR_INT; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + bnd->size = (uint32_t)buf->size; + bnd->cksum = 0; + + /* + * Check if we've skipped updates that belong to this block, and move + * any to the per-block structure. Quit as soon as we find a skipped + * update that doesn't belong to the block, they're in sorted order. + * + * This code requires a key be filled in for the next block (or the + * last block flag be set, if there's no next block). + */ + for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) { + /* The last block gets all remaining skipped updates. */ + if (last_block) { + WT_ERR(__rec_skip_update_move(session, bnd, skip)); + continue; + } + + /* + * Get the skipped update's key and compare it with this block's + * key range. If the skipped update list belongs with the block + * we're about to write, move it to the per-block memory. Check + * only to the first update that doesn't go with the block, they + * must be in sorted order. 
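+ * (Illustrative: in a column store, a skipped update for record 1500 is + * moved to the block whose successor starts at record 2000; the first + * skipped update at or past record 2000 ends the scan.)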
+ */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno) + goto skip_check_complete; + break; + case WT_PAGE_ROW_LEAF: + if (skip->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, page, skip->rip, key, 0)); + else { + key->data = WT_INSERT_KEY(skip->ins); + key->size = WT_INSERT_KEY_SIZE(skip->ins); + } + WT_ERR(__wt_compare(session, + btree->collator, key, &(bnd + 1)->key, &cmp)); + if (cmp >= 0) + goto skip_check_complete; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + WT_ERR(__rec_skip_update_move(session, bnd, skip)); + } + +skip_check_complete: + /* + * If there are updates that weren't moved to the block, shuffle them to + * the beginning of the cached list (we maintain the skipped updates in + * sorted order, new skipped updates must be appended to the list). + */ + for (j = 0; i < r->skip_next; ++j, ++i) + r->skip[j] = r->skip[i]; + r->skip_next = j; + + /* + * If we had to skip updates in order to build this disk image, we can't + * actually write it. Instead, we will re-instantiate the page using the + * disk image and the list of updates we skipped. + */ + if (bnd->skip != NULL) { + /* + * If the buffer is compressed (raw compression was configured), + * we have to decompress it so we can instantiate it later. It's + * a slow and convoluted path, but it's also a rare one and it's + * not worth making it faster. Else, the disk image is ready, + * copy it into place for later. It's possible the disk image + * has no items; we have to flag that for verification, it's a + * special case since read/writing empty pages isn't generally + * allowed. + */ + if (bnd->already_compressed) + WT_ERR(__rec_raw_decompress( + session, buf->data, buf->size, &bnd->dsk)); + else { + WT_ERR(__wt_strndup( + session, buf->data, buf->size, &bnd->dsk)); + WT_ASSERT(session, __wt_verify_dsk_image(session, + "[evict split]", buf->data, buf->size, 1) == 0); + } + goto done; + } + + /* + * If we wrote this block before, re-use it. Pages get written in the + * same block order every time, only check the appropriate slot. The + * expensive part of this test is the checksum, only do that work when + * there has been or will be a reconciliation of this page involving + * split pages. This test isn't perfect: we're doing a checksum if a + * previous reconciliation of the page split or if we will split this + * time, but that test won't calculate a checksum on the first block + * the first time the page splits. + */ + bnd_slot = (uint32_t)(bnd - r->bnd); + if (bnd_slot > 1 || + (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) && mod->mod_multi != NULL)) { + /* + * There are page header fields which need to be cleared to get + * consistent checksums: specifically, the write generation and + * the memory owned by the block manager. We are reusing the + * same buffer space each time, clear it before calculating the + * checksum. 
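+ * (Illustrative: if this is the third block of the split and the previous + * reconciliation's third block has the same size and checksum, its address + * is simply reused and no write is issued for this block.)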
+ */ + dsk->write_gen = 0; + memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header); + bnd->cksum = __wt_cksum(buf->data, buf->size); + + if (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) && + mod->mod_multi_entries > bnd_slot) { + multi = &mod->mod_multi[bnd_slot]; + if (multi->size == bnd->size && + multi->cksum == bnd->cksum) { + multi->addr.reuse = 1; + bnd->addr = multi->addr; + + WT_STAT_FAST_DATA_INCR(session, rec_page_match); + goto done; + } + } + } + + WT_ERR(__wt_bt_write(session, + buf, addr, &addr_size, 0, bnd->already_compressed)); + WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr)); + bnd->addr.size = (uint8_t)addr_size; + +done: +err: __wt_scr_free(&key); + return (ret); +} + +/* + * __wt_bulk_init -- + * Bulk insert initialization. + */ +int +__wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_PAGE_INDEX *pindex; + WT_RECONCILE *r; + uint64_t recno; + + btree = S2BT(session); + /* + * Bulk-load is only permitted on newly created files, not any empty + * file -- see the checkpoint code for a discussion. + */ + if (!btree->bulk_load_ok) + WT_RET_MSG(session, EINVAL, + "bulk-load is only possible for newly created trees"); + + /* Set a reference to the empty leaf page. */ + pindex = WT_INTL_INDEX_COPY(btree->root.page); + cbulk->ref = pindex->index[0]; + cbulk->leaf = cbulk->ref->page; + + WT_RET( + __rec_write_init(session, cbulk->ref, 0, NULL, &cbulk->reconcile)); + r = cbulk->reconcile; + r->is_bulk_load = 1; + + switch (btree->type) { + case BTREE_COL_FIX: + case BTREE_COL_VAR: + recno = 1; + break; + case BTREE_ROW: + recno = 0; + break; + WT_ILLEGAL_VALUE(session); + } + + return (__rec_split_init( + session, r, cbulk->leaf, recno, btree->maxleafpage)); +} + +/* + * __wt_bulk_wrapup -- + * Bulk insert cleanup. + */ +int +__wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_PAGE *parent; + WT_RECONCILE *r; + + r = cbulk->reconcile; + btree = S2BT(session); + + switch (btree->type) { + case BTREE_COL_FIX: + if (cbulk->entry != 0) + __rec_incr(session, r, cbulk->entry, + __bitstr_size( + (size_t)cbulk->entry * btree->bitcnt)); + break; + case BTREE_COL_VAR: + if (cbulk->rle != 0) + WT_RET(__wt_bulk_insert_var(session, cbulk)); + break; + case BTREE_ROW: + break; + WT_ILLEGAL_VALUE(session); + } + + WT_RET(__rec_split_finish(session, r)); + WT_RET(__rec_write_wrapup(session, r, r->page)); + + /* Mark the page's parent dirty. */ + parent = r->ref->home; + WT_RET(__wt_page_modify_init(session, parent)); + __wt_page_modify_set(session, parent); + + __rec_destroy(session, &cbulk->reconcile); + + return (0); +} + +/* + * __wt_bulk_insert_row -- + * Row-store bulk insert. + */ +int +__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_KV *key, *val; + WT_RECONCILE *r; + int ovfl_key; + + r = cbulk->reconcile; + btree = S2BT(session); + cursor = &cbulk->cbt.iface; + + key = &r->k; + val = &r->v; + WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */ + cursor->key.data, cursor->key.size, &ovfl_key)); + WT_RET(__rec_cell_build_val(session, r, /* Build value cell */ + cursor->value.data, cursor->value.size, (uint64_t)0)); + + /* Boundary: split or write the page. 
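+ * (A new page's first key is always stored in full: prefix
+ * compression is relative to the previous key on the same page.
+ * With invented keys, "keyhole" following "keyboard" can be
+ * stored as a 3-byte prefix count plus "hole", but "keyhole" as
+ * the first entry of a fresh page has no previous key to
+ * reference, hence the rebuild below after crossing a split.)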
*/ + while (key->len + val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else { + WT_RET(__rec_split(session, r)); + + /* + * Turn off prefix compression until a full key written + * to the new page, and (unless we're already working + * with an overflow key), rebuild the key without prefix + * compression. + */ + if (r->key_pfx_compress_conf) { + r->key_pfx_compress = 0; + if (!ovfl_key) + WT_RET(__rec_cell_build_leaf_key( + session, r, NULL, 0, &ovfl_key)); + } + } + + /* Copy the key/value pair onto the page. */ + __rec_copy_incr(session, r, key); + if (val->len == 0) + r->any_empty_value = 1; + else { + r->all_empty_value = 0; + if (btree->dictionary) + WT_RET(__rec_dict_replace(session, r, 0, val)); + __rec_copy_incr(session, r, val); + } + + /* Update compression state. */ + __rec_key_state_update(r, ovfl_key); + + return (0); +} + +/* + * __rec_col_fix_bulk_insert_split_check -- + * Check if a bulk-loaded fixed-length column store page needs to split. + */ +static inline int +__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_RECONCILE *r; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; + r = cbulk->reconcile; + btree = S2BT(session); + + if (cbulk->entry == cbulk->nrecs) { + if (cbulk->entry != 0) { + /* + * If everything didn't fit, update the counters and + * split. + * + * Boundary: split or write the page. + */ + __rec_incr(session, r, cbulk->entry, + __bitstr_size( + (size_t)cbulk->entry * btree->bitcnt)); + WT_RET(__rec_split(session, r)); + } + cbulk->entry = 0; + cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); + } + return (0); +} + +/* + * __wt_bulk_insert_fix -- + * Fixed-length column-store bulk insert. + */ +int +__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_RECONCILE *r; + uint32_t entries, offset, page_entries, page_size; + const uint8_t *data; + + r = cbulk->reconcile; + btree = S2BT(session); + cursor = &cbulk->cbt.iface; + + if (cbulk->bitmap) { + if (((r->recno - 1) * btree->bitcnt) & 0x7) + WT_RET_MSG(session, EINVAL, + "Bulk bitmap load not aligned on a byte boundary"); + for (data = cursor->value.data, + entries = (uint32_t)cursor->value.size; + entries > 0; + entries -= page_entries, data += page_size) { + WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); + + page_entries = + WT_MIN(entries, cbulk->nrecs - cbulk->entry); + page_size = __bitstr_size(page_entries * btree->bitcnt); + offset = __bitstr_size(cbulk->entry * btree->bitcnt); + memcpy(r->first_free + offset, data, page_size); + cbulk->entry += page_entries; + r->recno += page_entries; + } + return (0); + } + + WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); + + __bit_setv(r->first_free, + cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]); + ++cbulk->entry; + ++r->recno; + + return (0); +} + +/* + * __wt_bulk_insert_var -- + * Variable-length column-store bulk insert. + */ +int +__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_KV *val; + WT_RECONCILE *r; + + r = cbulk->reconcile; + btree = S2BT(session); + + /* + * Store the bulk cursor's last buffer, not the current value, we're + * creating a duplicate count, which means we want the previous value + * seen, not the current value. 
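+ * For example (invented values): bulk-loading A, A, A, B means
+ * that when B arrives, cbulk->last is A and cbulk->rle is 3, so
+ * the cell written here is A with a repeat count of 3; B then
+ * becomes the new "last" value with a count of 1.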
+ */ + val = &r->v; + WT_RET(__rec_cell_build_val( + session, r, cbulk->last.data, cbulk->last.size, cbulk->rle)); + + /* Boundary: split or write the page. */ + while (val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else + WT_RET(__rec_split(session, r)); + + /* Copy the value onto the page. */ + if (btree->dictionary) + WT_RET(__rec_dict_replace(session, r, cbulk->rle, val)); + __rec_copy_incr(session, r, val); + + /* Update the starting record number in case we split. */ + r->recno += cbulk->rle; + + return (0); +} + +/* + * __rec_vtype -- + * Return a value cell's address type. + */ +static inline u_int +__rec_vtype(WT_ADDR *addr) +{ + if (addr->type == WT_ADDR_INT) + return (WT_CELL_ADDR_INT); + if (addr->type == WT_ADDR_LEAF) + return (WT_CELL_ADDR_LEAF); + return (WT_CELL_ADDR_LEAF_NO); +} + +/* + * __rec_col_int -- + * Reconcile a column-store internal page. + */ +static int +__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_ADDR *addr; + WT_BTREE *btree; + WT_CELL_UNPACK *vpack, _vpack; + WT_DECL_RET; + WT_KV *val; + WT_PAGE *child; + WT_REF *ref; + int hazard, state; + + btree = S2BT(session); + child = NULL; + hazard = 0; + + val = &r->v; + vpack = &_vpack; + + WT_RET(__rec_split_init( + session, r, page, page->pg_intl_recno, btree->maxintlpage)); + + /* For each entry in the in-memory page... */ + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* Update the starting record number in case we split. */ + r->recno = ref->key.recno; + + /* + * Modified child. + * The page may be emptied or internally created during a split. + * Deleted/split pages are merged into the parent and discarded. + */ + WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); + addr = NULL; + child = ref->page; + if (state != 0) { + /* + * Currently the only non-zero returned stated possible + * for a column-store page is child-modified (all other + * states are part of the fast-truncate support, which + * is row-store only). + */ + WT_ASSERT(session, state == WT_CHILD_MODIFIED); + + switch (F_ISSET(child->modify, WT_PM_REC_MASK)) { + case WT_PM_REC_EMPTY: + /* + * Column-store pages are almost never empty, as + * discarding a page would remove a chunk of the + * name space. The exceptions are pages created + * when the tree is created, and never filled. + */ + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_MULTIBLOCK: + WT_ERR(__rec_col_merge(session, r, child)); + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_REPLACE: + addr = &child->modify->mod_replace; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + } + + /* + * Build the value cell. The child page address is in one of 3 + * places: if the page was replaced, the page's modify structure + * references it and we built the value cell just above in the + * switch statement. Else, the WT_REF->addr reference points to + * an on-page cell or an off-page WT_ADDR structure: if it's an + * on-page cell and we copy it from the page, else build a new + * cell. + */ + if (addr == NULL && __wt_off_page(page, ref->addr)) + addr = ref->addr; + if (addr == NULL) { + __wt_cell_unpack(ref->addr, vpack); + val->buf.data = ref->addr; + val->buf.size = __wt_cell_total_len(vpack); + val->cell_len = 0; + val->len = val->buf.size; + } else + __rec_cell_build_addr(r, addr->addr, addr->size, + __rec_vtype(addr), ref->key.recno); + CHILD_RELEASE_ERR(session, hazard, ref); + + /* Boundary: split or write the page. 
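+ * (Column-store internal pages carry no key cells: the child's
+ * starting record number takes the place of a key, so only the
+ * value cell, the child's address, is checked against the space
+ * available.)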
*/ + while (val->len > r->space_avail) + if (r->raw_compression) + WT_ERR(__rec_split_raw(session, r)); + else + WT_ERR(__rec_split(session, r)); + + /* Copy the value onto the page. */ + __rec_copy_incr(session, r, val); + } WT_INTL_FOREACH_END; + + /* Write the remnant page. */ + return (__rec_split_finish(session, r)); + +err: CHILD_RELEASE(session, hazard, ref); + return (ret); +} + +/* + * __rec_col_merge -- + * Merge in a split page. + */ +static int +__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_ADDR *addr; + WT_KV *val; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + uint32_t i; + + mod = page->modify; + + val = &r->v; + + /* For each entry in the split array... */ + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + /* Update the starting record number in case we split. */ + r->recno = multi->key.recno; + + /* Build the value cell. */ + addr = &multi->addr; + __rec_cell_build_addr(r, + addr->addr, addr->size, __rec_vtype(addr), r->recno); + + /* Boundary: split or write the page. */ + while (val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else + WT_RET(__rec_split(session, r)); + + /* Copy the value onto the page. */ + __rec_copy_incr(session, r, val); + } + return (0); +} + +/* + * __rec_col_fix -- + * Reconcile a fixed-width, column-store leaf page. + */ +static int +__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_INSERT *ins; + WT_UPDATE *upd; + uint64_t recno; + uint32_t entry, nrecs; + + btree = S2BT(session); + + WT_RET(__rec_split_init( + session, r, page, page->pg_fix_recno, btree->maxleafpage)); + + /* Update any changes to the original on-page data items. */ + WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { + WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd != NULL) + __bit_setv_recno(page, WT_INSERT_RECNO(ins), + btree->bitcnt, ((uint8_t *)WT_UPDATE_DATA(upd))[0]); + } + + /* Copy the updated, disk-image bytes into place. */ + memcpy(r->first_free, page->pg_fix_bitf, + __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt)); + + /* Calculate the number of entries per page remainder. */ + entry = page->pg_fix_entries; + nrecs = WT_FIX_BYTES_TO_ENTRIES( + btree, r->space_avail) - page->pg_fix_entries; + r->recno += entry; + + /* Walk any append list. */ + WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) { + WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL) + continue; + for (;;) { + /* + * The application may have inserted records which left + * gaps in the name space. + */ + for (recno = WT_INSERT_RECNO(ins); + nrecs > 0 && r->recno < recno; + --nrecs, ++entry, ++r->recno) + __bit_setv( + r->first_free, entry, btree->bitcnt, 0); + + if (nrecs > 0) { + __bit_setv(r->first_free, entry, btree->bitcnt, + ((uint8_t *)WT_UPDATE_DATA(upd))[0]); + --nrecs; + ++entry; + ++r->recno; + break; + } + + /* + * If everything didn't fit, update the counters and + * split. + * + * Boundary: split or write the page. + */ + __rec_incr(session, r, entry, + __bitstr_size((size_t)entry * btree->bitcnt)); + WT_RET(__rec_split(session, r)); + + /* Calculate the number of entries per page. */ + entry = 0; + nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); + } + } + + /* Update the counters. */ + __rec_incr( + session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); + + /* Write the remnant page. 
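+ * (The entries above are bit-packed: with a btree->bitcnt of,
+ * say, 2, four records fit in each byte, so entry 5 occupies
+ * bits 10-11 of the page's bitfield; illustrative arithmetic
+ * only.)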
*/ + return (__rec_split_finish(session, r)); +} + +/* + * __rec_col_fix_slvg -- + * Reconcile a fixed-width, column-store leaf page created during salvage. + */ +static int +__rec_col_fix_slvg(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + WT_BTREE *btree; + uint64_t page_start, page_take; + uint32_t entry, nrecs; + + btree = S2BT(session); + + /* + * !!! + * It's vanishingly unlikely and probably impossible for fixed-length + * column-store files to have overlapping key ranges. It's possible + * for an entire key range to go missing (if a page is corrupted and + * lost), but because pages can't split, it shouldn't be possible to + * find pages where the key ranges overlap. That said, we check for + * it during salvage and clean up after it here because it doesn't + * cost much and future column-store formats or operations might allow + * for fixed-length format ranges to overlap during salvage, and I + * don't want to have to retrofit the code later. + */ + WT_RET(__rec_split_init( + session, r, page, page->pg_fix_recno, btree->maxleafpage)); + + /* We may not be taking all of the entries on the original page. */ + page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take; + page_start = salvage->skip == 0 ? 0 : salvage->skip; + + /* Calculate the number of entries per page. */ + entry = 0; + nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); + + for (; nrecs > 0 && salvage->missing > 0; + --nrecs, --salvage->missing, ++entry) + __bit_setv(r->first_free, entry, btree->bitcnt, 0); + + for (; nrecs > 0 && page_take > 0; + --nrecs, --page_take, ++page_start, ++entry) + __bit_setv(r->first_free, entry, btree->bitcnt, + __bit_getv(page->pg_fix_bitf, + (uint32_t)page_start, btree->bitcnt)); + + r->recno += entry; + __rec_incr(session, r, entry, + __bitstr_size((size_t)entry * btree->bitcnt)); + + /* + * We can't split during salvage -- if everything didn't fit, it's + * all gone wrong. + */ + if (salvage->missing != 0 || page_take != 0) + WT_PANIC_RET(session, WT_PANIC, + "%s page too large, attempted split during salvage", + __wt_page_type_string(page->type)); + + /* Write the page. */ + return (__rec_split_finish(session, r)); +} + +/* + * __rec_col_var_helper -- + * Create a column-store variable length record cell and write it onto a + * page. + */ +static int +__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, + WT_SALVAGE_COOKIE *salvage, + WT_ITEM *value, int deleted, uint8_t overflow_type, uint64_t rle) +{ + WT_BTREE *btree; + WT_KV *val; + + btree = S2BT(session); + + val = &r->v; + + /* + * Occasionally, salvage needs to discard records from the beginning or + * end of the page, and because the items may be part of a RLE cell, do + * the adjustments here. It's not a mistake we don't bother telling + * our caller we've handled all the records from the page we care about, + * and can quit processing the page: salvage is a rare operation and I + * don't want to complicate our caller's loop. 
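+ * A worked example with invented numbers: given an RLE cell of
+ * 10 records, salvage->skip of 3 and salvage->take of 5, the
+ * code below first trims the cell to 7 records (the skip is
+ * consumed), then to 5 records (the take is exhausted), and
+ * marks the salvage pass done.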
+ */ + if (salvage != NULL) { + if (salvage->done) + return (0); + if (salvage->skip != 0) { + if (rle <= salvage->skip) { + salvage->skip -= rle; + return (0); + } + rle -= salvage->skip; + salvage->skip = 0; + } + if (salvage->take != 0) { + if (rle <= salvage->take) + salvage->take -= rle; + else { + rle = salvage->take; + salvage->take = 0; + } + if (salvage->take == 0) + salvage->done = 1; + } + } + + if (deleted) { + val->cell_len = __wt_cell_pack_del(&val->cell, rle); + val->buf.data = NULL; + val->buf.size = 0; + val->len = val->cell_len; + } else if (overflow_type) { + val->cell_len = __wt_cell_pack_ovfl( + &val->cell, overflow_type, rle, value->size); + val->buf.data = value->data; + val->buf.size = value->size; + val->len = val->cell_len + value->size; + } else + WT_RET(__rec_cell_build_val( + session, r, value->data, value->size, rle)); + + /* Boundary: split or write the page. */ + while (val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else + WT_RET(__rec_split(session, r)); + + /* Copy the value onto the page. */ + if (!deleted && !overflow_type && btree->dictionary) + WT_RET(__rec_dict_replace(session, r, rle, val)); + __rec_copy_incr(session, r, val); + + /* Update the starting record number in case we split. */ + r->recno += rle; + + return (0); +} + +/* + * __rec_col_var -- + * Reconcile a variable-width column-store leaf page. + */ +static int +__rec_col_var(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *vpack, _vpack; + WT_COL *cip; + WT_DECL_ITEM(orig); + WT_DECL_RET; + WT_INSERT *ins; + WT_ITEM *last; + WT_UPDATE *upd; + uint64_t n, nrepeat, repeat_count, rle, src_recno; + uint32_t i, size; + int deleted, last_deleted, orig_deleted, update_no_copy; + const void *data; + + btree = S2BT(session); + last = r->last; + vpack = &_vpack; + + WT_RET(__wt_scr_alloc(session, 0, &orig)); + data = NULL; + size = 0; + upd = NULL; + + WT_RET(__rec_split_init( + session, r, page, page->pg_var_recno, btree->maxleafpage)); + + /* + * The salvage code may be calling us to reconcile a page where there + * were missing records in the column-store name space. If taking the + * first record from on the page, it might be a deleted record, so we + * have to give the RLE code a chance to figure that out. Else, if + * not taking the first record from the page, write a single element + * representing the missing records onto a new page. (Don't pass the + * salvage cookie to our helper function in this case, we're handling + * one of the salvage cookie fields on our own, and we don't need the + * helper function's assistance.) + */ + rle = 0; + last_deleted = 0; + if (salvage != NULL && salvage->missing != 0) { + if (salvage->skip == 0) { + rle = salvage->missing; + last_deleted = 1; + + /* + * Correct the number of records we're going to "take", + * pretending the missing records were on the page. + */ + salvage->take += salvage->missing; + } else + WT_ERR(__rec_col_var_helper( + session, r, NULL, NULL, 1, 0, salvage->missing)); + } + + /* + * We track two data items through this loop: the previous (last) item + * and the current item: if the last item is the same as the current + * item, we increment the RLE count for the last item; if the last item + * is different from the current item, we write the last item onto the + * page, and replace it with the current item. 
The r->recno counter + * tracks records written to the page, and is incremented by the helper + * function immediately after writing records to the page. The record + * number of our source record, that is, the current item, is maintained + * in src_recno. + */ + src_recno = r->recno + rle; + + /* For each entry in the in-memory page... */ + WT_COL_FOREACH(page, cip, i) { + ovfl_state = OVFL_IGNORE; + if ((cell = WT_COL_PTR(page, cip)) == NULL) { + nrepeat = 1; + ins = NULL; + orig_deleted = 1; + } else { + __wt_cell_unpack(cell, vpack); + nrepeat = __wt_cell_rle(vpack); + ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip)); + + /* + * If the original value is "deleted", there's no value + * to compare, we're done. + */ + orig_deleted = vpack->type == WT_CELL_DEL ? 1 : 0; + if (orig_deleted) + goto record_loop; + + /* + * Overflow items are tricky: we don't know until we're + * finished processing the set of values if we need the + * overflow value or not. If we don't use the overflow + * item at all, we have to discard it from the backing + * file, otherwise we'll leak blocks on the checkpoint. + * That's safe because if the backing overflow value is + * still needed by any running transaction, we'll cache + * a copy in the reconciliation tracking structures. + * + * Regardless, we avoid copying in overflow records: if + * there's a WT_INSERT entry that modifies a reference + * counted overflow record, we may have to write copies + * of the overflow record, and in that case we'll do the + * comparisons, but we don't read overflow items just to + * see if they match records on either side. + */ + if (vpack->ovfl) { + ovfl_state = OVFL_UNUSED; + goto record_loop; + } + + /* + * If data is Huffman encoded, we have to decode it in + * order to compare it with the last item we saw, which + * may have been an update string. This guarantees we + * find every single pair of objects we can RLE encode, + * including applications updating an existing record + * where the new value happens (?) to match a Huffman- + * encoded value in a previous or next record. + */ + WT_ERR(__wt_dsk_cell_data_ref( + session, WT_PAGE_COL_VAR, vpack, orig)); + } + +record_loop: /* + * Generate on-page entries: loop repeat records, looking for + * WT_INSERT entries matching the record number. The WT_INSERT + * lists are in sorted order, so only need check the next one. + */ + for (n = 0; + n < nrepeat; n += repeat_count, src_recno += repeat_count) { + upd = NULL; + if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) { + WT_ERR(__rec_txn_read( + session, r, ins, NULL, vpack, &upd)); + ins = WT_SKIP_NEXT(ins); + } + if (upd != NULL) { + update_no_copy = 1; /* No data copy */ + repeat_count = 1; /* Single record */ + + deleted = WT_UPDATE_DELETED_ISSET(upd); + if (!deleted) { + data = WT_UPDATE_DATA(upd); + size = upd->size; + } + } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) { + update_no_copy = 1; /* No data copy */ + repeat_count = 1; /* Single record */ + + deleted = 0; + + /* + * If doing update save and restore, there's an + * update that's not globally visible, and the + * underlying value is a removed overflow value, + * we end up here. + * + * When the update save/restore code noticed the + * removed overflow value, it appended a copy of + * the cached, original overflow value to the + * update list being saved (ensuring the on-page + * item will never be accessed after the page is + * re-instantiated), then returned a NULL update + * to us. 
+ * + * Assert the case: if we remove an underlying + * overflow object, checkpoint reconciliation + * should never see it again, there should be a + * visible update in the way. + * + * Write a placeholder. + */ + WT_ASSERT(session, + F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + + data = "@"; + size = 1; + } else { + update_no_copy = 0; /* Maybe data copy */ + + /* + * The repeat count is the number of records up + * to the next WT_INSERT record, or up to the + * end of the entry if we have no more WT_INSERT + * records. + */ + if (ins == NULL) + repeat_count = nrepeat - n; + else + repeat_count = + WT_INSERT_RECNO(ins) - src_recno; + + deleted = orig_deleted; + if (deleted) + goto compare; + + /* + * If we are handling overflow items, use the + * overflow item itself exactly once, after + * which we have to copy it into a buffer and + * from then on use a complete copy because we + * are re-creating a new overflow record each + * time. + */ + switch (ovfl_state) { + case OVFL_UNUSED: + /* + * An as-yet-unused overflow item. + * + * We're going to copy the on-page cell, + * write out any record we're tracking. + */ + if (rle != 0) { + WT_ERR(__rec_col_var_helper( + session, r, salvage, last, + last_deleted, 0, rle)); + rle = 0; + } + + last->data = vpack->data; + last->size = vpack->size; + WT_ERR(__rec_col_var_helper( + session, r, salvage, last, 0, + WT_CELL_VALUE_OVFL, repeat_count)); + + /* Track if page has overflow items. */ + r->ovfl_items = 1; + + ovfl_state = OVFL_USED; + continue; + case OVFL_USED: + /* + * Original is an overflow item; we used + * it for a key and now we need another + * copy; read it into memory. + */ + WT_ERR(__wt_dsk_cell_data_ref(session, + WT_PAGE_COL_VAR, vpack, orig)); + + ovfl_state = OVFL_IGNORE; + /* FALLTHROUGH */ + case OVFL_IGNORE: + /* + * Original is an overflow item and we + * were forced to copy it into memory, + * or the original wasn't an overflow + * item; use the data copied into orig. + */ + data = orig->data; + size = (uint32_t)orig->size; + break; + } + } + +compare: /* + * If we have a record against which to compare, and + * the records compare equal, increment the rle counter + * and continue. If the records don't compare equal, + * output the last record and swap the last and current + * buffers: do NOT update the starting record number, + * we've been doing that all along. + */ + if (rle != 0) { + if ((deleted && last_deleted) || + (!last_deleted && !deleted && + last->size == size && + memcmp(last->data, data, size) == 0)) { + rle += repeat_count; + continue; + } + WT_ERR(__rec_col_var_helper(session, r, + salvage, last, last_deleted, 0, rle)); + } + + /* + * Swap the current/last state. + * + * Reset RLE counter and turn on comparisons. + */ + if (!deleted) { + /* + * We can't simply assign the data values into + * the last buffer because they may have come + * from a copy built from an encoded/overflow + * cell and creating the next record is going + * to overwrite that memory. Check, because + * encoded/overflow cells aren't that common + * and we'd like to avoid the copy. If data + * was taken from the current unpack structure + * (which points into the page), or was taken + * from an update structure, we can just use + * the pointers, they're not moving. 
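+ * (For example, a pointer into the disk page through the unpack
+ * structure, or into a WT_UPDATE record, stays valid for the
+ * whole reconciliation; a pointer into orig does not, because
+ * orig is refilled the next time an encoded or overflow cell is
+ * expanded.)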
+ */ + if (data == vpack->data || update_no_copy) { + last->data = data; + last->size = size; + } else + WT_ERR(__wt_buf_set( + session, last, data, size)); + } + last_deleted = deleted; + rle = repeat_count; + } + + /* + * If we had a reference to an overflow record we never used, + * discard the underlying blocks, they're no longer useful. + * + * One complication: we must cache a copy before discarding the + * on-disk version if there's a transaction in the system that + * might read the original value. + */ + if (ovfl_state == OVFL_UNUSED && + vpack->raw != WT_CELL_VALUE_OVFL_RM) + WT_ERR(__wt_ovfl_cache(session, page, upd, vpack)); + } + + /* Walk any append list. */ + WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) { + WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL) + continue; + for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) { + /* + * The application may have inserted records which left + * gaps in the name space. + */ + if (src_recno < n) + deleted = 1; + else { + deleted = WT_UPDATE_DELETED_ISSET(upd); + if (!deleted) { + data = WT_UPDATE_DATA(upd); + size = upd->size; + } + } + + /* + * Handle RLE accounting and comparisons -- see comment + * above, this code fragment does the same thing. + */ + if (rle != 0) { + if ((deleted && last_deleted) || + (!last_deleted && !deleted && + last->size == size && + memcmp(last->data, data, size) == 0)) { + ++rle; + continue; + } + WT_ERR(__rec_col_var_helper(session, r, + salvage, last, last_deleted, 0, rle)); + } + + /* + * Swap the current/last state. We always assign the + * data values to the buffer because they can only be + * the data from a WT_UPDATE structure. + * + * Reset RLE counter and turn on comparisons. + */ + if (!deleted) { + last->data = data; + last->size = size; + } + last_deleted = deleted; + rle = 1; + } + } + + /* If we were tracking a record, write it. */ + if (rle != 0) + WT_ERR(__rec_col_var_helper( + session, r, salvage, last, last_deleted, 0, rle)); + + /* Write the remnant page. */ + ret = __rec_split_finish(session, r); + +err: __wt_scr_free(&orig); + return (ret); +} + +/* + * __rec_row_int -- + * Reconcile a row-store internal page. + */ +static int +__rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_ADDR *addr; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; + WT_DECL_RET; + WT_IKEY *ikey; + WT_KV *key, *val; + WT_PAGE *child; + WT_REF *ref; + size_t size; + u_int vtype; + int hazard, key_onpage_ovfl, ovfl_key, state; + const void *p; + + btree = S2BT(session); + child = NULL; + hazard = 0; + + key = &r->k; + kpack = &_kpack; + WT_CLEAR(*kpack); /* -Wuninitialized */ + val = &r->v; + vpack = &_vpack; + WT_CLEAR(*vpack); /* -Wuninitialized */ + + WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxintlpage)); + + /* + * Ideally, we'd never store the 0th key on row-store internal pages + * because it's never used during tree search and there's no reason + * to waste the space. The problem is how we do splits: when we split, + * we've potentially picked out several "split points" in the buffer + * which is overflowing the maximum page size, and when the overflow + * happens, we go back and physically split the buffer, at those split + * points, into new pages. It would be both difficult and expensive + * to re-process the 0th key at each split point to be an empty key, + * so we don't do that. However, we are reconciling an internal page + * for whatever reason, and the 0th key is known to be useless. 
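+ * (For example, a search descending through this page has
+ * already decided the target key sorts greater than or equal to
+ * the 0th child's range, so comparisons begin with the 1st key
+ * and the 0th key's bytes are never read.)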
We + * truncate the key to a single byte, instead of removing it entirely, + * it simplifies various things in other parts of the code (we don't + * have to special case transforming the page from its disk image to + * its in-memory version, for example). + */ + r->cell_zero = 1; + + /* For each entry in the in-memory page... */ + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* + * There are different paths if the key is an overflow item vs. + * a straight-forward on-page value. If an overflow item, we + * would have instantiated it, and we can use that fact to set + * things up. + * + * Note the cell reference and unpacked key cell are available + * only in the case of an instantiated, off-page key. + */ + ikey = __wt_ref_key_instantiated(ref); + if (ikey == NULL || ikey->cell_offset == 0) { + cell = NULL; + key_onpage_ovfl = 0; + } else { + cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); + __wt_cell_unpack(cell, kpack); + key_onpage_ovfl = + kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM; + } + + WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); + addr = ref->addr; + child = ref->page; + vtype = 0; + + /* Deleted child we don't have to write. */ + if (state == WT_CHILD_IGNORE) { + /* + * Overflow keys referencing discarded pages are no + * longer useful, schedule them for discard. Don't + * worry about instantiation, internal page keys are + * always instantiated. Don't worry about reuse, + * reusing this key in this reconciliation is unlikely. + */ + if (key_onpage_ovfl) + WT_ERR(__wt_ovfl_discard_add( + session, page, kpack->cell)); + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + } + + /* Deleted child requiring a proxy cell. */ + if (state == WT_CHILD_PROXY) + vtype = WT_CELL_ADDR_DEL; + + /* + * Modified child. Empty pages are merged into the parent and + * discarded. + */ + if (state == WT_CHILD_MODIFIED) + switch (F_ISSET(child->modify, WT_PM_REC_MASK)) { + case WT_PM_REC_EMPTY: + /* + * Overflow keys referencing empty pages are no + * longer useful, schedule them for discard. + * Don't worry about instantiation, internal + * page keys are always instantiated. Don't + * worry about reuse, reusing this key in this + * reconciliation is unlikely. + */ + if (key_onpage_ovfl) + WT_ERR(__wt_ovfl_discard_add( + session, page, kpack->cell)); + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_MULTIBLOCK: + /* + * Overflow keys referencing split pages are no + * longer useful (the split page's key is the + * interesting key); schedule them for discard. + * Don't worry about instantiation, internal + * page keys are always instantiated. Don't + * worry about reuse, reusing this key in this + * reconciliation is unlikely. + */ + if (key_onpage_ovfl) + WT_ERR(__wt_ovfl_discard_add( + session, page, kpack->cell)); + + WT_ERR(__rec_row_merge(session, r, child)); + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_REPLACE: + /* + * If the page is replaced, the page's modify + * structure has the page's address. + */ + addr = &child->modify->mod_replace; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* + * Build the value cell, the child page's address. Addr points + * to an on-page cell or an off-page WT_ADDR structure. The + * cell type has been set in the case of page deletion requiring + * a proxy cell, otherwise use the information from the addr or + * original cell. 
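+ * In outline, a sketch of the cases below (no additional logic):
+ *
+ * off-page WT_ADDR: p/size from the WT_ADDR, type from
+ * __rec_vtype(addr)
+ * on-page cell: p/size and type taken from the unpacked cell
+ * deleted child: type already forced to WT_CELL_ADDR_DEL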
+ */ + if (__wt_off_page(page, addr)) { + p = addr->addr; + size = addr->size; + if (vtype == 0) + vtype = __rec_vtype(addr); + } else { + __wt_cell_unpack(ref->addr, vpack); + p = vpack->data; + size = vpack->size; + if (vtype == 0) + vtype = vpack->raw; + } + __rec_cell_build_addr(r, p, size, vtype, 0); + CHILD_RELEASE_ERR(session, hazard, ref); + + /* + * Build key cell. + * Truncate any 0th key, internal pages don't need 0th keys. + */ + if (key_onpage_ovfl) { + key->buf.data = cell; + key->buf.size = __wt_cell_total_len(kpack); + key->cell_len = 0; + key->len = key->buf.size; + ovfl_key = 1; + } else { + __wt_ref_key(page, ref, &p, &size); + WT_ERR(__rec_cell_build_int_key( + session, r, p, r->cell_zero ? 1 : size, &ovfl_key)); + } + r->cell_zero = 0; + + /* Boundary: split or write the page. */ + while (key->len + val->len > r->space_avail) { + if (r->raw_compression) { + WT_ERR(__rec_split_raw(session, r)); + continue; + } + + /* + * In one path above, we copied address blocks from the + * page rather than building the actual key. In that + * case, we have to build the actual key now because we + * are about to promote it. + */ + if (key_onpage_ovfl) { + WT_ERR(__wt_buf_set(session, + r->cur, WT_IKEY_DATA(ikey), ikey->size)); + key_onpage_ovfl = 0; + } + WT_ERR(__rec_split(session, r)); + } + + /* Copy the key and value onto the page. */ + __rec_copy_incr(session, r, key); + __rec_copy_incr(session, r, val); + + /* Update compression state. */ + __rec_key_state_update(r, ovfl_key); + } WT_INTL_FOREACH_END; + + /* Write the remnant page. */ + return (__rec_split_finish(session, r)); + +err: CHILD_RELEASE(session, hazard, ref); + return (ret); +} + +/* + * __rec_row_merge -- + * Merge in a split page. + */ +static int +__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_ADDR *addr; + WT_KV *key, *val; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + uint32_t i; + int ovfl_key; + + mod = page->modify; + + key = &r->k; + val = &r->v; + + /* For each entry in the split array... */ + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + /* Build the key and value cells. */ + WT_RET(__rec_cell_build_int_key(session, r, + WT_IKEY_DATA(multi->key.ikey), + r->cell_zero ? 1 : multi->key.ikey->size, &ovfl_key)); + r->cell_zero = 0; + + addr = &multi->addr; + __rec_cell_build_addr( + r, addr->addr, addr->size, __rec_vtype(addr), 0); + + /* Boundary: split or write the page. */ + while (key->len + val->len > r->space_avail) + if (r->raw_compression) + WT_RET(__rec_split_raw(session, r)); + else + WT_RET(__rec_split(session, r)); + + /* Copy the key and value onto the page. */ + __rec_copy_incr(session, r, key); + __rec_copy_incr(session, r, val); + + /* Update compression state. */ + __rec_key_state_update(r, ovfl_key); + } + return (0); +} + +/* + * __rec_row_leaf -- + * Reconcile a row-store leaf page. + */ +static int +__rec_row_leaf(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + WT_BTREE *btree; + WT_CELL *cell, *val_cell; + WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; + WT_DECL_ITEM(tmpkey); + WT_DECL_ITEM(tmpval); + WT_DECL_RET; + WT_IKEY *ikey; + WT_INSERT *ins; + WT_KV *key, *val; + WT_ROW *rip; + WT_UPDATE *upd; + size_t size; + uint64_t slvg_skip; + uint32_t i; + int dictionary, onpage_ovfl, ovfl_key; + const void *p; + void *copy; + + btree = S2BT(session); + slvg_skip = salvage == NULL ? 
0 : salvage->skip; + + key = &r->k; + val = &r->v; + + WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxleafpage)); + + /* + * Write any K/V pairs inserted into the page before the first from-disk + * key on the page. + */ + if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) + WT_RET(__rec_row_leaf_insert(session, r, ins)); + + /* + * Temporary buffers in which to instantiate any uninstantiated keys + * or value items we need. + */ + WT_RET(__wt_scr_alloc(session, 0, &tmpkey)); + WT_RET(__wt_scr_alloc(session, 0, &tmpval)); + + /* For each entry in the page... */ + WT_ROW_FOREACH(page, rip, i) { + /* + * The salvage code, on some rare occasions, wants to reconcile + * a page but skip some leading records on the page. Because + * the row-store leaf reconciliation function copies keys from + * the original disk page, this is non-trivial -- just changing + * the in-memory pointers isn't sufficient, we have to change + * the WT_CELL structures on the disk page, too. It's ugly, but + * we pass in a value that tells us how many records to skip in + * this case. + */ + if (slvg_skip != 0) { + --slvg_skip; + continue; + } + + /* + * Figure out the key: set any cell reference (and unpack it), + * set any instantiated key reference. + */ + copy = WT_ROW_KEY_COPY(rip); + (void)__wt_row_leaf_key_info( + page, copy, &ikey, &cell, NULL, NULL); + if (cell == NULL) + kpack = NULL; + else { + kpack = &_kpack; + __wt_cell_unpack(cell, kpack); + } + + /* Unpack the on-page value cell, and look for an update. */ + if ((val_cell = + __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) + vpack = NULL; + else { + vpack = &_vpack; + __wt_cell_unpack(val_cell, vpack); + } + WT_ERR(__rec_txn_read(session, r, NULL, rip, vpack, &upd)); + + /* Build value cell. */ + dictionary = 0; + if (upd == NULL) { + /* + * When the page was read into memory, there may not + * have been a value item. + * + * If there was a value item, check if it's a dictionary + * cell (a copy of another item on the page). If it's a + * copy, we have to create a new value item as the old + * item might have been discarded from the page. + */ + if (vpack == NULL) { + val->buf.data = NULL; + val->cell_len = val->len = val->buf.size = 0; + } else if (vpack->raw == WT_CELL_VALUE_COPY) { + /* If the item is Huffman encoded, decode it. */ + if (btree->huffman_value == NULL) { + p = vpack->data; + size = vpack->size; + } else { + WT_ERR(__wt_huffman_decode(session, + btree->huffman_value, + vpack->data, vpack->size, + tmpval)); + p = tmpval->data; + size = tmpval->size; + } + WT_ERR(__rec_cell_build_val( + session, r, p, size, (uint64_t)0)); + dictionary = 1; + } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) { + /* + * If doing update save and restore in service + * of eviction, there's an update that's not + * globally visible, and the underlying value + * is a removed overflow value, we end up here. + * + * When the update save/restore code noticed the + * removed overflow value, it appended a copy of + * the cached, original overflow value to the + * update list being saved (ensuring any on-page + * item will never be accessed after the page is + * re-instantiated), then returned a NULL update + * to us. + * + * Assert the case. + */ + WT_ASSERT(session, + F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + + /* + * If the key is also a removed overflow item, + * don't write anything at all. 
+ *
+ * We don't have to write anything because the
+ * code re-instantiating the page gets the key
+ * to match the saved list of updates from the
+ * original page. By not putting the key on
+ * the page, we'll move the key/value set from
+ * a row-store leaf page slot to an insert list,
+ * but that shouldn't matter.
+ *
+ * The reason we bother with the test is because
+ * overflows are expensive to write. It's hard
+ * to imagine a real workload where this test is
+ * worth the effort, but it's a simple test.
+ */
+ if (kpack != NULL &&
+ kpack->raw == WT_CELL_KEY_OVFL_RM)
+ goto leaf_insert;
+
+ /*
+ * The on-page value will never be accessed,
+ * write a placeholder record.
+ */
+ WT_ERR(__rec_cell_build_val(
+ session, r, "@", 1, (uint64_t)0));
+ } else {
+ val->buf.data = val_cell;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+
+ /* Track if page has overflow items. */
+ if (vpack->ovfl)
+ r->ovfl_items = 1;
+ }
+ } else {
+ /*
+ * If the original value was an overflow and we've not
+ * already done so, discard it. One complication: we
+ * must cache a copy before discarding the on-disk
+ * version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (vpack != NULL &&
+ vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(
+ __wt_ovfl_cache(session, page, rip, vpack));
+
+ /* If this key/value pair was deleted, we're done. */
+ if (WT_UPDATE_DELETED_ISSET(upd)) {
+ /*
+ * Overflow keys referencing discarded values
+ * are no longer useful, discard the backing
+ * blocks. Don't worry about reuse, reusing
+ * keys from a row-store page reconciliation
+ * seems unlikely enough to ignore.
+ */
+ if (kpack != NULL && kpack->ovfl &&
+ kpack->raw != WT_CELL_KEY_OVFL_RM) {
+ /*
+ * Keys are part of the name-space, we
+ * can't remove them from the in-memory
+ * tree; if an overflow key was deleted
+ * without being instantiated (for
+ * example, cursor-based truncation), do
+ * it now.
+ */
+ if (ikey == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session,
+ page, rip, tmpkey, 1));
+
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ }
+
+ /*
+ * We aren't actually creating the key so we
+ * can't use bytes from this key to provide
+ * prefix information for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Proceed with appended key/value pairs. */
+ goto leaf_insert;
+ }
+
+ /*
+ * If no value, nothing needs to be copied. Otherwise,
+ * build the value's WT_CELL chunk from the most recent
+ * update value.
+ */
+ if (upd->size == 0) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else {
+ WT_ERR(__rec_cell_build_val(session, r,
+ WT_UPDATE_DATA(upd), upd->size,
+ (uint64_t)0));
+ dictionary = 1;
+ }
+ }
+
+ /*
+ * Build key cell.
+ *
+ * If the key is an overflow key that hasn't been removed, use
+ * the original backing blocks.
+ */
+ onpage_ovfl = kpack != NULL &&
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ if (onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+
+ /*
+ * We aren't creating a key so we can't use this key as
+ * a prefix for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Track if page has overflow items. 
*/
+ r->ovfl_items = 1;
+ } else {
+ /*
+ * Get the key from the page or an instantiated key, or
+ * inline building the key from a previous key (it's a
+ * fast path for simple, prefix-compressed keys), or
+ * by building the key from scratch.
+ */
+ if (__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &tmpkey->data, &tmpkey->size))
+ goto build;
+
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ if (btree->huffman_key == NULL &&
+ kpack->type == WT_CELL_KEY &&
+ tmpkey->size >= kpack->prefix) {
+ /*
+ * The previous clause checked for a prefix of
+ * zero, which means the temporary buffer must
+ * have a non-zero size, and it references a
+ * valid key.
+ */
+ WT_ASSERT(session, tmpkey->size != 0);
+
+ /*
+ * Grow the buffer as necessary, ensuring
+ * data has been copied into local buffer space,
+ * then append the suffix to the prefix already
+ * in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy
+ * data we don't need, truncate the item's data
+ * length to the prefix bytes.
+ */
+ tmpkey->size = kpack->prefix;
+ WT_ERR(__wt_buf_grow(session,
+ tmpkey, tmpkey->size + kpack->size));
+ memcpy((uint8_t *)tmpkey->mem + tmpkey->size,
+ kpack->data, kpack->size);
+ tmpkey->size += kpack->size;
+ } else
+ WT_ERR(__wt_row_leaf_key_copy(
+ session, page, rip, tmpkey));
+build:
+ WT_ERR(__rec_cell_build_leaf_key(session, r,
+ tmpkey->data, tmpkey->size, &ovfl_key));
+ }
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+ * In one path above, we copied address blocks from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (onpage_ovfl) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, kpack, r->cur));
+ onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key is
+ * written to the new page, and (unless we're already
+ * working with an overflow key), rebuild the key
+ * without prefix compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_ERR(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (dictionary && btree->dictionary)
+ WT_ERR(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+leaf_insert: /* Write any K/V pairs inserted into the page after this key. */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
+ WT_ERR(__rec_row_leaf_insert(session, r, ins));
+ }
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+err: __wt_scr_free(&tmpkey);
+ __wt_scr_free(&tmpval);
+ return (ret);
+}
+
+/*
+ * __rec_row_leaf_insert --
+ * Walk an insert chain, writing K/V pairs.
+ */
+static int
+__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
+{
+ WT_BTREE *btree;
+ WT_KV *key, *val;
+ WT_UPDATE *upd;
+ int ovfl_key;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+ val = &r->v;
+
+ for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
+ /* Look for an update. 
*/ + WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd)) + continue; + + if (upd->size == 0) /* Build value cell. */ + val->len = 0; + else + WT_RET(__rec_cell_build_val(session, r, + WT_UPDATE_DATA(upd), upd->size, (uint64_t)0)); + + /* Build key cell. */ + WT_RET(__rec_cell_build_leaf_key(session, r, + WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); + + /* Boundary: split or write the page. */ + while (key->len + val->len > r->space_avail) { + if (r->raw_compression) { + WT_RET(__rec_split_raw(session, r)); + continue; + } + WT_RET(__rec_split(session, r)); + + /* + * Turn off prefix compression until a full key written + * to the new page, and (unless we're already working + * with an overflow key), rebuild the key without prefix + * compression. + */ + if (r->key_pfx_compress_conf) { + r->key_pfx_compress = 0; + if (!ovfl_key) + WT_RET(__rec_cell_build_leaf_key( + session, r, NULL, 0, &ovfl_key)); + } + } + + /* Copy the key/value pair onto the page. */ + __rec_copy_incr(session, r, key); + if (val->len == 0) + r->any_empty_value = 1; + else { + r->all_empty_value = 0; + if (btree->dictionary) + WT_RET(__rec_dict_replace(session, r, 0, val)); + __rec_copy_incr(session, r, val); + } + + /* Update compression state. */ + __rec_key_state_update(r, ovfl_key); + } + + return (0); +} + +/* + * __rec_split_discard -- + * Discard the pages resulting from a previous split. + */ +static int +__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BM *bm; + WT_DECL_RET; + WT_PAGE_MODIFY *mod; + WT_MULTI *multi; + uint32_t i; + + bm = S2BT(session)->bm; + mod = page->modify; + + /* + * A page that split is being reconciled for the second, or subsequent + * time; discard underlying block space used in the last reconciliation + * that is not being reused for this reconciliation. + */ + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + __wt_free(session, multi->key.ikey); + break; + } + if (multi->skip == NULL) { + if (multi->addr.reuse) + multi->addr.addr = NULL; + else { + WT_RET(bm->free(bm, session, + multi->addr.addr, multi->addr.size)); + __wt_free(session, multi->addr.addr); + } + } else { + __wt_free(session, multi->skip); + __wt_free(session, multi->skip_dsk); + } + } + __wt_free(session, mod->mod_multi); + mod->mod_multi_entries = 0; + + /* + * This routine would be trivial, and only walk a single page freeing + * any blocks written to support the split, except for root splits. + * In the case of root splits, we have to cope with multiple pages in + * a linked list, and we also have to discard overflow items written + * for the page. + */ + switch (page->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + if (mod->mod_root_split == NULL) + break; + WT_RET(__rec_split_discard(session, mod->mod_root_split)); + WT_RET(__wt_ovfl_track_wrapup(session, mod->mod_root_split)); + __wt_page_out(session, &mod->mod_root_split); + break; + } + + return (ret); +} + +/* + * __rec_write_wrapup -- + * Finish the reconciliation. 
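+ * In outline: discard any blocks left over from a previous
+ * reconciliation of this page, wrap up overflow tracking, then
+ * record the new state: an empty page, a single replacement
+ * block, or multiple blocks when the page split.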
+ */ +static int +__rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BM *bm; + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + WT_REF *ref; + size_t addr_size; + const uint8_t *addr; + + btree = S2BT(session); + bm = btree->bm; + mod = page->modify; + ref = r->ref; + + /* + * This page may have previously been reconciled, and that information + * is now about to be replaced. Make sure it's discarded at some point, + * and clear the underlying modification information, we're creating a + * new reality. + */ + switch (F_ISSET(mod, WT_PM_REC_MASK)) { + case 0: /* + * The page has never been reconciled before, free the original + * address blocks (if any). The "if any" is for empty trees + * created when a new tree is opened or previously deleted pages + * instantiated in memory. + * + * The exception is root pages are never tracked or free'd, they + * are checkpoints, and must be explicitly dropped. + */ + if (__wt_ref_is_root(ref)) + break; + if (ref->addr != NULL) { + /* + * Free the page and clear the address (so we don't free + * it twice). + */ + WT_RET(__wt_ref_info( + session, ref, &addr, &addr_size, NULL)); + WT_RET(bm->free(bm, session, addr, addr_size)); + if (__wt_off_page(ref->home, ref->addr)) { + __wt_free( + session, ((WT_ADDR *)ref->addr)->addr); + __wt_free(session, ref->addr); + } + ref->addr = NULL; + } + break; + case WT_PM_REC_EMPTY: /* Page deleted */ + break; + case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ + /* + * Discard the multiple replacement blocks. + */ + WT_RET(__rec_split_discard(session, page)); + break; + case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ + /* + * Discard the replacement leaf page's blocks. + * + * The exception is root pages are never tracked or free'd, they + * are checkpoints, and must be explicitly dropped. + */ + if (!__wt_ref_is_root(ref)) + WT_RET(bm->free(bm, session, + mod->mod_replace.addr, mod->mod_replace.size)); + + /* Discard the replacement page's address. */ + __wt_free(session, mod->mod_replace.addr); + mod->mod_replace.size = 0; + break; + WT_ILLEGAL_VALUE(session); + } + F_CLR(mod, WT_PM_REC_MASK); + + /* + * Wrap up overflow tracking. If we are about to create a checkpoint, + * the system must be entirely consistent at that point (the underlying + * block manager is presumably going to do some action to resolve the + * list of allocated/free/whatever blocks that are associated with the + * checkpoint). + */ + WT_RET(__wt_ovfl_track_wrapup(session, page)); + + switch (r->bnd_next) { + case 0: /* Page delete */ + WT_RET(__wt_verbose( + session, WT_VERB_RECONCILE, "page %p empty", page)); + WT_STAT_FAST_DATA_INCR(session, rec_page_delete); + + /* If this is the root page, we need to create a sync point. */ + ref = r->ref; + if (__wt_ref_is_root(ref)) + WT_RET( + bm->checkpoint(bm, session, NULL, btree->ckpt, 0)); + + /* + * If the page was empty, we want to discard it from the tree + * by discarding the parent's key when evicting the parent. + * Mark the page as deleted, then return success, leaving the + * page in memory. If the page is subsequently modified, that + * is OK, we'll just reconcile it again. + */ + F_SET(mod, WT_PM_REC_EMPTY); + break; + case 1: /* 1-for-1 page swap */ + /* + * Because WiredTiger's pages grow without splitting, we're + * replacing a single page with another single page most of + * the time. + */ + bnd = &r->bnd[0]; + + /* + * If we're saving/restoring changes for this page, there's + * nothing to write. 
Allocate, then initialize the array of + * replacement blocks. + */ + if (bnd->skip != NULL) { + WT_RET(__wt_calloc_def( + session, r->bnd_next, &mod->mod_multi)); + multi = mod->mod_multi; + multi->skip = bnd->skip; + multi->skip_entries = bnd->skip_next; + bnd->skip = NULL; + multi->skip_dsk = bnd->dsk; + bnd->dsk = NULL; + mod->mod_multi_entries = 1; + + F_SET(mod, WT_PM_REC_MULTIBLOCK); + break; + } + + /* + * If this is a root page, then we don't have an address and we + * have to create a sync point. The address was cleared when + * we were about to write the buffer so we know what to do here. + */ + if (bnd->addr.addr == NULL) + WT_RET(__wt_bt_write(session, + &r->dsk, NULL, NULL, 1, bnd->already_compressed)); + else { + mod->mod_replace = bnd->addr; + bnd->addr.addr = NULL; + } + + F_SET(mod, WT_PM_REC_REPLACE); + break; + default: /* Page split */ + WT_RET(__wt_verbose(session, WT_VERB_RECONCILE, + "page %p reconciled into %" PRIu32 " pages", + page, r->bnd_next)); + + switch (page->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + WT_STAT_FAST_DATA_INCR( + session, rec_multiblock_internal); + break; + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + case WT_PAGE_ROW_LEAF: + WT_STAT_FAST_DATA_INCR(session, rec_multiblock_leaf); + break; + WT_ILLEGAL_VALUE(session); + } + + /* Display the actual split keys. */ + if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) { + WT_DECL_ITEM(tkey); + WT_DECL_RET; + uint32_t i; + + if (page->type == WT_PAGE_ROW_INT || + page->type == WT_PAGE_ROW_LEAF) + WT_RET(__wt_scr_alloc(session, 0, &tkey)); + for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i) + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_ERR(__wt_buf_set_printable( + session, tkey, + bnd->key.data, bnd->key.size)); + WT_ERR(__wt_verbose( + session, WT_VERB_SPLIT, + "split: starting key " + "%.*s", + (int)tkey->size, + (const char *)tkey->data)); + break; + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_VAR: + WT_ERR(__wt_verbose( + session, WT_VERB_SPLIT, + "split: starting recno %" PRIu64, + bnd->recno)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } +err: __wt_scr_free(&tkey); + WT_RET(ret); + } + if (r->bnd_next > r->bnd_next_max) { + r->bnd_next_max = r->bnd_next; + WT_STAT_FAST_DATA_SET( + session, rec_multiblock_max, r->bnd_next_max); + } + + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_RET(__rec_split_row(session, r, page)); + break; + case WT_PAGE_COL_INT: + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_RET(__rec_split_col(session, r, page)); + break; + WT_ILLEGAL_VALUE(session); + } + F_SET(mod, WT_PM_REC_MULTIBLOCK); + break; + } + + /* + * If updates were skipped, the tree isn't clean. The checkpoint call + * cleared the tree's modified value before calling the eviction thread, + * so we must explicitly reset the tree's modified flag. We insert a + * barrier after the change for clarity (the requirement is the value + * be set before a subsequent checkpoint reads it, and because the + * current checkpoint is waiting on this reconciliation to complete, + * there's no risk of that happening). + * + * Otherwise, if no updates were skipped, we have a new maximum + * transaction written for the page (used to decide if a clean page can + * be evicted). 
The page only might be clean; if the write generation + * is unchanged since reconciliation started, clear it and update cache + * dirty statistics, if the write generation changed, then the page has + * been written since we started reconciliation, it cannot be + * discarded. + */ + if (r->leave_dirty) { + mod->first_dirty_txn = r->skipped_txn; + + btree->modified = 1; + WT_FULL_BARRIER(); + } else { + mod->rec_max_txn = r->max_txn; + + if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0)) + __wt_cache_dirty_decr(session, page); + } + + return (0); +} + +/* + * __rec_write_wrapup_err -- + * Finish the reconciliation on error. + */ +static int +__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BM *bm; + WT_BOUNDARY *bnd; + WT_DECL_RET; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + uint32_t i; + + bm = S2BT(session)->bm; + mod = page->modify; + + /* + * Clear the address-reused flag from the multiblock reconciliation + * information (otherwise we might think the backing block is being + * reused on a subsequent reconciliation where we want to free it). + */ + if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK) + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) + multi->addr.reuse = 0; + + /* + * On error, discard blocks we've written, they're unreferenced by the + * tree. This is not a question of correctness, we're avoiding block + * leaks. + * + * Don't discard backing blocks marked for reuse, they remain part of + * a previous reconciliation. + */ + WT_TRET(__wt_ovfl_track_wrapup_err(session, page)); + for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i) + if (bnd->addr.addr != NULL) { + if (bnd->addr.reuse) + bnd->addr.addr = NULL; + else { + WT_TRET(bm->free(bm, session, + bnd->addr.addr, bnd->addr.size)); + __wt_free(session, bnd->addr.addr); + } + } + + return (ret); +} + +/* + * __rec_split_row -- + * Split a row-store page into a set of replacement blocks. + */ +static int +__rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BOUNDARY *bnd; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + WT_REF *ref; + uint32_t i; + size_t size; + void *p; + + mod = page->modify; + + /* We never set the first page's key, grab it from the original page. */ + ref = r->ref; + if (__wt_ref_is_root(ref)) + WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1)); + else { + __wt_ref_key(ref->home, ref, &p, &size); + WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size)); + } + + /* Allocate, then initialize the array of replacement blocks. */ + WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi)); + + for (multi = mod->mod_multi, + bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { + WT_RET(__wt_row_ikey(session, 0, + bnd->key.data, bnd->key.size, &multi->key.ikey)); + + if (bnd->skip == NULL) { + multi->addr = bnd->addr; + multi->addr.reuse = 0; + multi->size = bnd->size; + multi->cksum = bnd->cksum; + bnd->addr.addr = NULL; + } else { + multi->skip = bnd->skip; + multi->skip_entries = bnd->skip_next; + bnd->skip = NULL; + multi->skip_dsk = bnd->dsk; + bnd->dsk = NULL; + } + } + mod->mod_multi_entries = r->bnd_next; + + return (0); +} + +/* + * __rec_split_col -- + * Split a column-store page into a set of replacement blocks. + */ +static int +__rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BOUNDARY *bnd; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + uint32_t i; + + mod = page->modify; + + /* Allocate, then initialize the array of replacement blocks. 
*/ + WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi)); + + for (multi = mod->mod_multi, + bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { + multi->key.recno = bnd->recno; + + if (bnd->skip == NULL) { + multi->addr = bnd->addr; + multi->addr.reuse = 0; + multi->size = bnd->size; + multi->cksum = bnd->cksum; + bnd->addr.addr = NULL; + } else { + multi->skip = bnd->skip; + multi->skip_entries = bnd->skip_next; + bnd->skip = NULL; + multi->skip_dsk = bnd->dsk; + bnd->dsk = NULL; + } + } + mod->mod_multi_entries = r->bnd_next; + + return (0); +} + +/* + * __rec_cell_build_int_key -- + * Process a key and return a WT_CELL structure and byte string to be + * stored on a row-store internal page. + */ +static int +__rec_cell_build_int_key(WT_SESSION_IMPL *session, + WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp) +{ + WT_BTREE *btree; + WT_KV *key; + + *is_ovflp = 0; + + btree = S2BT(session); + + key = &r->k; + + /* Copy the bytes into the "current" and key buffers. */ + WT_RET(__wt_buf_set(session, r->cur, data, size)); + WT_RET(__wt_buf_set(session, &key->buf, data, size)); + + /* Create an overflow object if the data won't fit. */ + if (size > btree->maxintlitem) { + WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_internal); + + *is_ovflp = 1; + return (__rec_cell_build_ovfl( + session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0)); + } + + key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size); + key->len = key->cell_len + key->buf.size; + + return (0); +} + +/* + * __rec_cell_build_leaf_key -- + * Process a key and return a WT_CELL structure and byte string to be + * stored on a row-store leaf page. + */ +static int +__rec_cell_build_leaf_key(WT_SESSION_IMPL *session, + WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp) +{ + WT_BTREE *btree; + WT_KV *key; + size_t pfx_max; + uint8_t pfx; + const uint8_t *a, *b; + + *is_ovflp = 0; + + btree = S2BT(session); + + key = &r->k; + + pfx = 0; + if (data == NULL) + /* + * When data is NULL, our caller has a prefix compressed key + * they can't use (probably because they just crossed a split + * point). Use the full key saved when last called, instead. + */ + WT_RET(__wt_buf_set( + session, &key->buf, r->cur->data, r->cur->size)); + else { + /* + * Save a copy of the key for later reference: we use the full + * key for prefix-compression comparisons, and if we are, for + * any reason, unable to use the compressed key we generate. + */ + WT_RET(__wt_buf_set(session, r->cur, data, size)); + + /* + * Do prefix compression on the key. We know by definition the + * previous key sorts before the current key, which means the + * keys must differ and we just need to compare up to the + * shorter of the two keys. + */ + if (r->key_pfx_compress) { + /* + * We can't compress out more than 256 bytes, limit the + * comparison to that. + */ + pfx_max = UINT8_MAX; + if (size < pfx_max) + pfx_max = size; + if (r->last->size < pfx_max) + pfx_max = r->last->size; + for (a = data, b = r->last->data; pfx < pfx_max; ++pfx) + if (*a++ != *b++) + break; + + /* + * Prefix compression may cost us CPU and memory when + * the page is re-loaded, don't do it unless there's + * reasonable gain. + */ + if (pfx < btree->prefix_compression_min) + pfx = 0; + else + WT_STAT_FAST_DATA_INCRV( + session, rec_prefix_compression, pfx); + } + + /* Copy the non-prefix bytes into the key buffer. 
*/ + WT_RET(__wt_buf_set( + session, &key->buf, (uint8_t *)data + pfx, size - pfx)); + } + + /* Optionally compress the key using the Huffman engine. */ + if (btree->huffman_key != NULL) + WT_RET(__wt_huffman_encode(session, btree->huffman_key, + key->buf.data, (uint32_t)key->buf.size, &key->buf)); + + /* Create an overflow object if the data won't fit. */ + if (key->buf.size > btree->maxleafitem) { + /* + * Overflow objects aren't prefix compressed -- rebuild any + * object that was prefix compressed. + */ + if (pfx == 0) { + WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_leaf); + + *is_ovflp = 1; + return (__rec_cell_build_ovfl( + session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0)); + } + return ( + __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp)); + } + + key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size); + key->len = key->cell_len + key->buf.size; + + return (0); +} + +/* + * __rec_cell_build_addr -- + * Process an address reference and return a cell structure to be stored + * on the page. + */ +static void +__rec_cell_build_addr(WT_RECONCILE *r, + const void *addr, size_t size, u_int cell_type, uint64_t recno) +{ + WT_KV *val; + + val = &r->v; + + /* + * We don't check the address size because we can't store an address on + * an overflow page: if the address won't fit, the overflow page's + * address won't fit either. This possibility must be handled by Btree + * configuration, we have to disallow internal page sizes that are too + * small with respect to the largest address cookie the underlying block + * manager might return. + */ + + /* + * We don't copy the data into the buffer, it's not necessary; just + * re-point the buffer's data/length fields. + */ + val->buf.data = addr; + val->buf.size = size; + val->cell_len = + __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size); + val->len = val->cell_len + val->buf.size; +} + +/* + * __rec_cell_build_val -- + * Process a data item and return a WT_CELL structure and byte string to + * be stored on the page. + */ +static int +__rec_cell_build_val(WT_SESSION_IMPL *session, + WT_RECONCILE *r, const void *data, size_t size, uint64_t rle) +{ + WT_BTREE *btree; + WT_KV *val; + + btree = S2BT(session); + + val = &r->v; + + /* + * We don't copy the data into the buffer, it's not necessary; just + * re-point the buffer's data/length fields. + */ + val->buf.data = data; + val->buf.size = size; + + /* Handle zero-length cells quickly. */ + if (size != 0) { + /* Optionally compress the data using the Huffman engine. */ + if (btree->huffman_value != NULL) + WT_RET(__wt_huffman_encode( + session, btree->huffman_value, + val->buf.data, (uint32_t)val->buf.size, &val->buf)); + + /* Create an overflow object if the data won't fit. */ + if (val->buf.size > btree->maxleafitem) { + WT_STAT_FAST_DATA_INCR(session, rec_overflow_value); + + return (__rec_cell_build_ovfl( + session, r, val, WT_CELL_VALUE_OVFL, rle)); + } + } + val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size); + val->len = val->cell_len + val->buf.size; + + return (0); +} + +/* + * __rec_cell_build_ovfl -- + * Store overflow items in the file, returning the address cookie. 
+ */ +static int +__rec_cell_build_ovfl(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_KV *kv, uint8_t type, uint64_t rle) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_HEADER *dsk; + size_t size; + uint8_t *addr, buf[WT_BTREE_MAX_ADDR_COOKIE]; + + btree = S2BT(session); + bm = btree->bm; + page = r->page; + + /* Track if page has overflow items. */ + r->ovfl_items = 1; + + /* + * See if this overflow record has already been written and reuse it if + * possible. Else, write a new overflow record. + */ + if (!__wt_ovfl_reuse_search(session, page, + &addr, &size, kv->buf.data, kv->buf.size)) { + /* Allocate a buffer big enough to write the overflow record. */ + size = kv->buf.size; + WT_RET(bm->write_size(bm, session, &size)); + WT_RET(__wt_scr_alloc(session, size, &tmp)); + + /* Initialize the buffer: disk header and overflow record. */ + dsk = tmp->mem; + memset(dsk, 0, WT_PAGE_HEADER_SIZE); + dsk->type = WT_PAGE_OVFL; + dsk->u.datalen = (uint32_t)kv->buf.size; + memcpy(WT_PAGE_HEADER_BYTE(btree, dsk), + kv->buf.data, kv->buf.size); + dsk->mem_size = tmp->size = + WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size; + + /* Write the buffer. */ + addr = buf; + WT_ERR(__wt_bt_write(session, tmp, addr, &size, 0, 0)); + + /* + * Track the overflow record (unless it's a bulk load, which + * by definition won't ever reuse a record. + */ + if (!r->is_bulk_load) + WT_ERR(__wt_ovfl_reuse_add(session, page, + addr, size, kv->buf.data, kv->buf.size)); + } + + /* Set the callers K/V to reference the overflow record's address. */ + WT_ERR(__wt_buf_set(session, &kv->buf, addr, size)); + + /* Build the cell and return. */ + kv->cell_len = __wt_cell_pack_ovfl(&kv->cell, type, rle, kv->buf.size); + kv->len = kv->cell_len + kv->buf.size; + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * The dictionary -- + * The rest of this file is support for dictionaries. + * + * It's difficult to write generic skiplist functions without turning a single + * memory allocation into two, or requiring a function call instead of a simple + * comparison. Fortunately, skiplists are relatively simple things and we can + * include them in-place. If you need generic skip-list functions to modify, + * this set wouldn't be a bad place to start. + * + * __rec_dictionary_skip_search -- + * Search a dictionary skiplist. + */ +static WT_DICTIONARY * +__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash) +{ + WT_DICTIONARY **e; + int i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { + if (*e == NULL) { /* Empty levels */ + --i; + --e; + continue; + } + + /* + * Return any exact matches: we don't care in what search level + * we found a match. + */ + if ((*e)->hash == hash) /* Exact match */ + return (*e); + if ((*e)->hash > hash) { /* Drop down a level */ + --i; + --e; + } else /* Keep going at this level */ + e = &(*e)->next[i]; + } + return (NULL); +} + +/* + * __rec_dictionary_skip_search_stack -- + * Search a dictionary skiplist, returning an insert/remove stack. + */ +static void +__rec_dictionary_skip_search_stack( + WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash) +{ + WT_DICTIONARY **e; + int i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. 
+ */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) + if (*e == NULL || (*e)->hash > hash) + stack[i--] = e--; /* Drop down a level */ + else + e = &(*e)->next[i]; /* Keep going at this level */ +} + +/* + * __rec_dictionary_skip_insert -- + * Insert an entry into the dictionary skip-list. + */ +static void +__rec_dictionary_skip_insert( + WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash) +{ + WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH]; + u_int i; + + /* Insert the new entry into the skiplist. */ + __rec_dictionary_skip_search_stack(head, stack, hash); + for (i = 0; i < e->depth; ++i) { + e->next[i] = *stack[i]; + *stack[i] = e; + } +} + +/* + * __rec_dictionary_init -- + * Allocate and initialize the dictionary. + */ +static int +__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots) +{ + u_int depth, i; + + /* Free any previous dictionary. */ + __rec_dictionary_free(session, r); + + r->dictionary_slots = slots; + WT_RET(__wt_calloc(session, + r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary)); + for (i = 0; i < r->dictionary_slots; ++i) { + depth = __wt_skip_choose_depth(session); + WT_RET(__wt_calloc(session, 1, + sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *), + &r->dictionary[i])); + r->dictionary[i]->depth = depth; + } + return (0); +} + +/* + * __rec_dictionary_free -- + * Free the dictionary. + */ +static void +__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + u_int i; + + if (r->dictionary == NULL) + return; + + /* + * We don't correct dictionary_slots when we fail during allocation, + * but that's OK, the value is either NULL or a memory reference to + * be free'd. + */ + for (i = 0; i < r->dictionary_slots; ++i) + __wt_free(session, r->dictionary[i]); + __wt_free(session, r->dictionary); +} + +/* + * __rec_dictionary_reset -- + * Reset the dictionary when reconciliation restarts and when crossing a + * page boundary (a potential split). + */ +static void +__rec_dictionary_reset(WT_RECONCILE *r) +{ + if (r->dictionary_slots) { + r->dictionary_next = 0; + memset(r->dictionary_head, 0, sizeof(r->dictionary_head)); + } +} + +/* + * __rec_dictionary_lookup -- + * Check the dictionary for a matching value on this page. + */ +static int +__rec_dictionary_lookup( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val, WT_DICTIONARY **dpp) +{ + WT_DICTIONARY *dp, *next; + uint64_t hash; + int match; + + *dpp = NULL; + + /* Search the dictionary, and return any match we find. */ + hash = __wt_hash_fnv64(val->buf.data, val->buf.size); + for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); + dp != NULL && dp->hash == hash; dp = dp->next[0]) { + WT_RET(__wt_cell_pack_data_match( + dp->cell, &val->cell, val->buf.data, &match)); + if (match) { + WT_STAT_FAST_DATA_INCR(session, rec_dictionary); + *dpp = dp; + return (0); + } + } + + /* + * We're not doing value replacement in the dictionary. We stop adding + * new entries if we run out of empty dictionary slots (but continue to + * use the existing entries). I can't think of any reason a leaf page + * value is more likely to be seen because it was seen more recently + * than some other value: if we find working sets where that's not the + * case, it shouldn't be too difficult to maintain a pointer which is + * the next dictionary slot to re-use. 
+ */ +	if (r->dictionary_next >= r->dictionary_slots) +		return (0); + +	/* +	 * Set the hash value, we'll add this entry into the dictionary when we +	 * write it into the page's disk image buffer (because that's when we +	 * know where on the page it will be written). +	 */ +	next = r->dictionary[r->dictionary_next++]; +	next->cell = NULL;		/* Not necessary, just cautious. */ +	next->hash = hash; +	__rec_dictionary_skip_insert(r->dictionary_head, next, hash); +	*dpp = next; +	return (0); +}
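
The second switch in __rec_write_wrapup keys entirely off how many blocks reconciliation produced. A minimal sketch of that decision; the names (rec_result, blocks_written) are invented for the illustration, the real code uses r->bnd_next and sets the WT_PM_REC_EMPTY / WT_PM_REC_REPLACE / WT_PM_REC_MULTIBLOCK flags.

#include <stdint.h>
#include <stdio.h>

enum rec_result { REC_EMPTY, REC_REPLACE, REC_MULTIBLOCK };

static enum rec_result
rec_result(uint32_t blocks_written)
{
	if (blocks_written == 0)
		return (REC_EMPTY);		/* no live entries: page is deleted */
	if (blocks_written == 1)
		return (REC_REPLACE);		/* 1-for-1 page swap */
	return (REC_MULTIBLOCK);		/* page split into multiple blocks */
}

int
main(void)
{
	printf("%d %d %d\n", rec_result(0), rec_result(1), rec_result(5));
	return (0);
}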
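
Marking the page clean at the end of __rec_write_wrapup depends on the write generation being unchanged since reconciliation started; the real code does this with WT_ATOMIC_CAS4. A stand-alone sketch of the same idea using C11 atomics; the structure and function names are assumptions made for the example, not WiredTiger's types.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented page-modify structure: only the field the sketch needs. */
struct page_mod {
	_Atomic uint32_t write_gen;	/* bumped whenever the page is modified */
};

/*
 * After the reconciled image has been written, the page is clean only if
 * nothing modified it while we were reconciling: compare-and-swap the
 * write generation sampled when reconciliation started back to zero.  If
 * the CAS fails, an update raced with reconciliation and the page stays
 * dirty (it will simply be reconciled again later).
 */
static bool
mark_clean_if_unchanged(struct page_mod *mod, uint32_t orig_write_gen)
{
	uint32_t expected = orig_write_gen;

	return (atomic_compare_exchange_strong(&mod->write_gen, &expected, 0));
}

int
main(void)
{
	struct page_mod mod = { .write_gen = 7 };

	printf("clean: %d\n", mark_clean_if_unchanged(&mod, 7));	/* 1 */
	mod.write_gen = 8;	/* a concurrent update bumps the generation */
	printf("clean: %d\n", mark_clean_if_unchanged(&mod, 7));	/* 0 */
	return (0);
}

A failed CAS is not an error: the page was written since reconciliation started, so it cannot be discarded and simply stays dirty, which is the case the surrounding comment describes.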
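
__rec_cell_build_leaf_key trims the bytes a key shares with the previously written key, capped at UINT8_MAX and skipped entirely below a minimum threshold. A minimal stand-alone sketch of that prefix-compression step, assuming plain byte-string keys and an arbitrary PREFIX_COMPRESSION_MIN constant (not the configured btree->prefix_compression_min).

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PREFIX_COMPRESSION_MIN	4	/* assumed threshold, not the configured value */

/*
 * Return how many leading bytes of "key" are shared with the previously
 * written key and can be dropped from the stored cell.  The count must
 * fit in one byte, so it is capped at UINT8_MAX, and short prefixes are
 * not worth the cost of rebuilding keys when the page is read.
 */
static uint8_t
prefix_compress(const uint8_t *prev, size_t prev_size,
    const uint8_t *key, size_t key_size)
{
	size_t max;
	uint8_t pfx;

	max = key_size < prev_size ? key_size : prev_size;
	if (max > UINT8_MAX)
		max = UINT8_MAX;

	for (pfx = 0; pfx < max; ++pfx)
		if (prev[pfx] != key[pfx])
			break;

	return (pfx < PREFIX_COMPRESSION_MIN ? 0 : pfx);
}

int
main(void)
{
	const char *prev = "application/json";
	const char *key = "application/xml";

	/* The keys share "application/", so 12 bytes compress out. */
	printf("%d\n", prefix_compress((const uint8_t *)prev, strlen(prev),
	    (const uint8_t *)key, strlen(key)));
	return (0);
}

The code above also keeps a copy of the full key in r->cur so it can fall back to writing the key uncompressed, for example when the caller has just crossed a split point and the previous key is no longer usable.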
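
__rec_cell_build_ovfl writes an oversized key or value into its own block and leaves only an address cookie in the cell. The sketch below illustrates that inline-versus-overflow decision with an invented cell layout and a caller-supplied block writer; it is not WiredTiger's cell format, and the real cookie is an opaque value produced by the block manager.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_INLINE_ITEM	64		/* assumed threshold, cf. btree->maxleafitem */

struct cell {
	bool is_overflow;
	union {
		struct {
			uint8_t data[MAX_INLINE_ITEM];
			size_t size;
		} inline_item;
		struct {
			uint64_t offset;	/* where the overflow block was written */
			uint32_t size;		/* number of bytes it holds */
		} ovfl_cookie;
	} u;
};

/* Store a value inline if it fits, out of line otherwise. */
static void
cell_build_val(struct cell *c, const void *data, size_t size,
    uint64_t (*write_ovfl_block)(const void *, size_t))
{
	if (size <= MAX_INLINE_ITEM) {
		c->is_overflow = false;
		memcpy(c->u.inline_item.data, data, size);
		c->u.inline_item.size = size;
	} else {
		c->is_overflow = true;
		c->u.ovfl_cookie.offset = write_ovfl_block(data, size);
		c->u.ovfl_cookie.size = (uint32_t)size;
	}
}

/* Pretend block writer for the usage example below. */
static uint64_t
fake_write_ovfl_block(const void *data, size_t size)
{
	(void)data;
	(void)size;
	return (4096);		/* pretend the block landed at offset 4096 */
}

int
main(void)
{
	struct cell c;
	char big[200] = { 0 };

	cell_build_val(&c, "small", 5, fake_write_ovfl_block);
	printf("overflow? %d\n", c.is_overflow);	/* 0 */
	cell_build_val(&c, big, sizeof(big), fake_write_ovfl_block);
	printf("overflow? %d\n", c.is_overflow);	/* 1 */
	return (0);
}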
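
The dictionary's skip-list search drops down a level whenever the next entry is missing or sorts after the search key, and otherwise keeps moving forward on the current level. A generic, self-contained version of that walk, assuming a simple node type and an arbitrary SKIP_MAXDEPTH (not WiredTiger's WT_SKIP_MAXDEPTH).

#include <stdint.h>
#include <stdio.h>

#define SKIP_MAXDEPTH	10	/* assumed maximum depth */

struct node {
	uint64_t hash;
	struct node *next[SKIP_MAXDEPTH];	/* forward pointers, one per level */
};

/*
 * Search a skip list sorted by hash: start at the highest level, go as far
 * as possible on each level, and drop down a level when the next entry is
 * missing or sorts after the search key.  Because the head array and each
 * node's next[] array are laid out per level, "--e" is all it takes to
 * step down, exactly as in the code above.
 */
static struct node *
skip_search(struct node **head, uint64_t hash)
{
	struct node **e;
	int i;

	for (i = SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL || (*e)->hash > hash) {	/* Drop down a level */
			--i;
			--e;
			continue;
		}
		if ((*e)->hash == hash)			/* Exact match */
			return (*e);
		e = &(*e)->next[i];			/* Keep going at this level */
	}
	return (NULL);
}

int
main(void)
{
	struct node *head[SKIP_MAXDEPTH] = { NULL };
	struct node a = { .hash = 10 }, b = { .hash = 30 };

	head[0] = &a;
	a.next[0] = &b;

	printf("30 found: %d\n", skip_search(head, 30) != NULL);	/* 1 */
	printf("20 found: %d\n", skip_search(head, 20) != NULL);	/* 0 */
	return (0);
}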
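
__rec_dictionary_lookup hashes the value (__wt_hash_fnv64 in the code above), reuses a previously written identical value when it finds one, and stops adding entries once the preallocated slots run out. A simplified sketch of that flow which substitutes a plain linked list for the skip list; the FNV-1a 64-bit constants and the memcmp comparison are assumptions made for the example, the real code compares packed cells.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A common FNV-1a 64-bit hash; the real __wt_hash_fnv64 may differ. */
static uint64_t
hash_fnv64(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint64_t h = 14695981039346656037ULL;
	size_t i;

	for (i = 0; i < len; ++i) {
		h ^= p[i];
		h *= 1099511628211ULL;
	}
	return (h);
}

struct dict_entry {
	uint64_t hash;
	const void *data;		/* value already written to the page */
	size_t size;
	struct dict_entry *next;
};

struct dict {
	struct dict_entry *head;	/* plain list here; WT uses a skip list */
	struct dict_entry *pool;	/* preallocated slots */
	unsigned next_slot, slots;
};

/*
 * Return a previously seen identical value, or record this one if a free
 * slot remains.  Once the pool is exhausted, existing entries are still
 * reused but no new ones are added, matching the reconciliation code.
 */
static const struct dict_entry *
dict_lookup(struct dict *d, const void *data, size_t size)
{
	struct dict_entry *e;
	uint64_t h = hash_fnv64(data, size);

	for (e = d->head; e != NULL; e = e->next)
		if (e->hash == h && e->size == size &&
		    memcmp(e->data, data, size) == 0)
			return (e);		/* duplicate: reuse the existing cell */

	if (d->next_slot >= d->slots)
		return (NULL);			/* out of slots: write the value normally */

	e = &d->pool[d->next_slot++];
	e->hash = h;
	e->data = data;
	e->size = size;
	e->next = d->head;
	d->head = e;
	return (NULL);				/* first occurrence: write it */
}

int
main(void)
{
	struct dict_entry pool[4];
	struct dict d = { .head = NULL, .pool = pool, .next_slot = 0, .slots = 4 };

	(void)dict_lookup(&d, "hello", 5);		/* first time: recorded */
	if (dict_lookup(&d, "hello", 5) != NULL)
		puts("duplicate value, reuse the existing cell");
	return (0);
}

As in the code above, exhausting the slots is not an error: existing entries keep being matched, and values that no longer fit in the dictionary are simply written without being tracked.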