/*- * See the file LICENSE for redistribution information. * * Copyright (c) 2008-2011 WiredTiger, Inc. * All rights reserved. */ #include "wt_internal.h" #include "btree.i" #include "cell.i" struct __rec_boundary; typedef struct __rec_boundary WT_BOUNDARY; struct __rec_discard; typedef struct __rec_discard WT_DISCARD; struct __rec_kv; typedef struct __rec_kv WT_KV; struct __rec_split; typedef struct __rec_split WT_SPLIT; /* * WT_RECONCILE -- * Information tracking a single page reconciliation. */ typedef struct { /* * Reconciliation is a fairly simple process: take an in-memory page, * walk through each entry in the page, build a backing disk image in * a temporary buffer that represents that information, and write that * buffer to disk. */ WT_BUF dsk; /* Temporary disk-image buffer */ int evict; /* The page is being discarded */ int locked; /* The tree is locked down */ /* * As pages are reconciled, split pages are merged into their parent * and discarded; deleted pages and overflow K/V items are discarded * along with their underlying blocks. If a page or overflow item * were discarded and page reconciliation then failed for any reason, * the in-memory tree would be incorrect. To keep the tree correct * until we're sure page reconciliation will succeed, we keep a list * of objects to discard when the reconciled page is discarded. */ struct __rec_discard { WT_PAGE *page; /* Inactive page */ uint32_t addr; /* Page's addr/size, */ uint32_t size; /* or block pair to delete */ } *discard; /* List of discard objects */ uint32_t discard_next; /* Next discard slot */ uint32_t discard_entries; /* Total discard slots */ uint32_t discard_allocated; /* Bytes allocated */ /* * Reconciliation gets tricky if we have to split a page, that is, if * the disk image we create exceeds the maximum size of disk images for * this page type. First, the split sizes: reconciliation splits to a * smaller-than-maximum page size when a split is required so we don't * repeatedly split a packed page. */ uint32_t page_size; /* Maximum page size */ uint32_t split_size; /* Split page size */ /* * The problem with splits is we've done a lot of work by the time we * realize we're going to have to split -- we don't want to start over. * * To keep from having to start over when we reach the maximum page * size, track the page information when we approach a split boundary. */ struct __rec_boundary { uint64_t recno; /* Split's starting record */ uint32_t entries; /* Split's entries */ /* * The first byte in the split chunk; the difference between * the next slot's first byte and this slot's first byte is * the length of the split chunk. */ uint8_t *start; /* Split's first byte */ } *bnd; /* Saved boundaries */ uint32_t bnd_next; /* Next boundary slot */ uint32_t bnd_entries; /* Total boundary slots */ uint32_t bnd_allocated; /* Bytes allocated */ /* * And there's state information as to where in this process we are: * (1) tracking split boundaries because we can still fit more split * chunks into the maximum page size, (2) tracking the maximum page * size boundary because we can't fit any more split chunks into the * maximum page size, (3) not performing boundary checks because it's * either not useful with the current page size configuration, or * because we've already been forced to split. 
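* * For example, a page that needs several split chunks moves from * SPLIT_BOUNDARY (saving a boundary as each split-size chunk fills), to * SPLIT_MAX (when the data approaches the maximum page size and the saved * chunks are written out), to SPLIT_TRACKING_OFF (where we simply write * out split-size chunks as they fill).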
*/ enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ SPLIT_MAX=1, /* Next: the maximum page boundary */ SPLIT_TRACKING_OFF=2 } /* No boundary checks */ state; /* * We track the current record number, the number of entries copied into * the temporary buffer, where we are in the temporary buffer, and how * much memory remains. Those items are packaged here rather than * passing pointers to stack locations through the code. */ uint64_t recno; /* Current record number */ uint32_t entries; /* Current number of entries */ uint8_t *first_free; /* Current first free byte */ uint32_t space_avail; /* Remaining space in this chunk */ /* * We track the total number of page entries copied into split chunks * so we can easily figure out how many entries are in the current split * chunk. */ uint32_t total_entries; /* Total entries in splits */ /* * If reconciling an in-memory page we're not evicting from memory, we * may split a page referencing an in-memory subtree. In the same way * we create a set of new disk images that reflect the split of the * original page's disk image, we have to create a set of new in-memory * images reflecting the split of the original page's in-memory image. * Once that's done, the new in-memory image must reference the same * subtrees as the old in-memory image; we maintain a list of in-memory * subtrees the original page referenced to make that fixup possible. * * We obviously could maintain this list on a per-split-chunk basis: * copy it into the WT_BOUNDARY structure if the boundary is reached, * then from the WT_BOUNDARY structure into the WT_SPLIT structure if * we split. We're binary-searching this array during fixup and I * don't expect it to be that large -- I don't expect this to be my * performance problem. */ WT_REF **imref; /* In-memory subtree reference list */ uint32_t imref_next; /* Next list slot */ uint32_t imref_entries; /* Total list slots */ uint32_t imref_allocated; /* Bytes allocated */ uint32_t imref_found; /* Fast check for search */ /* * When a split happens, we create a new internal page referencing the * split pages, and replace the original page with it. In order to * build that page, keep a list of all split pages written during the * page's reconciliation. */ struct __rec_split { WT_OFF_RECORD off; /* Address, size, recno */ /* * The key for a row-store page; no column-store key is needed * because the page's recno, stored in the WT_OFF_RECORD, is * the column-store key. */ WT_BUF key; /* Row key */ /* * If we had to create an in-memory version of the page, we'll * save it here for later connection between the newly created * split page and previously existing in-memory pages. * * For the same reason, maintain the original page's subtree * references for later fixup. */ WT_PAGE *imp; /* Page's in-memory version */ } *split; /* Saved splits */ uint32_t split_next; /* Next list slot */ uint32_t split_entries; /* Total list slots */ uint32_t split_allocated; /* Bytes allocated */ int cell_zero; /* Row-store internal page 0th key */ WT_ROW_REF *merge_ref; /* Row-store merge correction key */ WT_HAZARD *hazard; /* Copy of the hazard references */ uint32_t hazard_elem; /* Number of entries in the list */ /* * WT_KV -- * An on-page key/value item we're building.
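* * The cell describes the item's type and size, buf references the item's * data, and len caches cell_len plus the data size, so callers can test a * complete item against space_avail.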
*/ struct __rec_kv { WT_BUF buf; /* Data */ WT_CELL cell; /* Cell and cell's length */ uint32_t cell_len; uint32_t len; /* Total length of cell + data */ WT_OFF off; /* Associated off-page value */ } k, v; /* Key/Value being built */ WT_BUF *full, _full; /* Full-key being built */ WT_BUF *last, _last; /* Last full-key built */ int key_prefix_compress; /* If can prefix-compress next key */ } WT_RECONCILE; static inline void __rec_copy_incr(SESSION *, WT_RECONCILE *, WT_KV *); static inline int __rec_discard_add_ovfl(SESSION *, WT_CELL *); static inline void __rec_incr(SESSION *, WT_RECONCILE *, uint32_t); static inline void __rec_prefix_key_update(WT_RECONCILE *); static int __hazard_bsearch_cmp(const void *, const void *); static void __hazard_copy(SESSION *); static int __hazard_exclusive(SESSION *, WT_REF *); static int __hazard_qsort_cmp(const void *, const void *); static uint32_t __rec_allocation_size(SESSION *, WT_BUF *, uint8_t *); static int __rec_cell_build_key(SESSION *, const void *, uint32_t); static int __rec_cell_build_ovfl(SESSION *, WT_KV *, u_int); static int __rec_cell_build_val(SESSION *, void *, uint32_t); static int __rec_col_fix(SESSION *, WT_PAGE *); static int __rec_col_fix_bulk(SESSION *, WT_PAGE *); static int __rec_col_int(SESSION *, WT_PAGE *); static int __rec_col_merge(SESSION *, WT_PAGE *); static int __rec_col_rle(SESSION *, WT_PAGE *); static int __rec_col_rle_bulk(SESSION *, WT_PAGE *); static int __rec_col_split(SESSION *, WT_PAGE **, WT_PAGE *); static int __rec_col_var(SESSION *, WT_PAGE *); static int __rec_col_var_bulk(SESSION *, WT_PAGE *); static int __rec_discard_add(SESSION *, WT_PAGE *, uint32_t, uint32_t); static int __rec_discard_evict(SESSION *); static void __rec_discard_init(WT_RECONCILE *); static int __rec_imref_add(SESSION *, WT_REF *); static int __rec_imref_bsearch_cmp(const void *, const void *); static int __rec_imref_fixup(SESSION *, WT_PAGE *); static void __rec_imref_init(WT_RECONCILE *); static int __rec_imref_qsort_cmp(const void *, const void *); static int __rec_init(SESSION *, uint32_t); static int __rec_ovfl_delete(SESSION *, WT_PAGE *); static int __rec_parent_update( SESSION *, WT_PAGE *, WT_PAGE *, uint32_t, uint32_t, uint32_t); static void __rec_parent_update_clean(SESSION *, WT_PAGE *); static int __rec_row_int(SESSION *, WT_PAGE *); static int __rec_row_leaf(SESSION *, WT_PAGE *, uint32_t); static int __rec_row_leaf_insert(SESSION *, WT_INSERT *); static int __rec_row_merge(SESSION *, WT_PAGE *); static int __rec_row_split(SESSION *, WT_PAGE **, WT_PAGE *); static int __rec_split(SESSION *); static int __rec_split_finish(SESSION *); static int __rec_split_fixup(SESSION *); static int __rec_split_init( SESSION *, WT_PAGE *, uint64_t, uint32_t, uint32_t); static int __rec_split_write(SESSION *, WT_BUF *, void *); static int __rec_subtree(SESSION *, WT_PAGE *); static int __rec_subtree_col(SESSION *, WT_PAGE *); static void __rec_subtree_col_clear(SESSION *, WT_PAGE *); static int __rec_subtree_row(SESSION *, WT_PAGE *); static void __rec_subtree_row_clear(SESSION *, WT_PAGE *); static int __rec_wrapup(SESSION *, WT_PAGE *); /* * When discarding a page, we discard a number of things: pages merged into the * page, overflow items referenced by the page, and the underlying blocks. It * is all one discard family, but there are 3 different interfaces. 
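* The two macros below cover pages and raw blocks; __rec_discard_add_ovfl * handles overflow cells, and quietly ignores cells that don't reference * overflow items.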
*/ #define __rec_discard_add_block(session, addr, size) \ __rec_discard_add(session, NULL, addr, size) #define __rec_discard_add_page(session, page) \ __rec_discard_add(session, page, WT_PADDR(page), WT_PSIZE(page)) static inline int __rec_discard_add_ovfl(SESSION *session, WT_CELL *cell) { WT_OFF ovfl; if (__wt_cell_type_is_ovfl(cell)) { __wt_cell_off(cell, &ovfl); return (__rec_discard_add(session, NULL, ovfl.addr, ovfl.size)); } return (0); } /* * __rec_init -- * Initialize the reconciliation structure. */ static int __rec_init(SESSION *session, uint32_t flags) { CONNECTION *conn; WT_RECONCILE *r; /* Allocate a reconciliation structure if we don't already have one. */ if ((r = S2C(session)->cache->rec) == NULL) { WT_RET(__wt_calloc_def(session, 1, &r)); S2C(session)->cache->rec = r; /* * Allocate memory for a copy of the hazard references -- it's * a fixed size so doesn't need run-time adjustments. */ conn = S2C(session); WT_RET(__wt_calloc_def(session, conn->session_size * conn->hazard_size, &r->hazard)); /* Connect prefix compression pointers/buffers. */ r->full = &r->_full; r->last = &r->_last; } r->evict = LF_ISSET(WT_REC_EVICT); r->locked = LF_ISSET(WT_REC_LOCKED); /* * During internal page reconciliation we track referenced objects that * are discarded when the page is discarded. That tracking is done per * reconciliation run, initialize it before anything else. */ __rec_discard_init(r); /* * During internal page reconciliation we track in-memory subtrees we * find in the page. That tracking is done per reconciliation run, * initialize it before anything else. */ __rec_imref_init(r); return (0); } /* * __wt_rec_destroy -- * Clean up the reconciliation structure. */ void __wt_rec_destroy(SESSION *session) { WT_RECONCILE *r; WT_SPLIT *spl; uint32_t i; if ((r = S2C(session)->cache->rec) == NULL) return; __wt_buf_free(session, &r->dsk); if (r->discard != NULL) __wt_free(session, r->discard); if (r->bnd != NULL) __wt_free(session, r->bnd); if (r->imref != NULL) __wt_free(session, r->imref); if (r->split != NULL) { for (spl = r->split, i = 0; i < r->split_entries; ++spl, ++i) { __wt_buf_free(session, &spl->key); if (spl->imp != NULL) __wt_page_free( session, spl->imp, WT_ADDR_INVALID, 0); } __wt_free(session, r->split); } if (r->hazard != NULL) __wt_free(session, r->hazard); __wt_buf_free(session, &r->k.buf); __wt_buf_free(session, &r->v.buf); __wt_buf_free(session, &r->_full); __wt_buf_free(session, &r->_last); __wt_free(session, r); } /* * __rec_copy_incr -- * Copy a key/value cell and buffer pair onto a page. */ static inline void __rec_copy_incr(SESSION *session, WT_RECONCILE *r, WT_KV *kv) { uint32_t len; uint8_t *p, *t; /* * If there's only one chunk of data to copy (because the cell and data * are being copied from the original disk page), the cell length won't * be set and the WT_BUF data/length will reference the data to be * copied. * * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do * the copy in-line. */ for (p = (uint8_t *)r->first_free, t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len) *p++ = *t++; /* The data can be quite large -- call memcpy. */ if (kv->buf.size != 0) memcpy(p, kv->buf.data, kv->buf.size); WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size); __rec_incr(session, r, kv->len); } /* * __rec_incr -- * Update the memory tracking structure for a new entry.
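* * Every copy advances first_free and shrinks space_avail by the same * amount, so the pair always describes the unwritten remainder of the * current chunk.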
*/ static inline void __rec_incr(SESSION *session, WT_RECONCILE *r, uint32_t size) { /* * The buffer code is fragile and prone to off-by-one errors -- * check for overflow in diagnostic mode. */ WT_ASSERT(session, r->space_avail >= size); WT_ASSERT(session, WT_PTRDIFF32( r->first_free + size, r->dsk.mem) <= r->page_size); ++r->entries; r->space_avail -= size; r->first_free += size; } /* * __rec_allocation_size -- * Return the size rounded up to the minimum number of allocation units * needed (the page size can either grow or shrink), and zero out unused * bytes. */ static inline uint32_t __rec_allocation_size(SESSION *session, WT_BUF *buf, uint8_t *end) { BTREE *btree; uint32_t alloc_len, current_len, write_len; btree = session->btree; current_len = WT_PTRDIFF32(end, buf->mem); alloc_len = WT_ALIGN(current_len, btree->allocsize); write_len = alloc_len - current_len; /* * There are lots of offset calculations going on in this code, make * sure we don't overflow the end of the temporary buffer. */ WT_ASSERT( session, end + write_len <= (uint8_t *)buf->mem + buf->mem_size); if (write_len != 0) memset(end, 0, write_len); return (alloc_len); } /* * __wt_page_reconcile -- * Format an in-memory page to its on-disk format, and write it. */ int __wt_page_reconcile( SESSION *session, WT_PAGE *page, uint32_t slvg_skip, uint32_t flags) { BTREE *btree; WT_CACHE *cache; int ret; btree = session->btree; cache = S2C(session)->cache; ret = 0; WT_VERBOSE(S2C(session), WT_VERB_EVICT, (session, "reconcile %s page addr %lu (type %s)", WT_PAGE_IS_MODIFIED(page) ? "dirty" : "clean", (u_long)WT_PADDR(page), __wt_page_type_string(page->type))); /* * Handle pages marked for deletion or split. * * Check both before checking for clean pages: deleted and split pages * operate outside the rules for normal pages. * * Pages have their WT_PAGE_DELETED flag set if they're reconciled and * are found to have no valid entries. * * Pages have their WT_PAGE_SPLIT flag set if they're created as part * of an internal split. * * For deleted or split pages, there's nothing to do, they are merged * into their parent when their parent is reconciled. */ if (F_ISSET(page, WT_PAGE_DELETED | WT_PAGE_SPLIT)) return (0); /* Initialize the reconciliation structure for each new run. */ WT_RET(__rec_init(session, flags)); /* * Get exclusive access to the page and review the page's subtree: if * evicting, confirm the page is a page we can evict, and get exclusive * access to pages being merged into the reconciled page as part of * reconciliation. * * If the check fails (for example, we find an in-memory page and it's * an eviction attempt), we're done. */ WT_RET(__rec_subtree(session, page)); /* * Clean pages are simple: update the page parent's state and discard * the page. * * Assert the page is being evicted: there's no reason to reconcile a * clean page if not to evict it. */ if (!WT_PAGE_IS_MODIFIED(page)) { WT_ASSERT(session, LF_ISSET(WT_REC_EVICT)); WT_STAT_INCR(cache->stats, cache_evict_unmodified); WT_RET(__rec_discard_add_page(session, page)); __rec_parent_update_clean(session, page); WT_RET(__rec_discard_evict(session)); return (0); } /* * Mark the page clean -- we have exclusive access to any page we're * reconciling, so it doesn't matter when we do it. */ F_CLR(page, WT_PAGE_MODIFIED); WT_STAT_INCR(cache->stats, cache_evict_modified); /* Reconcile the page.
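Each page type has its own helper function; bulk-loaded pages take * separate paths because their contents live on update/insert lists rather * than in a disk image.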
*/ switch (page->type) { case WT_PAGE_COL_FIX: if (F_ISSET(page, WT_PAGE_BULK_LOAD)) WT_RET(__rec_col_fix_bulk(session, page)); else WT_RET(__rec_col_fix(session, page)); break; case WT_PAGE_COL_RLE: if (F_ISSET(page, WT_PAGE_BULK_LOAD)) WT_RET(__rec_col_rle_bulk(session, page)); else WT_RET(__rec_col_rle(session, page)); break; case WT_PAGE_COL_VAR: if (F_ISSET(page, WT_PAGE_BULK_LOAD)) WT_RET(__rec_col_var_bulk(session, page)); else WT_RET(__rec_col_var(session, page)); break; case WT_PAGE_COL_INT: WT_RET(__rec_col_int(session, page)); break; case WT_PAGE_ROW_INT: WT_RET(__rec_row_int(session, page)); break; case WT_PAGE_ROW_LEAF: WT_RET(__rec_row_leaf(session, page, slvg_skip)); break; WT_ILLEGAL_FORMAT(session); } /* * Resolve the WT_RECONCILE information, update the parent, and evict * the page if requested and possible. */ WT_RET(__rec_wrapup(session, page)); /* * Newly created internal pages are normally merged into their parents * when said parent is reconciled. Newly split root pages can't be * merged (as they have no parent), the new root page must be written. * * We detect root splits when the root page is flagged as a split. We * do the check at the top level because I'd rather the reconciliation * code not handle two pages at once, and we've just finished with the * original page. * * Reconcile the new root page explicitly rather than waiting for a * natural reconcile, because root splits result from walking the tree * during a sync or close call, and the new root page is the one page * that won't be visited as part of that walk. */ if (btree->root_page.state == WT_REF_MEM && F_ISSET(btree->root_page.page, WT_PAGE_SPLIT)) { F_CLR(btree->root_page.page, WT_PAGE_SPLIT); F_SET(btree->root_page.page, WT_PAGE_PINNED); WT_PAGE_SET_MODIFIED(btree->root_page.page); ret = __wt_page_reconcile( session, btree->root_page.page, 0, flags); } return (ret); } /* * __rec_subtree -- * Get exclusive access to a subtree for reconciliation. */ static int __rec_subtree(SESSION *session, WT_PAGE *page) { WT_RECONCILE *r; int ret; r = S2C(session)->cache->rec; ret = 0; /* * Attempt exclusive access to the page if our caller doesn't have the * tree locked down. */ if (!r->locked) WT_RET(__hazard_exclusive(session, page->parent_ref)); /* * Walk the page's subtree, and make sure we can reconcile this page. * * When reconciling a page, it may reference deleted or split pages * which will be merged into the reconciled page. * * If we find an in-memory page, check if we're trying to evict the * page; that's not OK, you can't evict a page that references other * in-memory pages, those pages have to be evicted first. While the * test is necessary, it shouldn't happen a lot: every reference to * an internal page increments its read generation and so internal * pages shouldn't be selected for eviction until their children have * been evicted. * * If we find a split page, get exclusive access to the page and then * continue, the split page will be merged into our page. * * If we find a deleted page, get exclusive access to the page and then * check its status. If still deleted, we can continue, the page will * be merged into our page. However, another thread of control might * have inserted new material and the page is no longer deleted, which * means the reconciliation fails. (We could allow sync to proceed, * but the sync is going to be inconsistent by definition, why bother?) * * If reconciliation isn't going to be possible, we have to clear any * pages we locked while we were looking. 
(I'm re-walking the tree * rather than keeping track of the locked pages on purpose -- the * only time this should ever fail is if eviction's LRU-based choice * is very unlucky.) * * Finally, we do some additional cleanup work during this pass: first, * add any deleted or split pages to our list of pages to discard when * we discard the page being reconciled; second, review deleted pages * for overflow items and schedule them for deletion as well. */ switch (page->type) { case WT_PAGE_COL_INT: if ((ret = __rec_subtree_col(session, page)) != 0) if (!r->locked) __rec_subtree_col_clear(session, page); break; case WT_PAGE_ROW_INT: if ((ret = __rec_subtree_row(session, page)) != 0) if (!r->locked) __rec_subtree_row_clear(session, page); break; default: break; } /* * If we're not going to reconcile this page, release our exclusive * reference. */ if (ret != 0 && !r->locked) page->parent_ref->state = WT_REF_MEM; return (ret); } /* * __rec_subtree_col -- * Walk a column-store internal page's subtree, handling deleted and split * pages. */ static int __rec_subtree_col(SESSION *session, WT_PAGE *parent) { WT_COL_REF *cref; WT_PAGE *page; WT_RECONCILE *r; WT_REF *parent_ref; uint32_t i; r = S2C(session)->cache->rec; /* For each entry in the page... */ WT_COL_REF_FOREACH(parent, cref, i) { switch (WT_COL_REF_STATE(cref)) { case WT_REF_DISK: /* On-disk */ continue; case WT_REF_LOCKED: /* Eviction candidate */ WT_ASSERT( session, WT_COL_REF_STATE(cref) != WT_REF_LOCKED); return (1); case WT_REF_MEM: /* In-memory */ break; } page = WT_COL_REF_PAGE(cref); /* * We found an in-memory page: if the page won't be merged into * its parent, then we can't evict the top-level page. This is * not a problem, it just means we chose badly when selecting a * page for eviction. If we're not evicting, sync is flushing * dirty pages: ignore pages that aren't deleted or split. */ if (!F_ISSET(page, WT_PAGE_DELETED | WT_PAGE_SPLIT)) { if (r->evict) return (1); continue; } /* * Attempt exclusive access to the page if our caller doesn't * have the tree locked down. */ if (!r->locked) WT_RET(__hazard_exclusive(session, &cref->ref)); /* * Split pages are always merged into the parent regardless of * their contents, but a deleted page might have changed state * while we waited for exclusive access. In other words, we * had exclusive access to the parent, but another thread had * a hazard reference to the deleted page in the parent's tree: * while we waited, that thread inserted new material, and the * deleted page became an in-memory page we can't merge; it has * to be reconciled on its own. * * If this happens, we fail the reconciliation: in the case of * eviction, we don't evict this page. In the case of sync, * this means sync was running while the tree was active; we * don't care. In the case of close, this implies a problem: * close is not allowed in active trees. */ if (!F_ISSET(page, WT_PAGE_DELETED | WT_PAGE_SPLIT)) return (1); /* * Deleted and split pages are discarded when reconciliation * completes. */ WT_RET(__rec_discard_add_page(session, page)); parent_ref = page->parent_ref; if (parent_ref->addr != WT_ADDR_INVALID) WT_RET(__rec_discard_add_block( session, parent_ref->addr, parent_ref->size)); /* * Overflow items on deleted pages are also discarded when * reconciliation completes.
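* (As with pages and blocks, the discard is deferred through the discard * list, so a failed reconciliation leaves the tree intact.)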
*/ if (F_ISSET(page, WT_PAGE_DELETED)) switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_RLE: break; case WT_PAGE_COL_VAR: WT_RET(__rec_ovfl_delete(session, page)); break; WT_ILLEGAL_FORMAT(session); } /* Recurse down the tree. */ if (page->type == WT_PAGE_COL_INT) WT_RET(__rec_subtree_col(session, page)); } return (0); } /* * __rec_subtree_col_clear -- * Clear any pages for which we have exclusive access -- reconciliation * isn't possible. */ static void __rec_subtree_col_clear(SESSION *session, WT_PAGE *parent) { WT_COL_REF *cref; WT_PAGE *page; uint32_t i; /* For each entry in the page... */ WT_COL_REF_FOREACH(parent, cref, i) if (WT_COL_REF_STATE(cref) == WT_REF_LOCKED) { WT_COL_REF_STATE(cref) = WT_REF_MEM; /* Recurse down the tree. */ page = WT_COL_REF_PAGE(cref); if (page->type == WT_PAGE_COL_INT) __rec_subtree_col_clear(session, page); } } /* * __rec_subtree_row -- * Walk a row-store internal page's subtree, handling deleted and split * pages. */ static int __rec_subtree_row(SESSION *session, WT_PAGE *parent) { WT_PAGE *page; WT_RECONCILE *r; WT_REF *parent_ref; WT_ROW_REF *rref; uint32_t i; r = S2C(session)->cache->rec; /* For each entry in the page... */ WT_ROW_REF_FOREACH(parent, rref, i) { switch (WT_ROW_REF_STATE(rref)) { case WT_REF_DISK: /* On-disk */ continue; case WT_REF_LOCKED: /* Eviction candidate */ WT_ASSERT( session, WT_ROW_REF_STATE(rref) != WT_REF_LOCKED); return (1); case WT_REF_MEM: /* In-memory */ break; } page = WT_ROW_REF_PAGE(rref); /* * We found an in-memory page: if the page won't be merged into * its parent, then we can't evict the top-level page. This is * not a problem, it just means we chose badly when selecting a * page for eviction. If we're not evicting, sync is flushing * dirty pages: ignore pages that aren't deleted or split. */ if (!F_ISSET(page, WT_PAGE_DELETED | WT_PAGE_SPLIT)) { if (r->evict) return (1); continue; } /* * Attempt exclusive access to the page if our caller doesn't * have the tree locked down. */ if (!r->locked) WT_RET(__hazard_exclusive(session, &rref->ref)); /* * Split pages are always merged into the parent regardless of * their contents, but a deleted page might have changed state * while we waited for exclusive access. In other words, we * had exclusive access to the parent, but another thread had * a hazard reference to the deleted page in the parent's tree: * while we waited, that thread inserted new material, and the * deleted page became an in-memory page we can't merge; it has * to be reconciled on its own. * * If this happens, we fail the reconciliation: in the case of * eviction, we don't evict this page. In the case of sync, * this means sync was running while the tree was active; we * don't care. In the case of close, this implies a problem: * close is not allowed in active trees. */ if (!F_ISSET(page, WT_PAGE_DELETED | WT_PAGE_SPLIT)) return (1); /* * Deleted and split pages are discarded when reconciliation * completes. */ WT_RET(__rec_discard_add_page(session, page)); parent_ref = page->parent_ref; if (parent_ref->addr != WT_ADDR_INVALID) WT_RET(__rec_discard_add_block( session, parent_ref->addr, parent_ref->size)); /* * Overflow items on deleted pages are also discarded when * reconciliation completes. */ if (F_ISSET(page, WT_PAGE_DELETED)) switch (page->type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__rec_ovfl_delete(session, page)); break; WT_ILLEGAL_FORMAT(session); } /* Recurse down the tree.
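Merged subtrees may themselves contain deleted or split pages, and we * need exclusive access to all of them.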
*/ if (page->type == WT_PAGE_ROW_INT) WT_RET(__rec_subtree_row(session, page)); } return (0); } /* * __rec_subtree_row_clear -- * Clear any pages for which we have exclusive access -- reconciliation * isn't possible. */ static void __rec_subtree_row_clear(SESSION *session, WT_PAGE *parent) { WT_PAGE *page; WT_ROW_REF *rref; uint32_t i; /* For each entry in the page... */ WT_ROW_REF_FOREACH(parent, rref, i) if (WT_ROW_REF_STATE(rref) == WT_REF_LOCKED) { WT_ROW_REF_STATE(rref) = WT_REF_MEM; /* Recurse down the tree. */ page = WT_ROW_REF_PAGE(rref); if (page->type == WT_PAGE_ROW_INT) __rec_subtree_row_clear(session, page); } } /* * __rec_ovfl_delete -- * Walk the cells of a deleted disk page and schedule any overflow items * for discard. */ static int __rec_ovfl_delete(SESSION *session, WT_PAGE *page) { WT_CELL *cell; WT_PAGE_DISK *dsk; uint32_t i; dsk = page->dsk; /* * We're deleting the page, which means any overflow item we ever had * is deleted as well. */ WT_CELL_FOREACH(dsk, cell, i) WT_RET(__rec_discard_add_ovfl(session, cell)); return (0); } /* * __rec_split_init -- * Initialization for the reconciliation split functions. */ static int __rec_split_init( SESSION *session, WT_PAGE *page, uint64_t recno, uint32_t max, uint32_t min) { BTREE *btree; WT_PAGE_DISK *dsk; WT_RECONCILE *r; r = S2C(session)->cache->rec; btree = session->btree; /* Ensure the scratch buffer is large enough. */ WT_RET(__wt_buf_initsize(session, &r->dsk, (size_t)max)); /* * Some fields of the disk image are fixed based on the original page, * set them. */ dsk = r->dsk.mem; WT_CLEAR(*dsk); dsk->type = page->type; dsk->recno = recno; /* * If we have to split, we want to choose a smaller page size for the * split pages, because otherwise we could end up splitting one large * packed page over and over. We don't want to pick the minimum size * either, because that penalizes an application that did a bulk load * and subsequently inserted a few items into packed pages. Currently, * I'm using 75%, but I have no empirical evidence that's a good value. * We should leave this as a tuning variable, but probably undocumented. * * The maximum page size may be a multiple of the split page size (for * example, there's a maximum page size of 128KB, but because the table * is active and we don't want to split a lot, the split size is 20KB). * The maximum page size may NOT be an exact multiple of the split page * size. * * It's lots of work to build these pages and we don't want to start * over when we reach the maximum page size (it's painful to restart * after creating overflow items and compacted data, not to mention * deleting overflow items from the underlying pages). So, the loop * calls the helper functions when approaching a split boundary, and we * save the information at that point. That allows us to go back and * split the page at the boundary points if we eventually overflow the * maximum page size. */ r->page_size = max; r->split_size = WT_ALIGN((max / 4) * 3, btree->allocsize); #ifdef HAVE_DIAGNOSTIC /* * This won't get tested enough if we don't force the code to create * lots of splits. */ r->split_size = min; #else WT_UNUSED(min); #endif /* * If the maximum page size is the same as the split page size, there * is no need to maintain split boundaries within a larger page. */ r->state = max == r->split_size ? SPLIT_TRACKING_OFF : SPLIT_BOUNDARY; /* * Initialize the arrays of saved and written page entries to * reference the first slot. */ r->split_next = r->bnd_next = 0; /* Initialize the total entries in split chunks.
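Each boundary's entry count is later computed as the difference between * the running entry count and this total; see __rec_split.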
*/ r->total_entries = 0; /* * Set the caller's information and configure so the loop calls us * when approaching the split boundary. */ r->recno = recno; r->entries = 0; r->first_free = WT_PAGE_DISK_BYTE(dsk); r->space_avail = r->split_size - WT_PAGE_DISK_SIZE; return (0); } /* * __rec_split -- * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper" * has 3 doubled letters in a row? Sweet-tooth does, too.) */ static int __rec_split(SESSION *session) { WT_BOUNDARY *bnd; WT_PAGE_DISK *dsk; WT_RECONCILE *r; uint32_t current_len; /* * Handle page-buffer size tracking; we have to do this work in every * reconciliation loop, and I don't want to repeat the code that many * times. */ r = S2C(session)->cache->rec; dsk = r->dsk.mem; /* * There are 3 cases we have to handle. * * #1 * Not done, and about to cross a split boundary, in which case we save * away the current boundary information and return. * * #2 * Not done, and about to cross the max boundary, in which case we have * to physically split the page -- use the saved split information to * write all the split pages. * * #3 * Not done, and about to cross the split boundary, but we've already * done the split thing when we approached the max boundary, in which * case we write the page and keep going. * * Cases #1 and #2 are the hard ones: we're called when we're about to * cross each split boundary, and we save information away so we can * split if we have to. We're also called when we're about to cross * the maximum page boundary: in that case, we do the actual split, * clean things up, then keep going. */ switch (r->state) { case SPLIT_BOUNDARY: /* Case #1 */ /* * Make sure there's enough room in which to save the slot. * * The calculation is actually +1, because we save the start * point one past the current entry -- make it +20 so we don't * grow slot-by-slot. */ if (r->bnd_next + 1 >= r->bnd_entries) { WT_RET(__wt_realloc(session, &r->bnd_allocated, (r->bnd_entries + 20) * sizeof(*r->bnd), &r->bnd)); r->bnd_entries += 20; } /* * Save the information about where we are when the split would * have happened. * * The first time through, set the starting record number and * buffer address for the first slot from well-known values. */ if (r->bnd_next == 0) { r->bnd[0].recno = dsk->recno; r->bnd[0].start = WT_PAGE_DISK_BYTE(dsk); } /* Set the number of entries for the just finished chunk. */ bnd = &r->bnd[r->bnd_next++]; bnd->entries = r->entries - r->total_entries; r->total_entries = r->entries; /* * Set the starting record number and buffer address for the * next chunk, clear the entries. */ ++bnd; bnd->recno = r->recno; bnd->start = r->first_free; bnd->entries = 0; /* * Set the space available to another split-size chunk, if we * have one. */ current_len = WT_PTRDIFF32(r->first_free, dsk); if (current_len + r->split_size <= r->page_size) { r->space_avail = r->split_size - WT_PAGE_DISK_SIZE; break; } /* * We don't have room for another split chunk. Add whatever * space remains in the maximum page size, hopefully it will * be enough. */ r->state = SPLIT_MAX; r->space_avail = (r->page_size - WT_PAGE_DISK_SIZE) - current_len; break; case SPLIT_MAX: /* Case #2 */ /* * It didn't all fit, but we just noticed that. * * Cycle through the saved split-point information, writing the * split chunks we have tracked. */ WT_RET(__rec_split_fixup(session)); /* We're done saving split chunks. 
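From here on we simply write out split-size chunks as they fill.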
*/ r->state = SPLIT_TRACKING_OFF; break; case SPLIT_TRACKING_OFF: /* Case #3 */ /* * It didn't all fit, but either we've already noticed it and * are now processing the rest of the page at the split-size * boundaries, or, the split size was the same as the page size, * so we never bothered with saving split-point information. * * Write the current disk image. */ dsk->u.entries = r->entries; WT_RET(__rec_split_write(session, &r->dsk, r->first_free)); /* * Set the starting record number and buffer information for the * next chunk; we only get here if we've never split, or have * already split, so we're using split-size chunks from here on * out. */ dsk->recno = r->recno; r->entries = 0; r->first_free = WT_PAGE_DISK_BYTE(dsk); r->space_avail = r->split_size - WT_PAGE_DISK_SIZE; break; } return (0); } /* * __rec_split_finish -- * Wrap up reconciliation. */ static int __rec_split_finish(SESSION *session) { WT_PAGE_DISK *dsk; WT_RECONCILE *r; r = S2C(session)->cache->rec; /* * We're done reconciling a page; write anything we have accumulated * in the buffer at this point. */ if (r->entries == 0) return (0); dsk = r->dsk.mem; dsk->u.entries = r->entries; return (__rec_split_write(session, &r->dsk, r->first_free)); } /* * __rec_split_fixup -- * Physically split the already-written page data. */ static int __rec_split_fixup(SESSION *session) { WT_BOUNDARY *bnd; WT_BUF *tmp; WT_PAGE_DISK *dsk; WT_RECONCILE *r; uint32_t i, len; uint8_t *dsk_start; int ret; /* * When we overflow physical limits of the page, we walk the list of * split chunks we've created and write those pages out, then update * the caller's information. */ r = S2C(session)->cache->rec; ret = 0; /* * The data isn't laid out on page-boundaries or nul-byte padded; copy * it into a clean buffer before writing it. * * Allocate a scratch buffer to hold the new disk image. Copy the * WT_PAGE_DISK header onto the scratch buffer, most of the header * information remains unchanged between the pages. */ WT_RET(__wt_scr_alloc(session, r->split_size, &tmp)); memcpy(tmp->mem, r->dsk.mem, WT_PAGE_DISK_SIZE); /* * For each split chunk we've created, update the disk image and copy * it into place. */ dsk = tmp->mem; dsk_start = WT_PAGE_DISK_BYTE(dsk); for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { /* Set the starting record number and number of entries. */ dsk->recno = bnd->recno; dsk->u.entries = bnd->entries; /* Copy out the page contents, and write it. */ len = WT_PTRDIFF32((bnd + 1)->start, bnd->start); memcpy(dsk_start, bnd->start, len); WT_ERR(__rec_split_write(session, tmp, dsk_start + len)); } /* * There is probably a remnant in the working buffer that didn't get * written; copy it down to the beginning of the working buffer, and * update the starting record number. * * Confirm the remnant is no larger than the available split buffer. * * Fix up our caller's information. */ len = WT_PTRDIFF32(r->first_free, bnd->start); WT_ASSERT(session, len < r->split_size - WT_PAGE_DISK_SIZE); dsk = r->dsk.mem; dsk->recno = bnd->recno; dsk_start = WT_PAGE_DISK_BYTE(dsk); (void)memmove(dsk_start, bnd->start, len); r->entries -= r->total_entries; r->first_free = dsk_start + len; r->space_avail = (r->split_size - WT_PAGE_DISK_SIZE) - len; err: if (tmp != NULL) __wt_scr_release(&tmp); return (ret); } /* * __rec_split_write -- * Write a disk block out for the split helper functions.
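* * Each call allocates file space, writes the block, and saves the block's * addr/size pair (plus, for row-store pages, the chunk's first key) so the * parent's new references can be built when reconciliation wraps up.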
*/ static int __rec_split_write(SESSION *session, WT_BUF *buf, void *end) { WT_CELL *cell; WT_PAGE *page; WT_PAGE_DISK *dsk; WT_RECONCILE *r; WT_SPLIT *spl; uint32_t addr, size; int ret; r = S2C(session)->cache->rec; dsk = NULL; /* * Set the disk block size and clear trailing bytes. * Allocate file space. * Write the disk block. */ size = __rec_allocation_size(session, buf, end); WT_RET(__wt_block_alloc(session, &addr, size)); WT_RET(__wt_disk_write(session, buf->mem, addr, size)); /* Save the key and addr/size pairs to update the parent. */ if (r->split_next == r->split_entries) { WT_RET(__wt_realloc(session, &r->split_allocated, (r->split_entries + 20) * sizeof(*r->split), &r->split)); r->split_entries += 20; } spl = &r->split[r->split_next++]; spl->off.addr = addr; spl->off.size = size; /* * For a column-store, the key is the recno, for a row-store, it's the * first key on the page, a variable-length byte string. */ dsk = buf->mem; switch (dsk->type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: /* * The cell had better have a zero-length prefix: it's the first * key on the page. (If it doesn't have a zero-length prefix, * __wt_cell_copy() won't be sufficient anyway, we'd only copy * the non-prefix-compressed portion of the key.) */ cell = WT_PAGE_DISK_BYTE(dsk); WT_ASSERT(session, __wt_cell_type(cell) == WT_CELL_KEY_OVFL || __wt_cell_prefix(cell) == 0); WT_RET(__wt_cell_copy(session, cell, &spl->key)); break; case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_RLE: case WT_PAGE_COL_VAR: WT_RECNO(&spl->off) = dsk->recno; break; } /* If we're evicting the page, there's no more work to do. */ if (r->evict) { spl->imp = NULL; return (0); } /* * If we're not evicting the page, we'll need a new in-memory version: * create an in-memory page and take any list of in-memory references. */ WT_RET(__wt_calloc(session, (size_t)size, sizeof(uint8_t), &dsk)); memcpy(dsk, buf->mem, size); if ((ret = __wt_page_inmem(session, NULL, NULL, dsk, &page)) != 0) { __wt_free(session, dsk); return (ret); } spl->imp = page; return (0); } /* * __rec_col_int -- * Reconcile a column-store internal page. */ static int __rec_col_int(SESSION *session, WT_PAGE *page) { WT_RET(__rec_split_init(session, page, page->u.col_int.recno, session->btree->intlmax, session->btree->intlmin)); /* * Walking the row-store internal pages is complicated by the fact that * we're taking keys from the underlying disk image for the top-level * page and we're taking keys from in-memory structures for merge pages. * Column-store is simpler because the only information we copy is the * WT_OFF_RECORD structure, and it comes from in-memory structures in * both the top-level and merge cases. In short, both the top-level * and merge page walks look the same, and we just call the merge page * function on the top-level page. */ WT_RET(__rec_col_merge(session, page)); /* Write the remnant page. */ return (__rec_split_finish(session)); } /* * __rec_col_merge -- * Recursively walk a column-store internal tree of merge pages. */ static int __rec_col_merge(SESSION *session, WT_PAGE *page) { WT_COL_REF *cref; WT_OFF_RECORD off; WT_PAGE *rp; WT_RECONCILE *r; uint32_t i; r = S2C(session)->cache->rec; /* For each entry in the page... */ WT_COL_REF_FOREACH(page, cref, i) { /* Update the starting record number in case we split. */ r->recno = cref->recno; /* * The page may be deleted or internally created during a split. * Deleted/split pages are merged into the parent and discarded. * * !!!
* Column-store formats don't support deleted pages; they can * shrink, but deleting a page would remove part of the record * count name space. This code is here for if/when they support * deletes, but for now it's not OK. */ if (WT_COL_REF_STATE(cref) != WT_REF_DISK) { rp = WT_COL_REF_PAGE(cref); if (F_ISSET(rp, WT_PAGE_DELETED | WT_PAGE_SPLIT)) { WT_ASSERT( session, !F_ISSET(rp, WT_PAGE_DELETED)); if (F_ISSET(rp, WT_PAGE_SPLIT)) WT_RET(__rec_col_merge(session, rp)); continue; } /* Save any in-memory subtree reference. */ WT_RET(__rec_imref_add(session, &cref->ref)); } /* Boundary: split or write the page. */ while (sizeof(WT_OFF_RECORD) > r->space_avail) WT_RET(__rec_split(session)); /* * Copy a new WT_OFF_RECORD structure onto the page; any * off-page reference must be a valid disk address. */ WT_ASSERT(session, WT_COL_REF_ADDR(cref) != WT_ADDR_INVALID); off.addr = WT_COL_REF_ADDR(cref); off.size = WT_COL_REF_SIZE(cref); WT_RECNO(&off) = cref->recno; memcpy(r->first_free, &off, sizeof(WT_OFF_RECORD)); __rec_incr(session, r, WT_SIZEOF32(WT_OFF_RECORD)); } return (0); } /* * __rec_col_fix -- * Reconcile a fixed-width, column-store leaf page (does not handle * run-length encoding). */ static int __rec_col_fix(SESSION *session, WT_PAGE *page) { BTREE *btree; WT_BUF *tmp; WT_COL *cip; WT_RECONCILE *r; WT_UPDATE *upd; uint32_t i, len; uint8_t *data; int ret; void *cipdata; r = S2C(session)->cache->rec; btree = session->btree; tmp = NULL; ret = 0; /* * We need a "deleted" data item to store on the page; get a scratch * buffer, clear the contents and set the delete flag. */ len = btree->fixed_len; WT_ERR(__wt_scr_alloc(session, len, &tmp)); memset(tmp->mem, 0, len); WT_FIX_DELETE_SET(tmp->mem); /* * Fixed-size pages can't split; use the underlying helper functions * because they don't add much overhead, and it's better if all the * reconciliation functions look the same. * * Use the current page size as our min/max page size, we know exactly * how much space we need. */ WT_ERR(__rec_split_init(session, page, page->u.col_leaf.recno, page->parent_ref->size, page->parent_ref->size)); /* For each entry in the in-memory page... */ WT_COL_FOREACH(page, cip, i) { /* * Get a reference to the data, on- or off- page, and see if * it's been deleted. */ if ((upd = WT_COL_UPDATE(page, cip)) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) data = tmp->mem; /* Deleted */ else /* Updated */ data = WT_UPDATE_DATA(upd); } else { cipdata = WT_COL_PTR(page, cip); if (WT_FIX_DELETE_ISSET(cipdata)) data = tmp->mem; /* On-disk deleted */ else data = cipdata; /* On-disk data */ } /* * When reconciling a fixed-width page that doesn't support * run-length encoding, the on-page information can't change * size -- there's no reason to ever split such a page. */ memcpy(r->first_free, data, len); __rec_incr(session, r, len); } /* Write the remnant page. */ ret = __rec_split_finish(session); err: if (tmp != NULL) __wt_scr_release(&tmp); return (ret); } /* * __rec_col_fix_bulk -- * Reconcile a bulk-loaded, fixed-width column-store leaf page (does not * handle run-length encoding). */ static int __rec_col_fix_bulk(SESSION *session, WT_PAGE *page) { BTREE *btree; WT_RECONCILE *r; WT_UPDATE *upd; uint32_t len; r = S2C(session)->cache->rec; btree = session->btree; len = btree->fixed_len; WT_RET(__rec_split_init(session, page, page->u.bulk.recno, session->btree->leafmax, session->btree->leafmin)); /* For each entry in the update list... */ for (upd = page->u.bulk.upd; upd != NULL; upd = upd->next) { /* Boundary: split or write the page. 
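Entries are a fixed len bytes each, so the test is simple: if the next * entry doesn't fit, __rec_split either saves a boundary or writes out a * chunk.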
*/ while (len > r->space_avail) WT_RET(__rec_split(session)); memcpy(r->first_free, WT_UPDATE_DATA(upd), len); __rec_incr(session, r, len); /* Update the starting record number in case we split. */ ++r->recno; } /* Write the remnant page. */ return (__rec_split_finish(session)); } /* * __rec_col_rle -- * Reconcile a fixed-width, run-length encoded, column-store leaf page. */ static int __rec_col_rle(SESSION *session, WT_PAGE *page) { BTREE *btree; WT_BUF *tmp; WT_COL *cip; WT_INSERT *ins; WT_RECONCILE *r; uint32_t i, len; uint16_t n, nrepeat, repeat_count; uint8_t *data, *last_data; int ret; void *cipdata; r = S2C(session)->cache->rec; btree = session->btree; tmp = NULL; last_data = NULL; ret = 0; /* * We need a "deleted" data item to store on the page. Make sure the * SESSION's scratch buffer is big enough. Clear the buffer's contents * and set the delete flag. */ len = btree->fixed_len; WT_ERR(__wt_scr_alloc(session, len, &tmp)); memset(tmp->mem, 0, len); WT_FIX_DELETE_SET(tmp->mem); WT_RET(__rec_split_init(session, page, page->u.col_leaf.recno, session->btree->leafmax, session->btree->leafmin)); /* For each entry in the in-memory page... */ WT_COL_FOREACH(page, cip, i) { cipdata = WT_COL_PTR(page, cip); /* * Generate entries for the new page: loop through the repeat * records, checking for WT_INSERT entries matching the record * number. * * Note the increment of recno in the for loop to update the * starting record number in case we split. */ ins = WT_COL_INSERT(page, cip), nrepeat = WT_RLE_REPEAT_COUNT(cipdata); for (n = 0; n < nrepeat; n += repeat_count, r->recno += repeat_count) { if (ins != NULL && WT_INSERT_RECNO(ins) == r->recno) { /* Use the WT_INSERT's WT_UPDATE field. */ if (WT_UPDATE_DELETED_ISSET(ins->upd)) data = tmp->mem; else data = WT_UPDATE_DATA(ins->upd); repeat_count = 1; ins = ins->next; } else { if (WT_FIX_DELETE_ISSET(cipdata)) data = tmp->mem; else data = WT_RLE_REPEAT_DATA(cipdata); /* * The repeat count is the number of records * up to the next WT_INSERT record, or up to * the end of this entry if we have no more * WT_INSERT records. */ if (ins == NULL) repeat_count = nrepeat - n; else repeat_count = (uint16_t) (WT_INSERT_RECNO(ins) - r->recno); } /* * In all cases, check the last entry written on the * page to see if it's identical, and increment its * repeat count where possible. */ if (last_data != NULL && memcmp( WT_RLE_REPEAT_DATA(last_data), data, len) == 0 && WT_RLE_REPEAT_COUNT(last_data) < UINT16_MAX) { WT_RLE_REPEAT_COUNT(last_data) += repeat_count; continue; } /* Boundary: split or write the page. */ while (len + WT_SIZEOF32(uint16_t) > r->space_avail) WT_ERR(__rec_split(session)); last_data = r->first_free; WT_RLE_REPEAT_COUNT(last_data) = repeat_count; memcpy(WT_RLE_REPEAT_DATA(last_data), data, len); __rec_incr(session, r, len + WT_SIZEOF32(uint16_t)); } } /* Write the remnant page. */ ret = __rec_split_finish(session); err: if (tmp != NULL) __wt_scr_release(&tmp); return (ret); } /* * __rec_col_rle_bulk -- * Reconcile a bulk-loaded, fixed-width, run-length encoded, column-store * leaf page. */ static int __rec_col_rle_bulk(SESSION *session, WT_PAGE *page) { BTREE *btree; WT_RECONCILE *r; WT_UPDATE *upd; uint32_t len; uint8_t *data, *last_data; r = S2C(session)->cache->rec; btree = session->btree; len = btree->fixed_len; last_data = NULL; WT_RET(__rec_split_init(session, page, page->u.bulk.recno, session->btree->leafmax, session->btree->leafmin)); /* For each entry in the update list... 
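Bulk-loaded pages have no disk image; their contents are a chain of * WT_UPDATE structures.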
*/ for (upd = page->u.bulk.upd; upd != NULL; upd = upd->next, ++r->recno) { data = WT_UPDATE_DATA(upd); /* * In all cases, check the last entry written on the page to * see if it's identical, and increment its repeat count where * possible. */ if (last_data != NULL && memcmp( WT_RLE_REPEAT_DATA(last_data), data, len) == 0 && WT_RLE_REPEAT_COUNT(last_data) < UINT16_MAX) { ++WT_RLE_REPEAT_COUNT(last_data); continue; } /* Boundary: split or write the page. */ while (len + WT_SIZEOF32(uint16_t) > r->space_avail) WT_RET(__rec_split(session)); last_data = r->first_free; WT_RLE_REPEAT_COUNT(last_data) = 1; memcpy(WT_RLE_REPEAT_DATA(last_data), data, len); __rec_incr(session, r, len + WT_SIZEOF32(uint16_t)); } /* Write the remnant page. */ return (__rec_split_finish(session)); } /* * __rec_col_var -- * Reconcile a variable-width column-store leaf page. */ static int __rec_col_var(SESSION *session, WT_PAGE *page) { WT_CELL *cell; WT_COL *cip; WT_KV *val; WT_RECONCILE *r; WT_UPDATE *upd; uint32_t i; r = S2C(session)->cache->rec; val = &r->v; WT_RET(__rec_split_init(session, page, page->u.col_leaf.recno, session->btree->leafmax, session->btree->leafmin)); /* For each entry in the in-memory page... */ WT_COL_FOREACH(page, cip, i) { /* * Get a reference to the value: it's either an update or the * original on-page item. */ cell = WT_COL_PTR(page, cip); if ((upd = WT_COL_UPDATE(page, cip)) == NULL) { val->buf.data = cell; val->buf.size = __wt_cell_len(cell); val->cell_len = 0; val->len = val->buf.size; } else { /* * If we update an overflow value, free the underlying * file space. */ WT_RET(__rec_discard_add_ovfl(session, cell)); /* * Check for deletion, else build the value's WT_CELL * chunk from the most recent update value. */ if (WT_UPDATE_DELETED_ISSET(upd)) { __wt_cell_set_fixed( &val->cell, WT_CELL_DEL, &val->cell_len); val->buf.size = 0; val->len = val->cell_len; } else WT_RET(__rec_cell_build_val( session, WT_UPDATE_DATA(upd), upd->size)); } /* Boundary: split or write the page. */ while (val->len > r->space_avail) WT_RET(__rec_split(session)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); /* Update the starting record number in case we split. */ ++r->recno; } /* Write the remnant page. */ return (__rec_split_finish(session)); } /* * __rec_col_var_bulk -- * Reconcile a bulk-loaded, variable-width column-store leaf page. */ static int __rec_col_var_bulk(SESSION *session, WT_PAGE *page) { WT_KV *val; WT_RECONCILE *r; WT_UPDATE *upd; r = S2C(session)->cache->rec; val = &r->v; WT_RET(__rec_split_init(session, page, page->u.bulk.recno, session->btree->leafmax, session->btree->leafmin)); /* For each entry in the update list... */ for (upd = page->u.bulk.upd; upd != NULL; upd = upd->next) { WT_RET(__rec_cell_build_val( session, WT_UPDATE_DATA(upd), upd->size)); /* Boundary: split or write the page. */ while (val->len > r->space_avail) WT_RET(__rec_split(session)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); /* Update the starting record number in case we split. */ ++r->recno; } /* Write the remnant page. */ return (__rec_split_finish(session)); } /* * __rec_row_int -- * Reconcile a row-store internal page. */ static int __rec_row_int(SESSION *session, WT_PAGE *page) { WT_CELL *cell; WT_KV *key, *val; WT_PAGE *rp; WT_RECONCILE *r; WT_ROW_REF *rref; uint32_t i; int ovfl_key; r = S2C(session)->cache->rec; key = &r->k; val = &r->v; r->key_prefix_compress = 0; /* New page, compression off. 
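The first key on each new page is stored in full, giving later * prefix-compressed keys a complete key to roll forward from.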
*/ WT_RET(__rec_split_init(session, page, 0ULL, session->btree->intlmax, session->btree->intlmin)); /* * Ideally, we'd never store the 0th key on row-store internal pages * because it's never used during tree search and there's no reason * to waste the space. The problem is how we do splits: when we split, * we've potentially picked out several "split points" in the buffer * which is overflowing the maximum page size, and when the overflow * happens, we go back and physically split the buffer, at those split * points, into new pages. It would be both difficult and expensive * to re-process the 0th key at each split point to be an empty key, * so we don't do that. However, we are reconciling an internal page * for whatever reason, and the 0th key is known to be useless. Set * the 0th key to something small. (We do NOT put a zero-length key * in the tree: zero-length keys are keys requiring further processing, * not keys we are going to ignore.) */ r->cell_zero = 1; /* * The value cells all look the same -- we can set it up once and then * just reset the addr/size pairs we're writing after the cell. */ __wt_cell_set_fixed(&val->cell, WT_CELL_OFF, &val->cell_len); val->buf.data = &val->off; val->buf.size = WT_SIZEOF32(WT_OFF); val->len = val->cell_len + WT_SIZEOF32(WT_OFF); /* * There are two kinds of row-store internal pages we reconcile: the * first is a page created entirely in-memory, in which case there's * no underlying disk image. The second is a page read from disk, * in which case we can take the keys from the underlying disk image. * * Internal pages created in-memory are always merged into their parent * in order to keep the tree from growing deeper on every split. For * that reason, reconciliation of those pages consists of updating the * page state and returning, as none of the real work of reconciliation * is done until the parent page into which the created pages will be * merged is itself reconciled. In other words, we ignore internally * created pages until that parent is reconciled, at which time we walk * the subtree rooted in that parent and consolidate the merged pages. * * There is a special case: if the root splits, there's no parent into * which it can be merged, so the reconciliation code turns off the * merge flag, and reconciles the page anyway. In that case we end up * here, with no disk image. This code is here to handle that specific * case. */ if (page->dsk == NULL) { WT_RET(__rec_row_merge(session, page)); return (__rec_split_finish(session)); } /* * We have to walk both the WT_ROW structures and the original page -- * see the comment at WT_ROW_REF_AND_KEY_FOREACH for details. * * For each entry in the in-memory page... */ WT_ROW_REF_AND_KEY_FOREACH(page, rref, cell, i) { /* * The page may be deleted or internally created during a split. * Deleted/split pages are merged into the parent and discarded. * * There's one special case we have to handle here: the internal * page being merged has a potentially incorrect first key and * we need to replace it with the one we have. The problem is * caused by the fact that the page search algorithm coerces the * 0th key on any internal page to be smaller than any search * key. We do that because we don't want to have to update the * internal pages every time a new "smallest" key is inserted * into the tree. But, if a new "smallest" key is inserted into * our split-created subtree, and we don't update the internal * page, when we merge that internal page into its parent page, * the key may be incorrect. 
Imagine the following tree:
 *
 *	2	5	40		internal page
 *			|
 *		    10 | 20		split-created internal page
 *		    |
 *		    6			inserted smallest key
 *
 * after a simple merge, we'd have corruption:
 *
 *	2	10	20	40	merged internal page
 *			|
 *			6		key sorts before parent's key
 *
 * To fix this problem, we take the original page's key as our * first key, because we know that key sorts before any possible * key inserted into the subtree, and discard whatever 0th key * is on the split-created internal page. */ if (WT_ROW_REF_STATE(rref) != WT_REF_DISK) { rp = WT_ROW_REF_PAGE(rref); if (F_ISSET(rp, WT_PAGE_DELETED | WT_PAGE_SPLIT)) { /* Delete overflow keys for merged pages. */ WT_RET(__rec_discard_add_ovfl(session, cell)); /* Merge split subtrees */ if (F_ISSET(rp, WT_PAGE_SPLIT)) { r->merge_ref = rref; WT_RET(__rec_row_merge(session, rp)); } continue; } /* Save any in-memory subtree reference. */ WT_RET(__rec_imref_add(session, &rref->ref)); } /* * Build key cell. * * Discard any 0th key, internal pages don't need 0th keys. * * If the key is an overflow item, assume prefix compression * won't make things better, and simply copy it. */ if (r->cell_zero) { WT_RET(__rec_cell_build_key(session, "*", 1)); r->cell_zero = 0; /* If the 0th key was an overflow key, discard it. */ WT_RET(__rec_discard_add_ovfl(session, cell)); ovfl_key = 0; } else if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) { key->buf.data = cell; key->buf.size = __wt_cell_len(cell); key->cell_len = 0; key->len = key->buf.size; ovfl_key = 1; } else if (__wt_key_process(rref)) { WT_RET(__wt_row_key(session, page, rref, &key->buf)); WT_RET(__rec_cell_build_key( session, key->buf.data, key->buf.size)); ovfl_key = __wt_cell_type_is_ovfl(&key->cell); } else { WT_RET(__rec_cell_build_key( session, rref->key, rref->size)); ovfl_key = __wt_cell_type_is_ovfl(&key->cell); } /* * Boundary, split or write the page. If the K/V pair doesn't * fit: split the page, turn off compression (until a full key * is written to the page), change to a non-prefix-compressed * key. */ while (key->len + val->len > r->space_avail) { WT_RET(__rec_split(session)); r->key_prefix_compress = 0; if (!ovfl_key) { WT_RET(__rec_cell_build_key(session, NULL, 0)); ovfl_key = __wt_cell_type_is_ovfl(&key->cell); } } /* Copy the key onto the page. */ __rec_copy_incr(session, r, key); /* * Copy the off-page reference onto the page; any off-page * reference must be a valid disk address. */ WT_ASSERT(session, WT_ROW_REF_ADDR(rref) != WT_ADDR_INVALID); val->off.addr = WT_ROW_REF_ADDR(rref); val->off.size = WT_ROW_REF_SIZE(rref); __rec_copy_incr(session, r, val); /* * If we wrote a non-overflow key onto the page, update the * last-key value and turn on prefix compression. */ if (!ovfl_key) { __rec_prefix_key_update(r); r->key_prefix_compress = 1; } } /* Write the remnant page. */ return (__rec_split_finish(session)); } /* * __rec_row_merge -- * Recursively walk a row-store internal tree of merge pages. */ static int __rec_row_merge(SESSION *session, WT_PAGE *page) { WT_KV *key, *val; WT_PAGE *rp; WT_RECONCILE *r; WT_ROW_REF *rref; uint32_t i; int ovfl_key; r = S2C(session)->cache->rec; key = &r->k; val = &r->v; /* For each entry in the in-memory page... */ WT_ROW_REF_FOREACH(page, rref, i) { /* * The page may be deleted or internally created during a split. * Deleted/split pages are merged into the parent and discarded.
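* If this is the first key of a merged subtree, the merge correction key * saved in merge_ref replaces the subtree's own 0th key, for the reasons * described in __rec_row_int.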
/*
 * __rec_row_merge --
 *	Recursively walk a row-store internal tree of merge pages.
 */
static int
__rec_row_merge(SESSION *session, WT_PAGE *page)
{
	WT_KV *key, *val;
	WT_PAGE *rp;
	WT_RECONCILE *r;
	WT_ROW_REF *rref;
	uint32_t i;
	int ovfl_key;

	r = S2C(session)->cache->rec;
	key = &r->k;
	val = &r->v;

	/* For each entry in the in-memory page... */
	WT_ROW_REF_FOREACH(page, rref, i) {
		/*
		 * The page may be deleted or internally created during a
		 * split.  Deleted/split pages are merged into the parent
		 * and discarded.
		 */
		if (WT_ROW_REF_STATE(rref) != WT_REF_DISK) {
			rp = WT_ROW_REF_PAGE(rref);
			if (F_ISSET(rp, WT_PAGE_DELETED | WT_PAGE_SPLIT)) {
				/* Merge split subtrees. */
				if (F_ISSET(rp, WT_PAGE_SPLIT))
					WT_RET(__rec_row_merge(session, rp));
				continue;
			}

			/* Save any in-memory subtree reference. */
			WT_RET(__rec_imref_add(session, &rref->ref));
		}

		/*
		 * Build key cell.
		 *
		 * Discard any 0th key, internal pages don't need 0th keys;
		 * this test precedes the merge test as we don't need any
		 * 0th keys, merge or not.
		 *
		 * If this is the first key in a "to be merged" subtree, use
		 * the merge correction key saved in the top-level parent
		 * page when this function was called.
		 */
		if (r->cell_zero)
			WT_RET(__rec_cell_build_key(session, "*", 1));
		else if (r->merge_ref != NULL)
			WT_RET(__rec_cell_build_key(
			    session, r->merge_ref->key, r->merge_ref->size));
		else
			WT_RET(__rec_cell_build_key(
			    session, rref->key, rref->size));
		ovfl_key = __wt_cell_type_is_ovfl(&key->cell);

		/*
		 * Boundary: split or write the page.  If the K/V pair
		 * doesn't fit: split the page, turn off compression (until
		 * a full key is written to the page), and change to a
		 * non-prefix-compressed key.
		 */
		while (key->len + val->len > r->space_avail) {
			WT_RET(__rec_split(session));
			r->key_prefix_compress = 0;
			if (!ovfl_key) {
				WT_RET(__rec_cell_build_key(session, NULL, 0));
				ovfl_key = __wt_cell_type_is_ovfl(&key->cell);
			}
		}
		r->cell_zero = 0;
		r->merge_ref = NULL;

		/* Copy the key onto the page. */
		__rec_copy_incr(session, r, key);

		/*
		 * Copy the off-page reference onto the page; any off-page
		 * reference must be a valid disk address.
		 */
		WT_ASSERT(session, WT_ROW_REF_ADDR(rref) != WT_ADDR_INVALID);
		val->off.addr = WT_ROW_REF_ADDR(rref);
		val->off.size = WT_ROW_REF_SIZE(rref);
		__rec_copy_incr(session, r, val);

		/*
		 * If we wrote a non-overflow key onto the page, update the
		 * last-key value and turn on prefix compression.
		 */
		if (!ovfl_key) {
			__rec_prefix_key_update(r);
			r->key_prefix_compress = 1;
		}
	}
	return (0);
}
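/*
 * Illustrative sketch, not part of WiredTiger: the merge-correction key
 * from the comment in the caller, using integer keys.  pick_slot() returns
 * the child slot to descend into -- the last slot whose key is less than
 * or equal to the search key, never examining the 0th slot's key.  With
 * the naive merge {2, 10, 20, 40}, searching for the previously inserted
 * key 6 descends under 2, the wrong subtree.  With the corrected merge
 * {2, 5, 20, 40}, where the parent's key 5 replaced the child's 0th key,
 * the search correctly descends under 5.
 */
static uint32_t
sketch_pick_slot(const int *keys, uint32_t entries, int skey)
{
	uint32_t slot;

	/* Slot 0 is treated as smaller than any search key. */
	for (slot = 0; slot + 1 < entries; ++slot)
		if (keys[slot + 1] > skey)
			break;
	return (slot);
}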
/*
 * __rec_row_leaf --
 *	Reconcile a row-store leaf page.
 */
static int
__rec_row_leaf(SESSION *session, WT_PAGE *page, uint32_t slvg_skip)
{
	WT_CELL *cell, *val_cell;
	WT_INSERT *ins;
	WT_KV *key, *val;
	WT_RECONCILE *r;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint32_t i;
	int ovfl_key;

	r = S2C(session)->cache->rec;
	key = &r->k;
	val = &r->v;

	r->key_prefix_compress = 0;	/* New page, compression off. */

	WT_RET(__rec_split_init(session,
	    page, 0ULL, session->btree->leafmax, session->btree->leafmin));

	/*
	 * Bulk-loaded pages are just an insert list and nothing more.  As
	 * row-store leaf pages already have to deal with insert lists, it's
	 * pretty easy to hack into that path.
	 */
	if (F_ISSET(page, WT_PAGE_BULK_LOAD)) {
		WT_RET(__rec_row_leaf_insert(session, page->u.bulk.ins));
		return (__rec_split_finish(session));
	}

	/*
	 * Write any K/V pairs inserted into the page before the first
	 * from-disk key on the page.
	 */
	if ((ins = WT_ROW_INSERT_SMALLEST(page)) != NULL)
		WT_RET(__rec_row_leaf_insert(session, ins));

	/*
	 * Walk the page, writing key/value pairs.
	 *
	 * We have to walk both the WT_ROW structures and the original page --
	 * see the comment at WT_ROW_AND_KEY_FOREACH for details.
	 */
	WT_ROW_AND_KEY_FOREACH(page, rip, cell, i) {
		/*
		 * The salvage code, on some rare occasions, wants to
		 * reconcile a page but skip some leading records on the
		 * page.  Because the row-store leaf reconciliation function
		 * copies keys from the original disk page, this is
		 * non-trivial -- just changing the in-memory pointers isn't
		 * sufficient, we have to change the WT_CELL structures on
		 * the disk page, too.  It's ugly, but we pass in a value
		 * that tells us how many records to skip in this case.
		 */
		if (slvg_skip != 0) {
			--slvg_skip;
			continue;
		}

		/* Build value cell. */
		if ((upd = WT_ROW_UPDATE(page, rip)) == NULL) {
			/*
			 * Copy the item off the page -- however, when the
			 * page was read into memory, there may not have been
			 * a value item, that is, it may have been zero
			 * length.
			 */
			if (WT_ROW_EMPTY_ISSET(rip))
				val->buf.size = 0;
			else {
				val_cell = WT_ROW_PTR(page, rip);
				val->buf.data = val_cell;
				val->buf.size = __wt_cell_len(val_cell);
			}
			val->cell_len = 0;
			val->len = val->buf.size;
		} else {
			/*
			 * If we updated an overflow value, free the
			 * underlying file space.
			 */
			if (!WT_ROW_EMPTY_ISSET(rip))
				WT_RET(__rec_discard_add_ovfl(
				    session, WT_ROW_PTR(page, rip)));

			/*
			 * If this key/value pair was deleted, we're done.
			 * If we deleted an overflow key, free the underlying
			 * file space.
			 */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				WT_RET(__rec_discard_add_ovfl(session, cell));
				goto leaf_insert;
			}

			/*
			 * If no value, nothing needs to be copied.
			 * Otherwise, build the value's WT_CELL chunk from
			 * the most recent update value.
			 */
			if (upd->size == 0)
				val->cell_len = val->len = val->buf.size = 0;
			else
				WT_RET(__rec_cell_build_val(
				    session, WT_UPDATE_DATA(upd), upd->size));
		}

		/*
		 * Build key cell.
		 *
		 * If the key is an overflow item, assume prefix compression
		 * won't make things better, and simply copy it.
		 */
		if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) {
			key->buf.data = cell;
			key->buf.size = __wt_cell_len(cell);
			key->cell_len = 0;
			key->len = key->buf.size;
			ovfl_key = 1;
		} else if (__wt_key_process(rip)) {
			WT_RET(__wt_row_key(session, page, rip, &key->buf));
			WT_RET(__rec_cell_build_key(
			    session, key->buf.data, key->buf.size));
			ovfl_key = __wt_cell_type_is_ovfl(&key->cell);
		} else {
			WT_RET(__rec_cell_build_key(
			    session, rip->key, rip->size));
			ovfl_key = __wt_cell_type_is_ovfl(&key->cell);
		}

		/*
		 * Boundary: split or write the page.  If the K/V pair
		 * doesn't fit: split the page, switch to the
		 * non-prefix-compressed key and turn off compression until
		 * a full key is written to the new page.
		 */
		while (key->len + val->len > r->space_avail) {
			WT_RET(__rec_split(session));
			r->key_prefix_compress = 0;
			if (!ovfl_key) {
				WT_RET(__rec_cell_build_key(session, NULL, 0));
				ovfl_key = __wt_cell_type_is_ovfl(&key->cell);
			}
		}

		/* Copy the key/value pair onto the page. */
		__rec_copy_incr(session, r, key);
		if (val->len != 0)
			__rec_copy_incr(session, r, val);

		/*
		 * If we wrote a non-overflow key onto the page, update the
		 * last-key value and turn on prefix compression.
		 */
		if (!ovfl_key) {
			__rec_prefix_key_update(r);
			r->key_prefix_compress = 1;
		}

leaf_insert:	/*
		 * Write any K/V pairs inserted into the page after this
		 * key.
		 */
		if ((ins = WT_ROW_INSERT(page, rip)) != NULL)
			WT_RET(__rec_row_leaf_insert(session, ins));
	}

	/* Write the remnant page. */
	return (__rec_split_finish(session));
}
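/*
 * Illustrative sketch, not part of WiredTiger: the shape of the boundary
 * check used throughout reconciliation.  Pairs are appended to the current
 * chunk until the next pair no longer fits, at which point the chunk is
 * handed off and the accounting is reset.  The chunk structure and flush
 * callback are hypothetical; flush_cb is assumed to write (or record the
 * boundary of) the current chunk and reset first_free/space_avail to an
 * empty chunk.  An item larger than an empty chunk is assumed to have been
 * replaced by a small overflow reference already, as the code above does,
 * so the loop terminates.
 */
struct sketch_chunk {
	uint8_t	*first_free;		/* Next free byte in the chunk */
	uint32_t space_avail;		/* Bytes remaining in the chunk */
};

static int
sketch_append(struct sketch_chunk *c, const void *pair, uint32_t len,
    int (*flush_cb)(struct sketch_chunk *))
{
	int ret;

	while (len > c->space_avail)		/* Mirrors the while loop */
		if ((ret = flush_cb(c)) != 0)
			return (ret);

	memcpy(c->first_free, pair, len);	/* Mirrors __rec_copy_incr */
	c->first_free += len;
	c->space_avail -= len;
	return (0);
}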
/*
 * __rec_row_leaf_insert --
 *	Walk an insert chain, writing K/V pairs.
 */
static int
__rec_row_leaf_insert(SESSION *session, WT_INSERT *ins)
{
	WT_KV *key, *val;
	WT_RECONCILE *r;
	WT_UPDATE *upd;
	int ovfl_key;

	r = S2C(session)->cache->rec;
	key = &r->k;
	val = &r->v;

	for (; ins != NULL; ins = ins->next) {
		upd = ins->upd;

		/* Build value cell. */
		if (WT_UPDATE_DELETED_ISSET(upd))
			continue;
		if (upd->size == 0)
			val->len = 0;
		else
			WT_RET(__rec_cell_build_val(
			    session, WT_UPDATE_DATA(upd), upd->size));

		/* Build key cell. */
		WT_RET(__rec_cell_build_key(
		    session, WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)));
		ovfl_key = __wt_cell_type_is_ovfl(&key->cell);

		/*
		 * Boundary: split or write the page.  If the K/V pair
		 * doesn't fit: split the page, switch to the
		 * non-prefix-compressed key and turn off compression until
		 * a full key is written to the new page.
		 */
		while (key->len + val->len > r->space_avail) {
			WT_RET(__rec_split(session));
			r->key_prefix_compress = 0;
			if (!ovfl_key) {
				WT_RET(__rec_cell_build_key(session, NULL, 0));
				ovfl_key = __wt_cell_type_is_ovfl(&key->cell);
			}
		}

		/* Copy the key/value pair onto the page. */
		__rec_copy_incr(session, r, key);
		if (val->len != 0)
			__rec_copy_incr(session, r, val);

		/*
		 * If we wrote a non-overflow key onto the page, update the
		 * last-key value and turn on prefix compression.
		 */
		if (!ovfl_key) {
			__rec_prefix_key_update(r);
			r->key_prefix_compress = 1;
		}
	}
	return (0);
}
/*
 * __rec_wrapup --
 *	Resolve the WT_RECONCILE information.
 */
static int
__rec_wrapup(SESSION *session, WT_PAGE *page)
{
	BTREE *btree;
	WT_PAGE *imp;
	WT_RECONCILE *r;

	r = S2C(session)->cache->rec;
	btree = session->btree;

	/*
	 * If the page was empty, we want to eventually discard it from the
	 * tree by merging it into its parent, not just evict it from memory.
	 */
	if (r->split_next == 0) {
		WT_VERBOSE(S2C(session), WT_VERB_EVICT, (session,
		    "reconcile: delete page %lu (%luB)",
		    (u_long)WT_PADDR(page), (u_long)WT_PSIZE(page)));
		WT_STAT_INCR(btree->stats, page_delete);

		/*
		 * Deleted pages cannot be evicted; we're going to evict the
		 * file blocks and it's possible for a thread to traverse
		 * into them before we reconcile their parent.  That's a big
		 * problem we don't want to solve, so keep the page around,
		 * and if it is re-used before parent reconciliation, that's
		 * OK, we reconcile it again.
		 */
		F_SET(page, WT_PAGE_DELETED);

		/*
		 * XXX
		 * I'm pretty sure we can empty the entire tree and arrive
		 * here deleting the root page, and that's not OK: ignore
		 * for now.
		 */
		WT_ASSERT(session, !WT_PAGE_IS_ROOT(page));

		/*
		 * We're not going to reconcile this page after all --
		 * release our exclusive reference to it, as well as any
		 * pages in its subtree that we locked down.
		 */
		if (!r->locked) {
			page->parent_ref->state = WT_REF_MEM;

			switch (page->type) {
			case WT_PAGE_COL_INT:
				__rec_subtree_col_clear(session, page);
				break;
			case WT_PAGE_ROW_INT:
				__rec_subtree_row_clear(session, page);
				break;
			default:
				break;
			}
		}

		/* The parent is dirty. */
		WT_PAGE_SET_MODIFIED(page->parent);
		return (0);
	}

	/*
	 * We're replacing a page's on-disk image with one or more on-disk
	 * images.  If the page is not being discarded from memory and
	 * references in-memory subtrees, we must reconnect its in-memory
	 * references to the existing pages.  Sort the original page's list
	 * of references so it's easily searchable.
	 */
	r->imref_found = 0;
	if (r->imref_next != 0)
		qsort(r->imref, (size_t)r->imref_next,
		    sizeof(WT_REF *), __rec_imref_qsort_cmp);

	/*
	 * Because WiredTiger's pages grow without splitting, we're replacing
	 * a single page with another single page most of the time.
	 */
	if (r->split_next == 1) {
		WT_VERBOSE(S2C(session), WT_VERB_EVICT, (session,
		    "reconcile: move %lu to %lu, (%luB to %luB)",
		    (u_long)WT_PADDR(page), (u_long)r->split[0].off.addr,
		    (u_long)WT_PSIZE(page), (u_long)r->split[0].off.size));

		/* Queue the original page to be discarded, we're done. */
		WT_RET(__rec_discard_add_page(session, page));

		/*
		 * Update the page's parent reference -- when we created it,
		 * we didn't know if it would be part of a split or not, so
		 * we didn't know what page would be its parent.
		 *
		 * If we're evicting this page, the new state is "on-disk";
		 * else, in-memory.
		 *
		 * Make sure we do not free the page twice on error.
		 */
		if ((imp = r->split[0].imp) != NULL) {
			r->split[0].imp = NULL;
			imp->parent = page->parent;
			imp->parent_ref = page->parent_ref;
			if (r->imref_next != 0)
				WT_RET(__rec_imref_fixup(session, imp));
		}
		WT_RET(__rec_parent_update(session, page, imp,
		    r->split[0].off.addr, r->split[0].off.size,
		    r->evict ? WT_REF_DISK : WT_REF_MEM));

		/* We're done, discard everything we've queued for discard. */
		WT_RET(__rec_discard_evict(session));
		return (0);
	}

	/*
	 * A page grew so large we had to divide it into two or more physical
	 * pages -- create a new internal page.
	 */
	WT_VERBOSE(S2C(session), WT_VERB_EVICT, (session,
	    "reconcile: %lu (%luB) splitting",
	    (u_long)WT_PADDR(page), (u_long)WT_PSIZE(page)));

	switch (page->type) {
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		WT_STAT_INCR(btree->stats, split_intl);
		break;
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_RLE:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_LEAF:
		WT_STAT_INCR(btree->stats, split_leaf);
		break;
	WT_ILLEGAL_FORMAT(session);
	}

	switch (page->type) {
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__rec_row_split(session, &imp, page));
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_RLE:
	case WT_PAGE_COL_VAR:
		WT_RET(__rec_col_split(session, &imp, page));
		break;
	WT_ILLEGAL_FORMAT(session);
	}

	/* Queue the original page to be discarded, we're done. */
	WT_RET(__rec_discard_add_page(session, page));

	/* Update the parent to reference the newly created internal page. */
	WT_RET(__rec_parent_update(
	    session, page, imp, WT_ADDR_INVALID, 0, WT_REF_MEM));

	/* We're done, discard everything we've queued for discard. */
	WT_RET(__rec_discard_evict(session));

	return (0);
}

/*
 * __rec_parent_update_clean --
 *	Update a parent page's reference for an evicted, clean page.
 */
static void
__rec_parent_update_clean(SESSION *session, WT_PAGE *page)
{
	WT_REF *parent_ref;

	parent_ref = page->parent_ref;

	/*
	 * If a page is on disk, it must have a valid disk address -- with
	 * one exception: if you create a root page and never use it, then
	 * it won't have a disk address.
	 */
	WT_ASSERT(session,
	    WT_PAGE_IS_ROOT(page) || parent_ref->addr != WT_ADDR_INVALID);

	/*
	 * Update the relevant WT_REF structure; no memory flush is needed,
	 * the state field is declared volatile.
	 */
	parent_ref->page = NULL;
	parent_ref->state = WT_REF_DISK;
}
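/*
 * Illustrative sketch, not part of WiredTiger: the publication order used
 * by __rec_parent_update below.  All fields the state word protects are
 * filled in first, a full memory barrier is forced, and only then is the
 * volatile state set: a reader that observes the new state is therefore
 * guaranteed to observe the fields as well.  The structure is hypothetical
 * and the GCC builtin __sync_synchronize() stands in for WT_MEMORY_FLUSH.
 */
struct sketch_ref {
	void *page;			/* Fields being published */
	uint32_t addr, size;
	volatile uint32_t state;	/* Guard word, checked lock-free */
};

static void
sketch_publish(struct sketch_ref *ref,
    void *page, uint32_t addr, uint32_t size, uint32_t state)
{
	ref->page = page;		/* 1: fill in the fields */
	ref->addr = addr;
	ref->size = size;
	__sync_synchronize();		/* 2: flush memory */
	ref->state = state;		/* 3: publish the new state */
}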
/*
 * __rec_parent_update --
 *	Update a parent page's reference to a reconciled page.
 */
static int
__rec_parent_update(SESSION *session, WT_PAGE *page,
    WT_PAGE *replace, uint32_t addr, uint32_t size, uint32_t state)
{
	WT_REF *parent_ref;

	/*
	 * Update the relevant parent WT_REF structure, flush memory, and
	 * then update the state of the parent reference.  No further memory
	 * flush is needed, the state field is declared volatile.
	 *
	 * If we're replacing a valid addr/size pair, free the original disk
	 * blocks, they're no longer in use.
	 */
	parent_ref = page->parent_ref;
	parent_ref->page = replace;
	if (replace != NULL) {
		WT_ASSERT(session, page->parent == replace->parent);
		WT_ASSERT(session, page->parent_ref == replace->parent_ref);
	}
	if (parent_ref->addr != WT_ADDR_INVALID)
		WT_RET(__wt_block_free(
		    session, parent_ref->addr, parent_ref->size));
	parent_ref->addr = addr;
	parent_ref->size = size;
	WT_MEMORY_FLUSH;
	parent_ref->state = state;

	/*
	 * If it's not the root page, mark the parent page dirty.
	 *
	 * There's no chance we need to flush this write -- the eviction
	 * thread is the only thread that eventually cares if the page is
	 * dirty or not, and it's our update that's making it dirty.  (The
	 * workQ thread does have to flush its set-page-modified update, of
	 * course.)
	 *
	 * We don't care if we race with the workQ; if the workQ thread
	 * races with us, the page will still be marked dirty and that's
	 * all we care about.
	 */
	if (!WT_PAGE_IS_ROOT(page))
		WT_PAGE_SET_MODIFIED(page->parent);
	return (0);
}

/*
 * __rec_cell_build_key --
 *	Process a key and return a WT_CELL structure and byte string to be
 *	stored on the page.
 */
static int
__rec_cell_build_key(SESSION *session, const void *data, uint32_t size)
{
	BTREE *btree;
	WT_KV *key;
	WT_RECONCILE *r;
	uint32_t pfx_len, pfx;
	const uint8_t *a, *b;

	r = S2C(session)->cache->rec;
	btree = session->btree;
	key = &r->k;
	pfx = 0;

	if (data == NULL)
		/*
		 * When data is NULL, our caller has a prefix-compressed key
		 * they can't use (probably because they just crossed a split
		 * point).  Use the full key saved when last called, instead.
		 */
		WT_RET(__wt_buf_set(
		    session, &key->buf, r->full->data, r->full->size));
	else {
		/*
		 * Save a copy of the key for later reference: we use the
		 * full key for prefix-compression comparisons, and if we
		 * are, for any reason, unable to use the compressed key we
		 * generate.
		 */
		WT_RET(__wt_buf_set(session, r->full, data, size));

		/*
		 * Do prefix compression on the key.  We know by definition
		 * the previous key sorts before the current key, which means
		 * the keys must differ and we just need to compare up to the
		 * shorter of the two keys.  Also, we can't compress out more
		 * than UINT8_MAX bytes (the prefix count is stored in a
		 * single byte), so limit the comparison to that.
		 */
		if (r->key_prefix_compress) {
			pfx_len = size;
			if (pfx_len > r->last->size)
				pfx_len = r->last->size;
			if (pfx_len > UINT8_MAX)
				pfx_len = UINT8_MAX;
			for (a = data, b = r->last->data; pfx < pfx_len; ++pfx)
				if (*a++ != *b++)
					break;
		}

		/* Copy the non-prefix bytes into the key buffer. */
		WT_RET(__wt_buf_set(
		    session, &key->buf, (uint8_t *)data + pfx, size - pfx));
	}

	/* Optionally compress the key using the Huffman engine. */
	if (btree->huffman_key != NULL)
		WT_RET(__wt_huffman_encode(session, btree->huffman_key,
		    key->buf.data, key->buf.size, &key->buf));

	/* Create an overflow object if the data won't fit. */
	if (key->buf.size > btree->leafitemsize) {
		WT_STAT_INCR(btree->stats, overflow_key);

		/*
		 * Overflow objects aren't prefix compressed -- rebuild any
		 * object that was prefix compressed.
		 */
		return (pfx == 0 ?
		    __rec_cell_build_ovfl(session, key, WT_CELL_KEY_OVFL) :
		    __rec_cell_build_key(session, NULL, 0));
	}

	__wt_cell_set(
	    &key->cell, WT_CELL_KEY, pfx, key->buf.size, &key->cell_len);
	key->len = key->cell_len + key->buf.size;
	return (0);
}
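/*
 * Illustrative sketch, not part of WiredTiger: the prefix computation in
 * __rec_cell_build_key above.  Keys arrive in sorted order, so the current
 * key shares some number of leading bytes with the previous key; the count
 * is stored in the cell's single prefix byte, which caps it at UINT8_MAX.
 * Only the suffix (cur + pfx, cur_size - pfx) then goes onto the page.
 */
static uint8_t
sketch_key_prefix(const uint8_t *prev, uint32_t prev_size,
    const uint8_t *cur, uint32_t cur_size)
{
	uint32_t max, pfx;

	max = cur_size < prev_size ? cur_size : prev_size;
	if (max > UINT8_MAX)		/* The prefix count is one byte */
		max = UINT8_MAX;
	for (pfx = 0; pfx < max; ++pfx)
		if (prev[pfx] != cur[pfx])
			break;
	return ((uint8_t)pfx);
}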
/*
 * __rec_cell_build_val --
 *	Process a data item and return a WT_CELL structure and byte string
 *	to be stored on the page.
 */
static int
__rec_cell_build_val(SESSION *session, void *data, uint32_t size)
{
	BTREE *btree;
	WT_KV *val;
	WT_RECONCILE *r;

	r = S2C(session)->cache->rec;
	btree = session->btree;
	val = &r->v;

	/*
	 * We don't copy the data into the buffer, it's not necessary; just
	 * re-point the buffer's data/length fields.
	 */
	val->buf.data = data;
	val->buf.size = size;

	/*
	 * Handle zero-length cells quickly -- a zero-length value is common,
	 * it's a deleted column-store variable-length cell.
	 */
	if (size == 0) {
		__wt_cell_set(&val->cell, WT_CELL_DATA, 0, 0, &val->cell_len);
		val->len = val->cell_len + val->buf.size;
		return (0);
	}

	/* Optionally compress the data using the Huffman engine. */
	if (btree->huffman_value != NULL)
		WT_RET(__wt_huffman_encode(session, btree->huffman_value,
		    val->buf.data, val->buf.size, &val->buf));

	/* Create an overflow object if the data won't fit. */
	if (val->buf.size > btree->leafitemsize) {
		WT_STAT_INCR(btree->stats, overflow_data);
		return (
		    __rec_cell_build_ovfl(session, val, WT_CELL_DATA_OVFL));
	}

	__wt_cell_set(
	    &val->cell, WT_CELL_DATA, 0, val->buf.size, &val->cell_len);
	val->len = val->cell_len + val->buf.size;
	return (0);
}

/*
 * __rec_cell_build_ovfl --
 *	Store overflow items in the file, returning the WT_OFF.
 */
static int
__rec_cell_build_ovfl(SESSION *session, WT_KV *kv, u_int type)
{
	WT_BUF *tmp;
	WT_PAGE_DISK *dsk;
	uint32_t addr, size;
	int ret;

	tmp = NULL;
	ret = 0;

	/* Allocate a scratch buffer big enough to hold the overflow chunk. */
	size = WT_DISK_REQUIRED(session, kv->buf.size);
	WT_RET(__wt_scr_alloc(session, size, &tmp));

	/*
	 * Set up the chunk; clear any unused bytes so the contents are never
	 * random.
	 */
	dsk = tmp->mem;
	memset(dsk, 0, WT_PAGE_DISK_SIZE);
	dsk->type = WT_PAGE_OVFL;
	dsk->u.datalen = kv->buf.size;
	memcpy(WT_PAGE_DISK_BYTE(dsk), kv->buf.data, kv->buf.size);
	memset((uint8_t *)WT_PAGE_DISK_BYTE(dsk) + kv->buf.size,
	    0, size - (WT_PAGE_DISK_SIZE + kv->buf.size));

	/*
	 * Allocate a file address (we're the only writer of the file, so we
	 * can allocate file space on demand).  Fill in the WT_OFF structure.
	 */
	WT_ERR(__wt_block_alloc(session, &addr, size));
	kv->off.addr = addr;
	kv->off.size = size;

	/* Set the caller's K/V to reference the WT_OFF structure. */
	kv->buf.data = &kv->off;
	kv->buf.size = sizeof(kv->off);
	__wt_cell_set_fixed(&kv->cell, type, &kv->cell_len);
	kv->len = kv->cell_len + kv->buf.size;

	ret = __wt_disk_write(session, dsk, addr, size);

err:	if (tmp != NULL)
		__wt_scr_release(&tmp);
	return (ret);
}

/*
 * __rec_prefix_key_update --
 *	Swap the full-key and last-key buffers.
 */
static void
__rec_prefix_key_update(WT_RECONCILE *r)
{
	WT_BUF *a;

	a = r->full;
	r->full = r->last;
	r->last = a;
}
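/*
 * Illustrative sketch, not part of WiredTiger: the overflow-chunk setup in
 * __rec_cell_build_ovfl above.  The item is rounded up to the block
 * allocation unit and the unused tail is zeroed, so writing the same item
 * always produces an identical disk image with no stray heap bytes.  The
 * header size and allocation unit below are hypothetical stand-ins for
 * WT_PAGE_DISK_SIZE and the btree's allocation size; the chunk buffer is
 * assumed to be at least the returned size.
 */
#define	SKETCH_HDR_SIZE		28U	/* Hypothetical header size */
#define	SKETCH_ALLOC_SIZE	512U	/* Hypothetical allocation unit */

static uint32_t
sketch_ovfl_chunk(uint8_t *chunk, const void *item, uint32_t item_size)
{
	uint32_t size;

	/* Round the header plus item up to the allocation unit. */
	size = SKETCH_HDR_SIZE + item_size;
	size = (size + SKETCH_ALLOC_SIZE - 1) /
	    SKETCH_ALLOC_SIZE * SKETCH_ALLOC_SIZE;

	memset(chunk, 0, SKETCH_HDR_SIZE);	/* Header, then the item */
	memcpy(chunk + SKETCH_HDR_SIZE, item, item_size);
	memset(chunk + SKETCH_HDR_SIZE + item_size,	/* Zero the tail */
	    0, size - (SKETCH_HDR_SIZE + item_size));
	return (size);
}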
/*
 * __rec_row_split --
 *	Update a row-store parent page's reference when a page is split.
 */
static int
__rec_row_split(SESSION *session, WT_PAGE **splitp, WT_PAGE *orig)
{
	WT_PAGE *imp, *page;
	WT_RECONCILE *r;
	WT_ROW_REF *rref;
	WT_SPLIT *spl;
	uint32_t i, size;
	int ret;
	void *key;

	r = S2C(session)->cache->rec;
	ret = 0;

	/* Allocate a row-store internal page. */
	WT_RET(__wt_calloc_def(session, 1, &page));
	WT_ERR(__wt_calloc_def(
	    session, (size_t)r->split_next, &page->u.row_int.t));

	/* Fill it in. */
	page->parent = orig->parent;
	page->parent_ref = orig->parent_ref;
	page->read_gen = __wt_cache_read_gen(session);
	page->entries = r->split_next;
	page->type = WT_PAGE_ROW_INT;
	WT_PAGE_SET_MODIFIED(page);

	/*
	 * Newly created internal pages are not persistent as we don't want
	 * the tree to deepen whenever a leaf page splits.  Flag the page for
	 * merge into its parent when the parent is reconciled.
	 */
	F_SET(page, WT_PAGE_SPLIT);

	/* Enter each split page into the new, internal page. */
	for (rref = page->u.row_int.t,
	    spl = r->split, i = 0; i < r->split_next; ++rref, ++spl, ++i) {
		/*
		 * Steal the split buffer's pointer -- we could allocate and
		 * copy here, but that means split buffers would potentially
		 * grow without bound; this way we do the same number of
		 * memory allocations and the split buffers don't just keep
		 * getting bigger.
		 */
		__wt_buf_steal(session, &spl->key, &key, &size);
		rref->key = key;
		rref->size = size;
		WT_ROW_REF_ADDR(rref) = spl->off.addr;
		WT_ROW_REF_SIZE(rref) = spl->off.size;

		/*
		 * If there's an in-memory version of the page, connect it to
		 * this parent and fix up its in-memory references.
		 */
		if ((imp = spl->imp) == NULL) {
			WT_ROW_REF_PAGE(rref) = NULL;
			WT_ROW_REF_STATE(rref) = WT_REF_DISK;
		} else {
			imp->parent = page;
			imp->parent_ref = &rref->ref;
			WT_ROW_REF_PAGE(rref) = imp;
			WT_ROW_REF_STATE(rref) = WT_REF_MEM;

			/*
			 * The created in-memory page is hooked into the new
			 * page, don't free it on error.
			 */
			spl->imp = NULL;

			/*
			 * If we've found all the in-memory references we
			 * expect to find, don't keep searching.
			 */
			if (r->imref_found < r->imref_next)
				WT_ERR(__rec_imref_fixup(session, imp));
		}

		WT_VERBOSE(S2C(session), WT_VERB_EVICT, (session,
		    "split: %lu (%luB)",
		    (u_long)spl->off.addr, (u_long)spl->off.size));
	}

	/*
	 * We should have connected the same number of in-memory references
	 * as we found originally.
	 */
	WT_ASSERT(session, r->imref_found == r->imref_next);

	*splitp = page;
	return (0);

err:	__wt_free(session, page);
	return (ret);
}

/*
 * __rec_col_split --
 *	Update a column-store parent page's reference when a page is split.
 */
static int
__rec_col_split(SESSION *session, WT_PAGE **splitp, WT_PAGE *orig)
{
	WT_COL_REF *cref;
	WT_OFF_RECORD *off;
	WT_PAGE *imp, *page;
	WT_RECONCILE *r;
	WT_SPLIT *spl;
	uint32_t i;
	int ret;

	r = S2C(session)->cache->rec;
	ret = 0;

	/* Allocate a column-store internal page. */
	WT_RET(__wt_calloc_def(session, 1, &page));
	WT_ERR(__wt_calloc_def(
	    session, (size_t)r->split_next, &page->u.col_int.t));

	/* Fill it in. */
	page->parent = orig->parent;
	page->parent_ref = orig->parent_ref;
	page->read_gen = __wt_cache_read_gen(session);
	page->u.col_int.recno = WT_RECNO(&r->split->off);
	page->entries = r->split_next;
	page->type = WT_PAGE_COL_INT;
	WT_PAGE_SET_MODIFIED(page);

	/*
	 * Newly created internal pages are not persistent as we don't want
	 * the tree to deepen whenever a leaf page splits.  Flag the page for
	 * merge into its parent when the parent is reconciled.
	 */
	F_SET(page, WT_PAGE_SPLIT);

	/* Enter each split page into the new, internal page. */
	for (cref = page->u.col_int.t,
	    spl = r->split, i = 0; i < r->split_next; ++cref, ++spl, ++i) {
		off = &spl->off;
		cref->recno = WT_RECNO(off);
		WT_COL_REF_ADDR(cref) = off->addr;
		WT_COL_REF_SIZE(cref) = off->size;

		/*
		 * If there's an in-memory version of the page, connect it to
		 * this parent and fix up its in-memory references.
		 */
		if ((imp = spl->imp) == NULL) {
			WT_COL_REF_PAGE(cref) = NULL;
			WT_COL_REF_STATE(cref) = WT_REF_DISK;
		} else {
			imp->parent = page;
			imp->parent_ref = &cref->ref;
			WT_COL_REF_PAGE(cref) = imp;
			WT_COL_REF_STATE(cref) = WT_REF_MEM;

			/*
			 * The created in-memory page is hooked into the new
			 * page, don't free it on error.
			 */
			spl->imp = NULL;

			/*
			 * If we've found all the in-memory references we
			 * expect to find, don't keep searching.
			 */
			if (r->imref_found < r->imref_next)
				WT_ERR(__rec_imref_fixup(session, imp));
		}

		WT_VERBOSE(S2C(session), WT_VERB_EVICT, (session,
		    "split: %lu (%luB), starting record %llu",
		    (u_long)off->addr, (u_long)off->size,
		    (unsigned long long)WT_RECNO(&spl->off)));
	}

	/*
	 * We should have connected the same number of in-memory references
	 * as we found originally.
	 */
	WT_ASSERT(session, r->imref_found == r->imref_next);

	*splitp = page;
	return (0);

err:	__wt_free(session, page);
	return (ret);
}
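/*
 * Illustrative sketch, not part of WiredTiger: the comparator shape used
 * by the qsort/bsearch routines below.  Returning an explicit -1/0/1,
 * rather than subtracting one operand from the other, avoids the signed
 * overflow and truncation a subtraction can suffer when the compared
 * values are wider than int (for example, two uint32_t addrs differing by
 * more than INT_MAX).
 */
static int
sketch_u32_cmp(const void *a, const void *b)
{
	uint32_t av, bv;

	av = *(const uint32_t *)a;
	bv = *(const uint32_t *)b;
	return (av > bv ? 1 : (av < bv ? -1 : 0));
}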
/*
 * __rec_imref_qsort_cmp --
 *	Qsort comparison function for the in-memory subtree list.
 */
static int
__rec_imref_qsort_cmp(const void *a, const void *b)
{
	uint32_t a_addr, b_addr;

	/* Sort the addrs in ascending order. */
	a_addr = (*(WT_REF **)a)->addr;
	b_addr = (*(WT_REF **)b)->addr;
	return (a_addr > b_addr ? 1 : (a_addr < b_addr ? -1 : 0));
}

/*
 * __rec_imref_bsearch_cmp --
 *	Bsearch comparison routine for the in-memory subtree list.
 */
static int
__rec_imref_bsearch_cmp(const void *a, const void *b)
{
	WT_REF *search, *entry;

	search = (WT_REF *)a;
	entry = *(WT_REF **)b;

	return (search->addr > entry->addr ?
	    1 : (search->addr < entry->addr ? -1 : 0));
}

/*
 * __rec_imref_fixup --
 *	Fix up after splitting a page referencing an in-memory subtree.
 */
static int
__rec_imref_fixup(SESSION *session, WT_PAGE *page)
{
	WT_COL_REF *cref;
	WT_RECONCILE *r;
	WT_REF **searchp, *ref;
	WT_ROW_REF *rref;
	uint32_t i;

	r = S2C(session)->cache->rec;

	/*
	 * Walk the page we just created, and for any addr that appears,
	 * check if the slot references an in-memory entry from the original
	 * page.  For each connection we find, link the pages together, both
	 * parent to child and child to parent.
	 */
	switch (page->type) {
	case WT_PAGE_COL_INT:
		WT_COL_REF_FOREACH(page, cref, i)
			if ((searchp = bsearch(&cref->ref, r->imref,
			    r->imref_next, sizeof(WT_REF *),
			    __rec_imref_bsearch_cmp)) != NULL) {
				ref = *searchp;
				ref->page->parent = page;
				ref->page->parent_ref = &cref->ref;
				cref->ref = *ref;
				if (++r->imref_found == r->imref_next)
					break;
			}
		break;
	case WT_PAGE_ROW_INT:
		WT_ROW_REF_FOREACH(page, rref, i)
			if ((searchp = bsearch(&rref->ref, r->imref,
			    r->imref_next, sizeof(WT_REF *),
			    __rec_imref_bsearch_cmp)) != NULL) {
				ref = *searchp;
				ref->page->parent = page;
				ref->page->parent_ref = &rref->ref;
				rref->ref = *ref;
				if (++r->imref_found == r->imref_next)
					break;
			}
		break;
	WT_ILLEGAL_FORMAT(session);
	}
	return (0);
}

/*
 * __rec_imref_init --
 *	Initialize the list of in-memory subtree references.
 */
static void
__rec_imref_init(WT_RECONCILE *r)
{
	r->imref_next = 0;
}

/*
 * __rec_imref_add --
 *	Append a new reference to the list of in-memory subtree references.
 */
static int
__rec_imref_add(SESSION *session, WT_REF *ref)
{
	WT_RECONCILE *r;

	r = S2C(session)->cache->rec;

	if (r->imref_next == r->imref_entries) {
		WT_RET(__wt_realloc(session, &r->imref_allocated,
		    (r->imref_entries + 20) * sizeof(*r->imref), &r->imref));
		r->imref_entries += 20;
	}
	r->imref[r->imref_next++] = ref;
	return (0);
}

/*
 * __rec_discard_init --
 *	Initialize the list of discard objects.
 */
static void
__rec_discard_init(WT_RECONCILE *r)
{
	r->discard_next = 0;
}

/*
 * __rec_discard_add --
 *	Append an object to the list of discard objects.
 */
static int
__rec_discard_add(
    SESSION *session, WT_PAGE *page, uint32_t addr, uint32_t size)
{
	WT_RECONCILE *r;

	r = S2C(session)->cache->rec;

	if (r->discard_next == r->discard_entries) {
		WT_RET(__wt_realloc(session, &r->discard_allocated,
		    (r->discard_entries + 20) * sizeof(*r->discard),
		    &r->discard));
		r->discard_entries += 20;
	}
	r->discard[r->discard_next].page = page;
	r->discard[r->discard_next].addr = addr;
	r->discard[r->discard_next].size = size;
	++r->discard_next;
	return (0);
}
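/*
 * Illustrative sketch, not part of WiredTiger: the append pattern used by
 * __rec_imref_add and __rec_discard_add above.  The array grows in fixed
 * increments of 20 slots, amortizing allocation while keeping the entries
 * in a flat, qsort/bsearch-friendly array.  The standard realloc() stands
 * in for __wt_realloc, which additionally tracks the allocated byte count.
 */
static int
sketch_list_add(void ***listp, uint32_t *nextp, uint32_t *entriesp, void *elem)
{
	void **list;

	if (*nextp == *entriesp) {	/* Grow the array by 20 slots */
		if ((list = realloc(*listp,
		    (*entriesp + 20) * sizeof(*list))) == NULL)
			return (-1);	/* ENOMEM in the real code */
		*listp = list;
		*entriesp += 20;
	}
	(*listp)[(*nextp)++] = elem;
	return (0);
}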
/*
 * __rec_discard_evict --
 *	Discard the list of discard objects.
 */
static int
__rec_discard_evict(SESSION *session)
{
	WT_DISCARD *discard;
	WT_RECONCILE *r;
	uint32_t i;

	r = S2C(session)->cache->rec;

	for (discard = r->discard, i = 0; i < r->discard_next; ++discard, ++i)
		if (discard->page == NULL)
			WT_RET(__wt_block_free(
			    session, discard->addr, discard->size));
		else
			__wt_page_free(session,
			    discard->page, discard->addr, discard->size);
	return (0);
}

/*
 * __hazard_qsort_cmp --
 *	Qsort function: sort the hazard list based on the page's address.
 */
static int
__hazard_qsort_cmp(const void *a, const void *b)
{
	WT_PAGE *a_page, *b_page;

	a_page = ((WT_HAZARD *)a)->page;
	b_page = ((WT_HAZARD *)b)->page;
	return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
}

/*
 * __hazard_copy --
 *	Copy the hazard array and prepare it for searching.
 */
static void
__hazard_copy(SESSION *session)
{
	CONNECTION *conn;
	WT_RECONCILE *r;
	uint32_t elem, i, j;

	r = S2C(session)->cache->rec;
	conn = S2C(session);

	/* Copy the list of hazard references, compacting it as we go. */
	elem = conn->session_size * conn->hazard_size;
	for (i = j = 0; j < elem; ++j) {
		if (conn->hazard[j].page == NULL)
			continue;
		r->hazard[i] = conn->hazard[j];
		++i;
	}
	elem = i;

	/* Sort the list by page address. */
	qsort(r->hazard, (size_t)elem, sizeof(WT_HAZARD), __hazard_qsort_cmp);
	r->hazard_elem = elem;
}

/*
 * __hazard_bsearch_cmp --
 *	Bsearch function: search the sorted hazard list.
 */
static int
__hazard_bsearch_cmp(const void *search, const void *b)
{
	void *entry;

	entry = ((WT_HAZARD *)b)->page;

	return (search > entry ? 1 : (search < entry ? -1 : 0));
}

/*
 * __hazard_exclusive --
 *	Request exclusive access to a page.
 */
static int
__hazard_exclusive(SESSION *session, WT_REF *ref)
{
	WT_RECONCILE *r;

	r = S2C(session)->cache->rec;

	/*
	 * Hazard references are acquired down the tree, which means we can't
	 * deadlock.
	 *
	 * Request exclusive access to the page; no memory flush needed, the
	 * state field is declared volatile.
	 */
	ref->state = WT_REF_LOCKED;

	/* Get a fresh copy of the hazard reference array. */
	__hazard_copy(session);

	/*
	 * If we find a matching hazard reference, the page is still in use
	 * and we fail; otherwise, we have exclusive access.
	 */
	if (bsearch(ref->page, r->hazard,
	    r->hazard_elem, sizeof(WT_HAZARD), __hazard_bsearch_cmp) == NULL)
		return (0);

	/* Return the page to in-use. */
	ref->state = WT_REF_MEM;
	return (1);
}
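/*
 * Illustrative sketch, not part of WiredTiger: the shape of the check in
 * __hazard_exclusive above.  The writer first publishes a "locked" state
 * so no new reader can acquire the page, then scans a snapshot of the
 * hazard array; the page is exclusively owned only if no reader still
 * holds a hazard reference to it.  The flat hazard array and the state
 * values here are hypothetical, and the scan is linear where the real
 * code sorts and searches.
 */
static int
sketch_acquire_exclusive(void *const *hazard, uint32_t hazard_elem,
    volatile uint32_t *statep, void *page,
    uint32_t locked_state, uint32_t mem_state)
{
	uint32_t i;

	*statep = locked_state;			/* Block new readers */
	for (i = 0; i < hazard_elem; ++i)
		if (hazard[i] == page) {
			*statep = mem_state;	/* Still in use: back off */
			return (1);
		}
	return (0);				/* Exclusive access */
}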