diff options
author | Ramon Fernandez <rfmnyc@gmail.com> | 2015-11-19 09:37:38 -0500 |
---|---|---|
committer | Ramon Fernandez <rfmnyc@gmail.com> | 2015-11-19 09:41:39 -0500 |
commit | a0771ea5ec1b44537d3c409e3d712db24fd8e6bb (patch) | |
tree | 62517780ad0982ec80b8a6d968a72cf0474df617 /src/third_party/wiredtiger/src/btree | |
parent | 042d8fa2d252142489c5fa3009927bad20d77efb (diff) | |
download | mongo-a0771ea5ec1b44537d3c409e3d712db24fd8e6bb.tar.gz |
Import wiredtiger-wiredtiger-mongodb-3.2.0-rc3-177-g9d375e3.tar.gz from wiredtiger branch mongodb-3.2
ref: d9ec1ff..9d375e3
16c0a1a WT-1315 Fix some leaks with join cursors.
59857f9 WT-2222 Add statistics for named snapshots.
4368d39 WT-1315 Cursor join implementation
a72ddb7 WT-2218 Add truncate stats
fb9cebe WT-2224 Track which deleted refs are discarded by a split.
e2f1130 WT-2220 Split WT_TIMEDIFF macro into unit specific macros.
be412b5 WT-2182 when internal pages grow large enough, split them into their parents
ce8c091 WT-2219 Enhancements to in-memory testing
347d922 WT-2220 time_t cleanup.
08c0fcd WT-2217 change WT_CURSOR.insert to clear "set" key/value on return
d1b5e7f WT-2135 Fix log_only setting for backup cursor. Fix initialization.
78bd4ac WT-2210 raw compression fails if row-store recovery precedes column-store recovery
c1b2634 WT-2182 fixes for splitting up the tree.
0a1ee34 WT-2199 Fix transaction sync inconsistency.
ee31bb2 WT-2182 Simplify the split deepen logic.
c360d53 WT-2212 Add a "use_environment" config to "wiredtiger_open"
3f132a4 WT-2182 detect internal page split races.
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_cursor.c | 1 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_debug.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_delete.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_handle.c | 17 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_read.c | 6 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 1808 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_sync.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/col_srch.c | 33 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/row_srch.c | 89 |
9 files changed, 1194 insertions, 766 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 3290fd6374c..69512f45933 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -1093,6 +1093,7 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) cbt = (start != NULL) ? start : stop; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; + WT_STAT_FAST_DATA_INCR(session, cursor_truncate); /* * We always delete in a forward direction because it's faster, assert diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 8edc40794e2..0f47c060daf 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -566,7 +566,7 @@ __debug_tree( /* A NULL page starts at the top of the tree -- it's a convenience. */ if (page == NULL) - page = S2BT(session)->root.page; + page = btree->root.page; WT_WITH_BTREE(session, btree, ret = __debug_page(ds, page, flags)); diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 757b7b51cdd..98c6390e0f4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -138,6 +138,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_ERR(__wt_txn_modify_ref(session, ref)); *skipp = true; + WT_STAT_FAST_CONN_INCR(session, rec_page_delete_fast); + WT_STAT_FAST_DATA_INCR(session, rec_page_delete_fast); WT_PUBLISH(ref->state, WT_REF_DELETED); return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 3e611a107ab..dbdf94fc1b6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -643,11 +643,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; uint64_t cache_size; uint32_t intl_split_size, leaf_split_size; const char **cfg; btree = S2BT(session); + conn = S2C(session); cfg = btree->dhandle->cfg; /* @@ -688,8 +690,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session) WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval)); btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage); - if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) { - if ((cache_size = S2C(session)->cache_size) > 0) + if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) { + if ((cache_size = conn->cache_size) > 0) btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4); } @@ -723,6 +725,17 @@ __btree_page_sizes(WT_SESSION_IMPL *session) /* * Get the maximum internal/leaf page key/value sizes. * + * In-memory configuration overrides any key/value sizes, there's no + * such thing as an overflow item in an in-memory configuration. + */ + if (F_ISSET(conn, WT_CONN_IN_MEMORY)) { + btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE; + btree->maxleafkey = WT_BTREE_MAX_OBJECT_SIZE; + btree->maxleafvalue = WT_BTREE_MAX_OBJECT_SIZE; + return (0); + } + + /* * In historic versions of WiredTiger, the maximum internal/leaf page * key/value sizes were set by the internal_item_max and leaf_item_max * configuration strings. Look for those strings if we don't find the diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index e60f7b3fb02..389ac761c5b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -586,8 +586,8 @@ skip_evict: * CPU to no purpose. */ if (stalled) - wait_cnt += 1000; - else if (++wait_cnt < 1000) { + wait_cnt += WT_THOUSAND; + else if (++wait_cnt < WT_THOUSAND) { __wt_yield(); continue; } @@ -603,7 +603,7 @@ skip_evict: if (cache_work) continue; } - sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); + sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 9e45bf10a5c..caba12b78f1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -169,54 +169,58 @@ __split_safe_free(WT_SESSION_IMPL *session, return (__split_stash_add(session, split_gen, p, s)); } +#ifdef HAVE_DIAGNOSTIC /* - * __split_should_deepen -- - * Return if we should deepen the tree. + * __split_verify_intl_key_order -- + * Verify the key order on an internal page after a split, diagnostic only. */ -static bool -__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) +static void +__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_ITEM *next, _next, *last, _last, *tmp; + WT_REF *ref; + uint64_t recno; + int cmp; + bool first; btree = S2BT(session); - page = ref->page; - - /* - * Our caller is holding the parent page locked to single-thread splits, - * which means we can safely look at the page's index without setting a - * split generation. - */ - pindex = WT_INTL_INDEX_GET_SAFE(page); - - /* - * Sanity check for a reasonable number of keys on-page keys. Splitting - * with too few keys leads to excessively deep trees. - */ - if (pindex->entries < 100) - return (false); - - /* - * Deepen the tree if the page's memory footprint is larger than the - * maximum size for a page in memory (presumably putting eviction - * pressure on the cache). - */ - if (page->memory_footprint > btree->maxmempage) - return (true); - /* - * Check if the page has enough keys to make it worth splitting. If - * the number of keys is allowed to grow too large, the cost of - * splitting into parent pages can become large enough to result - * in slow operations. - */ - if (!__wt_ref_is_root(ref) && - pindex->entries > btree->split_deepen_min_child) - return (true); + switch (page->type) { + case WT_PAGE_COL_INT: + recno = 0; /* Less than any valid record number. */ + WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->key.recno > recno); + recno = ref->key.recno; + } WT_INTL_FOREACH_END; + break; + case WT_PAGE_ROW_INT: + next = &_next; + WT_CLEAR(_next); + last = &_last; + WT_CLEAR(_last); - return (false); + first = true; + WT_INTL_FOREACH_BEGIN(session, page, ref) { + __wt_ref_key(page, ref, &next->data, &next->size); + if (last->size == 0) { + if (first) + first = false; + else { + WT_ASSERT(session, __wt_compare( + session, btree->collator, last, + next, &cmp) == 0); + WT_ASSERT(session, cmp < 0); + } + } + tmp = last; + last = next; + next = tmp; + } WT_INTL_FOREACH_END; + break; + } } +#endif /* * __split_ovfl_key_cleanup -- @@ -267,47 +271,58 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) } /* - * __split_ref_deepen_move -- - * Move a WT_REF from a parent to a child in service of a split to deepen - * the tree, including updating the accounting information. + * __split_ref_move -- + * Move a WT_REF from one page to another, including updating accounting + * information. */ static int -__split_ref_deepen_move(WT_SESSION_IMPL *session, - WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp) +__split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, + WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp) { WT_ADDR *addr; WT_CELL_UNPACK unpack; WT_DECL_RET; WT_IKEY *ikey; + WT_REF *ref; size_t size; void *key; + ref = *from_refp; + /* + * The from-home argument is the page into which the "from" WT_REF may + * point, for example, if there's an on-page key the "from" WT_REF + * references, it will be on the page "from-home". + * * Instantiate row-store keys, and column- and row-store addresses in - * the WT_REF structures referenced by a page that's being split (and - * deepening the tree). The WT_REF structures aren't moving, but the - * index references are moving from the page we're splitting to a set - * of child pages, and so we can no longer reference the block image - * that remains with the page being split. + * the WT_REF structures referenced by a page that's being split. The + * WT_REF structures aren't moving, but the index references are moving + * from the page we're splitting to a set of new pages, and so we can + * no longer reference the block image that remains with the page being + * split. * * No locking is required to update the WT_REF structure because we're - * the only thread splitting the parent page, and there's no way for - * readers to race with our updates of single pointers. The changes - * have to be written before the page goes away, of course, our caller - * owns that problem. - * - * Row-store keys, first. + * the only thread splitting the page, and there's no way for readers + * to race with our updates of single pointers. The changes have to be + * written before the page goes away, of course, our caller owns that + * problem. */ - if (parent->type == WT_PAGE_ROW_INT) { + if (from_home->type == WT_PAGE_ROW_INT) { + /* + * Row-store keys: if it's not yet instantiated, instantiate it. + * If already instantiated, check for overflow cleanup (overflow + * keys are always instantiated). + */ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { - __wt_ref_key(parent, ref, &key, &size); + __wt_ref_key(from_home, ref, &key, &size); WT_RET(__wt_row_ikey(session, 0, key, size, ref)); ikey = ref->key.ikey; } else { - WT_RET(__split_ovfl_key_cleanup(session, parent, ref)); - *parent_decrp += sizeof(WT_IKEY) + ikey->size; + WT_RET( + __split_ovfl_key_cleanup(session, from_home, ref)); + *decrp += sizeof(WT_IKEY) + ikey->size; } - *child_incrp += sizeof(WT_IKEY) + ikey->size; + *incrp += sizeof(WT_IKEY) + ikey->size; } /* @@ -316,7 +331,7 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, * get the address from the on-page cell. */ addr = ref->addr; - if (addr != NULL && !__wt_off_page(parent, addr)) { + if (addr != NULL && !__wt_off_page(from_home, addr)) { __wt_cell_unpack((WT_CELL *)ref->addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); if ((ret = __wt_strndup( @@ -330,364 +345,1048 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, ref->addr = addr; } - /* And finally, the WT_REF itself. */ - WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF)); + /* And finally, copy the WT_REF pointer itself. */ + *to_refp = ref; + WT_MEM_TRANSFER(*decrp, *incrp, sizeof(WT_REF)); return (0); } -#ifdef HAVE_DIAGNOSTIC /* - * __split_verify_intl_key_order -- - * Verify the key order on an internal page after a split, diagnostic only. + * __split_child_block_evict_and_split -- + * Ensure the newly created child isn't evicted or split for now. */ static void -__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) +__split_child_block_evict_and_split(WT_PAGE *child) { - WT_BTREE *btree; - WT_ITEM *next, _next, *last, _last, *tmp; - WT_REF *ref; - uint64_t recno; - int cmp; - bool first; + /* + * Once the split is live, newly created internal pages might be evicted + * and their WT_REF structures freed. If that happens before all threads + * exit the index of the page which previously "owned" the WT_REF, a + * thread might see a freed WT_REF. To ensure that doesn't happen, the + * newly created page's modify structure has a field with a transaction + * ID that's checked before any internal page is evicted. Unfortunately, + * we don't know the correct value until we update the original page's + * index (we need a transaction ID from after that update), but the act + * of updating the original page's index is what allows the eviction to + * happen. + * + * Once the split is live, newly created internal pages might themselves + * split. The split itself is not the problem: if a page splits before + * we fix up its WT_REF (in other words, a WT_REF we move is then moved + * again, before we reset the underlying page's parent reference), it's + * OK because the test we use to find a WT_REF and WT_PAGE that require + * fixing up is only that the WT_REF points to the wrong parent, not it + * points to a specific wrong parent. The problem is our fix up of the + * WT_REFs in the created page could race with the subsequent fix of the + * same WT_REFs (in a different created page), we'd have to acquire some + * lock to prevent that race, and that's going to be difficult at best. + * + * For now, block eviction and splits in newly created pages until they + * have been fixed up. + */ + F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); +} - btree = S2BT(session); +/* + * __split_ref_move_final -- + * Finalize the moved WT_REF structures after the split succeeds. + */ +static int +__split_ref_move_final( + WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries) +{ + WT_DECL_RET; + WT_PAGE *child; + WT_REF *ref, *child_ref; + uint64_t txn_new_id; + uint32_t i; - switch (page->type) { - case WT_PAGE_COL_INT: - recno = 0; /* Less than any valid record number. */ - WT_INTL_FOREACH_BEGIN(session, page, ref) { - WT_ASSERT(session, ref->key.recno > recno); - recno = ref->key.recno; - } WT_INTL_FOREACH_END; - break; - case WT_PAGE_ROW_INT: - next = &_next; - WT_CLEAR(_next); - last = &_last; - WT_CLEAR(_last); + /* + * When creating new internal pages as part of a split, we set a field + * in those pages modify structure to prevent them from being evicted + * until all threads are known to have exited the index of the page that + * previously "owned" the WT_REF. Set that field to a safe value. + */ + txn_new_id = __wt_txn_new_id(session); - first = true; - WT_INTL_FOREACH_BEGIN(session, page, ref) { - __wt_ref_key(page, ref, &next->data, &next->size); - if (last->size == 0) { - if (first) - first = false; - else { - WT_ASSERT(session, __wt_compare( - session, btree->collator, last, - next, &cmp) == 0); - WT_ASSERT(session, cmp < 0); - } + /* + * The WT_REF structures moved to newly allocated child pages reference + * the wrong parent page and we have to fix that up. The problem is + * revealed when a thread of control searches for the child page's + * reference structure slot, and fails to find it because the parent + * page being searched no longer references the child. When that failure + * happens the thread waits for the reference's home page to be updated, + * which we do here: walk the children and fix them up. + */ + for (i = 0; i < entries; ++i, ++refp) { + ref = *refp; + + /* + * We don't hold hazard pointers on created pages, they cannot + * be evicted because the page-modify transaction value set as + * they were created prevents eviction. (See above, we reset + * that value as part of fixing up the page.) But, an eviction + * thread might be attempting to evict the page (the WT_REF may + * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF + * may be WT_REF_READING), or it may be in some other state. + * Acquire a hazard pointer for any in-memory pages so we know + * the state of the page. Ignore pages not in-memory (deleted, + * on-disk, being read), there's no in-memory structure to fix. + */ + if ((ret = __wt_page_in(session, + ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) + continue; + WT_ERR(ret); + + child = ref->page; +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, child)); +#endif + /* + * We use a page flag to prevent the child from splitting from + * underneath us, but the split-generation error checks don't + * know about that flag; use the standard macros to ensure that + * reading the child's page index structure is safe. + */ + WT_ENTER_PAGE_INDEX(session); + WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + /* + * The page's home reference may not be wrong, as we + * opened up access from the top of the tree already, + * disk pages may have been read in since then, and + * those pages would have correct parent references. + */ + if (child_ref->home != child) { + child_ref->home = child; + child_ref->pindex_hint = 0; + + child->modify->mod_split_txn = txn_new_id; } - tmp = last; - last = next; - next = tmp; } WT_INTL_FOREACH_END; - break; + WT_LEAVE_PAGE_INDEX(session); + + /* The child can now be evicted or split. */ + F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + + WT_ERR(__wt_hazard_clear(session, child)); } + + /* + * Push out the changes: not required for correctness, but don't let + * threads spin on incorrect page references longer than necessary. + */ + WT_FULL_BARRIER(); + return (0); + +err: /* Something really bad just happened. */ + WT_PANIC_RET(session, ret, "fatal error resolving a split"); } -#endif /* - * __split_deepen -- - * Split an internal page in-memory, deepening the tree. + * __split_root -- + * Split the root page in-memory, deepening the tree. */ static int -__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) +__split_root(WT_SESSION_IMPL *session, WT_PAGE *root) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *child; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; - WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; - size_t child_incr, parent_decr, parent_incr, size; + WT_REF **child_refp, *ref, **root_refp; + size_t child_incr, root_decr, root_incr, size; uint64_t split_gen; - uint32_t children, chunk, i, j, moved_entries, new_entries, remain; - uint32_t skip_leading, slots; + uint32_t children, chunk, i, j, remain; + uint32_t slots; bool complete; void *p; WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); btree = S2BT(session); alloc_index = NULL; - parent_incr = parent_decr = 0; + root_decr = root_incr = 0; complete = false; + /* The root page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, root)); + /* - * Our caller is holding the parent page locked to single-thread splits, + * Our caller is holding the root page locked to single-thread splits, * which means we can safely look at the page's index without setting a * split generation. */ - pindex = WT_INTL_INDEX_GET_SAFE(parent); + pindex = WT_INTL_INDEX_GET_SAFE(root); /* - * A prepending/appending workload will repeatedly deepen parts of the - * tree that aren't changing, and appending workloads are not uncommon. - * First, keep the first/last pages of the tree at their current level, - * to catch simple workloads. Second, track the number of entries which - * resulted from the last time we deepened this page, and if we refilled - * this page without splitting into those slots, ignore them for this - * split. It's not exact because an eviction might split into any part - * of the page: if 80% of the splits are at the end of the page, assume - * an append-style workload. Of course, the plan eventually fails: when - * repeatedly deepening this page for an append-only workload, we will - * progressively ignore more and more of the slots. When ignoring 90% of - * the slots, deepen the entire page again. - * - * Figure out how many slots we're leaving at this level and how many - * child pages we're creating. + * Decide how many child pages to create, then calculate the standard + * chunk and whatever remains. Sanity check the number of children: + * the decision to split matched to the deepen-per-child configuration + * might get it wrong. */ -#undef skip_trailing -#define skip_trailing 1 - skip_leading = 1; - new_entries = pindex->entries - parent->pg_intl_deepen_split_last; - if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10) - skip_leading = parent->pg_intl_deepen_split_last; - if (skip_leading > (pindex->entries * 9) * 10) - skip_leading = 1; - - /* - * In a few (rare) cases we split pages with only a few entries, and in - * those cases we keep it simple, 10 children, skip only first and last - * entries. Otherwise, split into a lot of child pages. - */ - moved_entries = pindex->entries - (skip_leading + skip_trailing); - children = moved_entries / btree->split_deepen_per_child; + children = pindex->entries / btree->split_deepen_per_child; if (children < 10) { + if (pindex->entries < 100) + return (EBUSY); children = 10; - skip_leading = 1; - moved_entries = - pindex->entries - (skip_leading + skip_trailing); } + chunk = pindex->entries / children; + remain = pindex->entries - chunk * (children - 1); WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", - parent, pindex->entries, children)); + "%p: %" PRIu32 " root page elements, splitting into %" PRIu32 + " children", + root, pindex->entries, children)); /* - * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize - * the slots of the allocated WT_PAGE_INDEX to point to the pages we're - * keeping at the current level, and the rest of the slots to point to - * new WT_REF objects. + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted + * into the root page, replacing the root's page-index. */ - size = sizeof(WT_PAGE_INDEX) + - (children + skip_leading + skip_trailing) * sizeof(WT_REF *); + size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); - parent_incr += size; + root_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); - alloc_index->entries = children + skip_leading + skip_trailing; - for (alloc_refp = alloc_index->index, - i = 0; i < skip_leading; ++alloc_refp, ++i) - alloc_index->index[i] = pindex->index[i]; - for (i = 0; i < children; ++alloc_refp, ++i) + alloc_index->entries = children; + alloc_refp = alloc_index->index; + for (i = 0; i < children; alloc_refp++, ++i) WT_ERR(__wt_calloc_one(session, alloc_refp)); - parent_incr += children * sizeof(WT_REF); - alloc_index->index[alloc_index->entries - 1] = - pindex->index[pindex->entries - 1]; + root_incr += children * sizeof(WT_REF); /* Allocate child pages, and connect them into the new page index. */ - chunk = moved_entries / children; - remain = moved_entries - chunk * (children - 1); - for (parent_refp = pindex->index + skip_leading, - alloc_refp = alloc_index->index + skip_leading, - i = 0; i < children; ++i) { + for (root_refp = pindex->index, + alloc_refp = alloc_index->index, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; WT_ERR(__wt_page_alloc( - session, parent->type, 0, slots, false, &child)); + session, root->type, 0, slots, false, &child)); /* - * Initialize the parent page's child reference; we need a copy - * of the page's key. + * Initialize the page's child reference; we need a copy of the + * page's key. */ ref = *alloc_refp++; - ref->home = parent; + ref->home = root; ref->page = child; ref->addr = NULL; - if (parent->type == WT_PAGE_ROW_INT) { - __wt_ref_key(parent, *parent_refp, &p, &size); + if (root->type == WT_PAGE_ROW_INT) { + __wt_ref_key(root, *root_refp, &p, &size); WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); - parent_incr += sizeof(WT_IKEY) + size; + root_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*parent_refp)->key.recno; + ref->key.recno = (*root_refp)->key.recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (parent->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*parent_refp)->key.recno; + if (root->type == WT_PAGE_COL_INT) + child->pg_intl_recno = (*root_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. */ WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* - * Once the split goes live, the newly created internal pages - * might be evicted and their WT_REF structures freed. If those - * pages are evicted before threads exit the previous page index - * array, a thread might see a freed WT_REF. Set the eviction - * transaction requirement for the newly created internal pages. - */ - child->modify->mod_split_txn = __wt_txn_new_id(session); + /* Ensure the page isn't evicted or split for now. */ + __split_child_block_evict_and_split(child); /* * The newly allocated child's page index references the same - * structures as the parent. (We cannot move WT_REF structures, + * structures as the root. (We cannot move WT_REF structures, * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for * the page that has an page index entry for the WT_REF is about * to change. */ - child_incr = 0; child_pindex = WT_INTL_INDEX_GET_SAFE(child); - for (child_refp = child_pindex->index, j = 0; j < slots; ++j) { - WT_ERR(__split_ref_deepen_move(session, - parent, *parent_refp, &parent_decr, &child_incr)); - *child_refp++ = *parent_refp++; - } + child_incr = 0; + for (child_refp = child_pindex->index, + j = 0; j < slots; ++child_refp, ++root_refp, ++j) + WT_ERR(__split_ref_move(session, root, + root_refp, &root_decr, child_refp, &child_incr)); + __wt_cache_page_inmem_incr(session, child, child_incr); } WT_ASSERT(session, - alloc_refp - alloc_index->index == - (ptrdiff_t)(alloc_index->entries - skip_trailing)); - WT_ASSERT(session, parent_refp - pindex->index == - (ptrdiff_t)(pindex->entries - skip_trailing)); + alloc_refp - alloc_index->index == (ptrdiff_t)alloc_index->entries); + WT_ASSERT(session, + root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* - * Confirm the parent page's index hasn't moved, then update it, which + * Confirm the root page's index hasn't moved, then update it, which * makes the split visible to threads descending the tree. From this * point on, we're committed to the split. * * A note on error handling: until this point, there's no problem with * unwinding on error. We allocated a new page index, a new set of * WT_REFs and a new set of child pages -- if an error occurred, the - * parent remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the parent page, attention + * root remained unchanged, although it may have an incorrect memory + * footprint. From now on we've modified the root page, attention * needs to be paid. However, subsequent failures are relatively benign, * the split is OK and complete. For that reason, we ignore errors past * this point unless there's a panic. */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); + WT_INTL_INDEX_SET(root, alloc_index); + complete = true; + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, root)); +#endif + /* Fix up the moved WT_REF structures. */ + WT_ERR(__split_ref_move_final( + session, alloc_index->index, alloc_index->entries)); + + /* We've installed the allocated page-index, ensure error handling. */ + alloc_index = NULL; + + /* + * We can't free the previous root's index, there may be threads using + * it. Add to the session's discard list, to be freed once we know no + * threads can still be using it. + * + * This change requires care with error handling: we have already + * updated the page with a new index. Even if stashing the old value + * fails, we don't roll back that change, because threads may already + * be using the new index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); + root_decr += size; + + /* Adjust the root's memory footprint and mark it dirty. */ + __wt_cache_page_inmem_incr(session, root, root_incr); + __wt_cache_page_inmem_decr(session, root, root_decr); + __wt_page_modify_set(session, root); + +err: /* + * If complete is true, we saw an error after opening up the tree to + * descent through the root page's new index. There is nothing we + * can do, there are threads potentially active in both versions of + * the tree. + * + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. + */ + if (!complete) + __wt_free_ref_index(session, root, alloc_index, true); + + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during root page split to " + "deepen the tree"); + return (ret == WT_PANIC || !complete ? ret : 0); +} + +/* + * __split_parent -- + * Resolve a multi-page split, inserting new information into the parent. + */ +static int +__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, + uint32_t new_entries, size_t parent_incr, bool exclusive, bool discard) +{ + WT_DECL_ITEM(scr); + WT_DECL_RET; + WT_IKEY *ikey; + WT_PAGE *parent; + WT_PAGE_INDEX *alloc_index, *pindex; + WT_REF **alloc_refp, *next_ref; + size_t parent_decr, size; + uint64_t split_gen; + uint32_t i, j; + uint32_t deleted_entries, parent_entries, result_entries; + uint32_t *deleted_refs; + bool complete, empty_parent; + + parent = ref->home; + + alloc_index = pindex = NULL; + parent_decr = 0; + parent_entries = 0; + complete = empty_parent = false; + + /* The parent page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, parent)); + + /* + * We've locked the parent, which means it cannot split (which is the + * only reason to worry about split generation values). + */ + pindex = WT_INTL_INDEX_GET_SAFE(parent); + parent_entries = pindex->entries; + + /* + * Remove any refs to deleted pages while we are splitting, we have + * the internal page locked down, and are copying the refs into a new + * array anyway. Switch them to the special split state, so that any + * reading thread will restart. + */ + WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); + if ((discard && next_ref == ref) || + (next_ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, next_ref, true) && + __wt_atomic_casv32( + &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))) { + WT_ERR(__wt_buf_grow(session, scr, + (deleted_entries + 1) * sizeof(uint32_t))); + deleted_refs = scr->mem; + deleted_refs[deleted_entries++] = i; + } + } + + /* + * The final entry count consists of the original count, plus any new + * pages, less any WT_REFs we're removing (deleted entries plus the + * entry we're replacing). + */ + result_entries = (parent_entries + new_entries) - deleted_entries; + if (!discard) + --result_entries; + + /* + * If there are no remaining entries on the parent, give up, we can't + * leave an empty internal page. Mark it to be evicted soon and clean + * up any references that have changed state. + */ + if (result_entries == 0) { + empty_parent = true; + __wt_page_evict_soon(parent); + goto err; + } + + /* + * Allocate and initialize a new page index array for the parent, then + * copy references from the original index array, plus references from + * the newly created split array, into place. + */ + size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); + parent_incr += size; + alloc_index->index = (WT_REF **)(alloc_index + 1); + alloc_index->entries = result_entries; + for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref == ref) + for (j = 0; j < new_entries; ++j) { + ref_new[j]->home = parent; + *alloc_refp++ = ref_new[j]; + } + else if (next_ref->state != WT_REF_SPLIT) + /* Skip refs we have marked for deletion. */ + *alloc_refp++ = next_ref; + } + + /* Check that we filled in all the entries. */ + WT_ASSERT(session, + alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + + /* + * Confirm the parent page's index hasn't moved then update it, which + * makes the split visible to threads descending the tree. + */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - complete = true; + alloc_index = NULL; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); #endif + /* - * Save the number of entries created by deepening the tree and reset - * the count of splits into this page after that point. + * If discarding the page's original WT_REF field, reset it to split. + * Threads cursoring through the tree were blocked because that WT_REF + * state was set to locked. Changing the locked state to split unblocks + * those threads and causes them to re-calculate their position based + * on the just-updated parent page's index. */ - parent->pg_intl_deepen_split_append = 0; - parent->pg_intl_deepen_split_last = alloc_index->entries; + if (discard) + WT_PUBLISH(ref->state, WT_REF_SPLIT); /* - * The moved reference structures now reference the wrong parent page, - * and we have to fix that up. The problem is revealed when a thread - * of control searches for a page's reference structure slot, and fails - * to find it because the page it's searching no longer references it. - * When that failure happens, the thread waits for the reference's home - * page to be updated, which we do here: walk the children and fix them - * up. + * Push out the changes: not required for correctness, but don't let + * threads spin on incorrect page references longer than necessary. + */ + WT_FULL_BARRIER(); + + /* + * A note on error handling: failures before we swapped the new page + * index into the parent can be resolved by freeing allocated memory + * because the original page is unchanged, we can continue to use it + * and we have not yet modified the parent. Failures after we swap + * the new page index into the parent are also relatively benign, the + * split is OK and complete. For those reasons, we ignore errors past + * this point unless there's a panic. + */ + complete = true; + + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32 + " (%s%" PRIu32 ")", + ref->page, ref->page == NULL ? + "unknown page type" : __wt_page_type_string(ref->page->type), + ref->page == NULL ? "reverse " : "", parent, + parent_entries, result_entries, + ref->page == NULL ? "-" : "+", + ref->page == NULL ? + parent_entries - result_entries : result_entries - parent_entries)); + + /* + * The new page index is in place, free the WT_REF we were splitting and + * any deleted WT_REFs we found, modulo the usual safe free semantics. * - * We're not acquiring hazard pointers on these pages, they cannot be - * evicted because of the eviction transaction value set above. - */ - for (parent_refp = alloc_index->index, - i = alloc_index->entries; i > 0; ++parent_refp, --i) { - parent_ref = *parent_refp; - WT_ASSERT(session, parent_ref->home == parent); - if (parent_ref->state != WT_REF_MEM) - continue; + * Acquire a new split generation. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { + next_ref = pindex->index[deleted_refs[i]]; + WT_ASSERT(session, next_ref->state == WT_REF_SPLIT); /* - * We left the first/last children of the parent at the current - * level to avoid bad split patterns, they might be leaf pages; - * check the page type before we continue. - */ - child = parent_ref->page; - if (!WT_PAGE_IS_INTERNAL(child)) - continue; -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, child)); -#endif - /* - * We have the parent locked, but there's nothing to prevent - * this child from splitting beneath us; ensure that reading - * the child's page index structure is safe. + * We set the WT_REF to split, discard it, freeing any resources + * it holds. + * + * Row-store trees where the old version of the page is being + * discarded: the previous parent page's key for this child page + * may have been an on-page overflow key. In that case, if the + * key hasn't been deleted, delete it now, including its backing + * blocks. We are exchanging the WT_REF that referenced it for + * the split page WT_REFs and their keys, and there's no longer + * any reference to it. Done after completing the split (if we + * failed, we'd leak the underlying blocks, but the parent page + * would be unaffected). */ - WT_ENTER_PAGE_INDEX(session); - WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + if (parent->type == WT_PAGE_ROW_INT) { + WT_TRET(__split_ovfl_key_cleanup( + session, parent, next_ref)); + ikey = __wt_ref_key_instantiated(next_ref); + if (ikey != NULL) { + size = sizeof(WT_IKEY) + ikey->size; + WT_TRET(__split_safe_free( + session, split_gen, exclusive, ikey, size)); + parent_decr += size; + } /* - * The page's parent reference may not be wrong, as we - * opened up access from the top of the tree already, - * pages may have been read in since then. Check and - * only update pages that reference the original page, - * they must be wrong. + * The page_del structure can be freed immediately: it + * is only read when the ref state is WT_REF_DELETED. + * The size of the structure wasn't added to the parent, + * don't decrement. */ - if (child_ref->home == parent) { - child_ref->home = child; - child_ref->pindex_hint = 0; + if (next_ref->page_del != NULL) { + __wt_free(session, + next_ref->page_del->update_list); + __wt_free(session, next_ref->page_del); } - } WT_INTL_FOREACH_END; - WT_LEAVE_PAGE_INDEX(session); + } + + WT_TRET(__split_safe_free( + session, split_gen, exclusive, next_ref, sizeof(WT_REF))); + parent_decr += sizeof(WT_REF); } + /* We freed the reference that was split in the loop above. */ + ref = NULL; + /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. + * We can't free the previous page index, there may be threads using it. + * Add it to the session discard list, to be freed when it's safe. */ - WT_FULL_BARRIER(); - alloc_index = NULL; + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); + parent_decr += size; + /* Adjust the parent's memory footprint and mark it dirty. */ + __wt_cache_page_inmem_incr(session, parent, parent_incr); + __wt_cache_page_inmem_decr(session, parent, parent_decr); + __wt_page_modify_set(session, parent); + +err: __wt_scr_free(session, &scr); /* - * We can't free the previous parent's index, there may be threads using - * it. Add to the session's discard list, to be freed once we know no - * threads can still be using it. + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. + */ + if (!complete) { + for (i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref->state == WT_REF_SPLIT) + next_ref->state = WT_REF_DELETED; + } + + __wt_free_ref_index(session, NULL, alloc_index, false); + + /* + * The split couldn't proceed because the parent would be empty, + * return EBUSY so our caller knows to unlock the WT_REF that's + * being deleted, but don't be noisy, there's nothing wrong. + */ + if (empty_parent) + return (EBUSY); + } + + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during parent page split"); + return (ret == WT_PANIC || !complete ? ret : 0); +} + +/* + * __split_internal -- + * Split an internal page into its parent. + */ +static int +__split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *child; + WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; + WT_REF **alloc_refp; + WT_REF **child_refp, *page_ref, **page_refp, *ref; + size_t child_incr, page_decr, page_incr, parent_incr, size; + uint64_t split_gen; + uint32_t children, chunk, i, j, remain; + uint32_t slots; + bool complete; + void *p; + + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); + + /* The page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, page)); + + btree = S2BT(session); + alloc_index = replace_index = NULL; + page_ref = page->pg_intl_parent_ref; + page_decr = page_incr = parent_incr = 0; + complete = false; + + /* + * Our caller is holding the page locked to single-thread splits, which + * means we can safely look at the page's index without setting a split + * generation. + */ + pindex = WT_INTL_INDEX_GET_SAFE(page); + + /* + * Decide how many child pages to create, then calculate the standard + * chunk and whatever remains. Sanity check the number of children: + * the decision to split matched to the deepen-per-child configuration + * might get it wrong. + */ + children = pindex->entries / btree->split_deepen_per_child; + if (children < 10) { + if (pindex->entries < 100) + return (EBUSY); + children = 10; + } + chunk = pindex->entries / children; + remain = pindex->entries - chunk * (children - 1); + + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %" PRIu32 " internal page elements, splitting %" PRIu32 + " children into parent %p", + page, pindex->entries, children, parent)); + + /* + * Ideally, we'd discard the original page, but that's hard since other + * threads of control are using it (for example, if eviction is walking + * the tree and looking at the page.) Instead, perform a right-split, + * moving all except the first chunk of the page's WT_REF objects to new + * pages. * - * This change requires care with error handling: we have already - * updated the page with a new index. Even if stashing the old value - * fails, we don't roll back that change, because threads may already - * be using the new index. + * Create and initialize a replacement WT_PAGE_INDEX for the original + * page. */ - size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, split_gen, 0, pindex, size)); - parent_decr += size; + size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &replace_index)); + page_incr += size; + replace_index->index = (WT_REF **)(replace_index + 1); + replace_index->entries = chunk; + for (page_refp = pindex->index, i = 0; i < chunk; ++i) + replace_index->index[i] = *page_refp++; /* - * Adjust the parent's memory footprint. + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted + * into the page's parent, replacing the page's page-index. + * + * The first slot of the new WT_PAGE_INDEX is the original page WT_REF. + * The remainder of the slots are allocated WT_REFs. */ - __wt_cache_page_inmem_incr(session, parent, parent_incr); - __wt_cache_page_inmem_decr(session, parent, parent_decr); + size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); + parent_incr += size; + alloc_index->index = (WT_REF **)(alloc_index + 1); + alloc_index->entries = children; + alloc_refp = alloc_index->index; + *alloc_refp++ = page_ref; + for (i = 1; i < children; ++alloc_refp, ++i) + WT_ERR(__wt_calloc_one(session, alloc_refp)); + parent_incr += children * sizeof(WT_REF); + + /* Allocate child pages, and connect them into the new page index. */ + WT_ASSERT(session, page_refp == pindex->index + chunk); + for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { + slots = i == children - 1 ? remain : chunk; + WT_ERR(__wt_page_alloc( + session, page->type, 0, slots, false, &child)); + + /* + * Initialize the page's child reference; we need a copy of the + * page's key. + */ + ref = *alloc_refp++; + ref->home = parent; + ref->page = child; + ref->addr = NULL; + if (page->type == WT_PAGE_ROW_INT) { + __wt_ref_key(page, *page_refp, &p, &size); + WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); + parent_incr += sizeof(WT_IKEY) + size; + } else + ref->key.recno = (*page_refp)->key.recno; + ref->state = WT_REF_MEM; + + /* Initialize the child page. */ + if (page->type == WT_PAGE_COL_INT) + child->pg_intl_recno = (*page_refp)->key.recno; + child->pg_intl_parent_ref = ref; + + /* Mark it dirty. */ + WT_ERR(__wt_page_modify_init(session, child)); + __wt_page_modify_set(session, child); + + /* Ensure the page isn't evicted or split for now. */ + __split_child_block_evict_and_split(child); + + /* + * The newly allocated child's page index references the same + * structures as the parent. (We cannot move WT_REF structures, + * threads may be underneath us right now changing the structure + * state.) However, if the WT_REF structures reference on-page + * information, we have to fix that, because the disk image for + * the page that has an page index entry for the WT_REF is about + * to be discarded. + */ + child_pindex = WT_INTL_INDEX_GET_SAFE(child); + child_incr = 0; + for (child_refp = child_pindex->index, + j = 0; j < slots; ++child_refp, ++page_refp, ++j) + WT_ERR(__split_ref_move(session, page, + page_refp, &page_decr, child_refp, &child_incr)); + + __wt_cache_page_inmem_incr(session, child, child_incr); + } + WT_ASSERT(session, alloc_refp - + alloc_index->index == (ptrdiff_t)alloc_index->entries); + WT_ASSERT(session, + page_refp - pindex->index == (ptrdiff_t)pindex->entries); + + /* Split into the parent. */ + WT_ERR(__split_parent(session, page_ref, alloc_index->index, + alloc_index->entries, parent_incr, false, false)); + + /* + * A note on error handling: until this point, there's no problem with + * unwinding on error. We allocated a new page index, a new set of + * WT_REFs and a new set of child pages -- if an error occurred, the + * page remained unchanged, although it may have an incorrect memory + * footprint. From now on we've modified the parent page, attention + * needs to be paid. However, subsequent failures are relatively benign, + * the split is OK and complete. For that reason, we ignore errors past + * this point unless there's a panic. + */ + complete = true; + + /* Confirm the page's index hasn't moved, then update it. */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); + WT_INTL_INDEX_SET(page, replace_index); + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, page)); +#endif + + /* Fix up the moved WT_REF structures. */ + WT_ERR(__split_ref_move_final( + session, alloc_index->index + 1, alloc_index->entries - 1)); + + /* + * We don't care about the page-index we allocated, all we needed was + * the array of WT_REF structures, which has now been split into the + * parent page. + */ + __wt_free(session, alloc_index); + + /* + * We can't free the previous page's index, there may be threads using + * it. Add to the session's discard list, to be freed once we know no + * threads can still be using it. + * + * This change requires care with error handling, we've already updated + * the parent page. Even if stashing the old value fails, we don't roll + * back that change, because threads may already be using the new parent + * page. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); + page_decr += size; + + /* Adjust the page's memory footprint, and mark it dirty. */ + __wt_cache_page_inmem_incr(session, page, page_incr); + __wt_cache_page_inmem_decr(session, page, page_decr); + __wt_page_modify_set(session, page); err: /* * If complete is true, we saw an error after opening up the tree to - * descent through the parent page's new index. There is nothing we - * can do, there are threads potentially active in both versions of - * the tree. + * descent through the page's new index. There is nothing we can do, + * there are threads potentially active in both versions of the tree. * * A note on error handling: if we completed the split, return success, * nothing really bad can have happened, and our caller has to proceed * with the split. */ - if (!complete) - __wt_free_ref_index(session, parent, alloc_index, true); + if (!complete) { + __wt_free_ref_index(session, page, alloc_index, true); + __wt_free_ref_index(session, page, replace_index, false); + } if (ret != 0 && ret != WT_PANIC) __wt_err(session, ret, - "ignoring not-fatal error during parent page split to " - "deepen the tree"); + "ignoring not-fatal error during internal page split"); return (ret == WT_PANIC || !complete ? ret : 0); } /* + * __split_internal_lock -- + * Lock an internal page. + */ +static int +__split_internal_lock( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp) +{ + WT_DECL_RET; + WT_PAGE *parent; + WT_REF *parent_ref; + + *hazardp = false; + *parentp = NULL; + + /* + * A checkpoint reconciling this parent page can deadlock with + * our split. We have an exclusive page lock on the child before + * we acquire the page's reconciliation lock, and reconciliation + * acquires the page's reconciliation lock before it encounters + * the child's exclusive lock (which causes reconciliation to + * loop until the exclusive lock is resolved). If we want to split + * the parent, give up to avoid that deadlock. + */ + if (S2BT(session)->checkpointing != WT_CKPT_OFF) + return (EBUSY); + + /* + * Get a page-level lock on the parent to single-thread splits into the + * page because we need to single-thread sizing/growing the page index. + * It's OK to queue up multiple splits as the child pages split, but the + * actual split into the parent has to be serialized. Note we allocate + * memory inside of the lock and may want to invest effort in making the + * locked period shorter. + * + * We use the reconciliation lock here because not only do we have to + * single-thread the split, we have to lock out reconciliation of the + * parent because reconciliation of the parent can't deal with finding + * a split child during internal page traversal. Basically, there's no + * reason to use a different lock if we have to block reconciliation + * anyway. + */ + for (;;) { + parent = ref->home; + + /* Skip pages that aren't ready to split. */ + if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK)) + return (EBUSY); + + WT_RET(__wt_fair_lock(session, &parent->page_lock)); + if (parent == ref->home) + break; + WT_RET(__wt_fair_unlock(session, &parent->page_lock)); + } + + /* + * We have exclusive access to split the parent, and at this point, the + * child prevents the parent from being evicted. However, once we + * update the parent's index, it may no longer refer to the child, and + * could conceivably be evicted. Get a hazard pointer on the parent + * now, so that we can safely access it after updating the index. + * + * Take care getting the page doesn't trigger eviction work: we could + * block trying to split a different child of our parent and deadlock + * or we could be the eviction server relied upon by other threads to + * populate the eviction queue. + */ + if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { + WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); + *hazardp = true; + } + + *parentp = parent; + return (0); + +err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); + return (ret); +} + +/* + * __split_internal_unlock -- + * Unlock the parent page. + */ +static int +__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) +{ + WT_DECL_RET; + + if (hazard) + ret = __wt_hazard_clear(session, parent); + + WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); + return (ret); +} + +/* + * __split_internal_should_split -- + * Return if we should split an internal page. + */ +static bool +__split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_BTREE *btree; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + + btree = S2BT(session); + page = ref->page; + + /* + * Our caller is holding the parent page locked to single-thread splits, + * which means we can safely look at the page's index without setting a + * split generation. + */ + pindex = WT_INTL_INDEX_GET_SAFE(page); + + /* Sanity check for a reasonable number of on-page keys. */ + if (pindex->entries < 100) + return (false); + + /* + * Deepen the tree if the page's memory footprint is larger than the + * maximum size for a page in memory (presumably putting eviction + * pressure on the cache). + */ + if (page->memory_footprint > btree->maxmempage) + return (true); + + /* + * Check if the page has enough keys to make it worth splitting. If + * the number of keys is allowed to grow too large, the cost of + * splitting into parent pages can become large enough to result + * in slow operations. + */ + if (pindex->entries > btree->split_deepen_min_child) + return (true); + + return (false); +} + +/* + * __split_parent_climb -- + * Check if we should split up the tree. + */ +static int +__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) +{ + WT_DECL_RET; + WT_PAGE *parent; + WT_REF *ref; + bool parent_hazard; + + /* + * Page splits trickle up the tree, that is, as leaf pages grow large + * enough and are evicted, they'll split into their parent. And, as + * that parent page grows large enough and is evicted, it splits into + * its parent and so on. When the page split wave reaches the root, + * the tree will permanently deepen as multiple root pages are written. + * + * However, this only helps if internal pages are evicted (and we resist + * evicting internal pages for obvious reasons), or if the tree were to + * be closed and re-opened from a disk image, which may be a rare event. + * + * To avoid internal pages becoming too large absent eviction, check + * parent pages each time pages are split into them. If the page is big + * enough, either split the page into its parent or, in the case of the + * root, deepen the tree. + * + * Split up the tree. + */ + for (;;) { + parent = NULL; + parent_hazard = false; + ref = page->pg_intl_parent_ref; + + /* If we don't need to split the page, we're done. */ + if (!__split_internal_should_split(session, ref)) + break; + + /* + * If we've reached the root page, there are no subsequent pages + * to review, deepen the tree and quit. + */ + if (__wt_ref_is_root(ref)) { + ret = __split_root(session, page); + break; + } + + /* + * Lock the parent and split into it, then swap the parent/page + * locks, lock-coupling up the tree. + */ + WT_ERR(__split_internal_lock( + session, ref, &parent, &parent_hazard)); + ret = __split_internal(session, parent, page); + WT_TRET(__split_internal_unlock(session, page, page_hazard)); + + page = parent; + page_hazard = parent_hazard; + parent = NULL; + parent_hazard = false; + WT_ERR(ret); + } + +err: if (parent != NULL) + WT_TRET( + __split_internal_unlock(session, parent, parent_hazard)); + WT_TRET(__split_internal_unlock(session, page, page_hazard)); + + /* A page may have been busy, in which case return without error. */ + WT_RET_BUSY_OK(ret); + return (0); +} + +/* * __split_multi_inmem -- * Instantiate a page in a multi-block set. */ @@ -901,369 +1600,6 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, } /* - * __split_parent_lock -- - * Lock the parent page. - */ -static int -__split_parent_lock( - WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp) -{ - WT_DECL_RET; - WT_PAGE *parent; - WT_REF *parent_ref; - - *hazardp = false; - *parentp = NULL; - - /* - * A checkpoint reconciling this parent page can deadlock with - * our split. We have an exclusive page lock on the child before - * we acquire the page's reconciliation lock, and reconciliation - * acquires the page's reconciliation lock before it encounters - * the child's exclusive lock (which causes reconciliation to - * loop until the exclusive lock is resolved). If we want to split - * the parent, give up to avoid that deadlock. - */ - if (S2BT(session)->checkpointing != WT_CKPT_OFF) - return (EBUSY); - - /* - * Get a page-level lock on the parent to single-thread splits into the - * page because we need to single-thread sizing/growing the page index. - * It's OK to queue up multiple splits as the child pages split, but the - * actual split into the parent has to be serialized. Note we allocate - * memory inside of the lock and may want to invest effort in making the - * locked period shorter. - * - * We use the reconciliation lock here because not only do we have to - * single-thread the split, we have to lock out reconciliation of the - * parent because reconciliation of the parent can't deal with finding - * a split child during internal page traversal. Basically, there's no - * reason to use a different lock if we have to block reconciliation - * anyway. - */ - for (;;) { - parent = ref->home; - WT_RET(__wt_fair_lock(session, &parent->page_lock)); - if (parent == ref->home) - break; - /* Try again if the page deepened while we were waiting */ - WT_RET(__wt_fair_unlock(session, &parent->page_lock)); - } - - /* - * We have exclusive access to split the parent, and at this point, the - * child prevents the parent from being evicted. However, once we - * update the parent's index, it will no longer refer to the child, and - * could conceivably be evicted. Get a hazard pointer on the parent - * now, so that we can safely access it after updating the index. - * - * Take care getting the page doesn't trigger eviction work: we could - * block trying to split a different child of our parent and deadlock - * or we could be the eviction server relied upon by other threads to - * populate the eviction queue. - */ - if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { - WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); - *hazardp = true; - } - - *parentp = parent; - return (0); - -err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); - return (ret); -} - -/* - * __split_parent_unlock -- - * Unlock the parent page. - */ -static int -__split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) -{ - WT_DECL_RET; - - if (hazard) - ret = __wt_hazard_clear(session, parent); - - WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); - return (ret); -} - -/* - * __split_parent -- - * Resolve a multi-page split, inserting new information into the parent. - */ -static int -__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, - WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive) -{ - WT_DECL_RET; - WT_IKEY *ikey; - WT_PAGE *parent; - WT_PAGE_INDEX *alloc_index, *pindex; - WT_REF **alloc_refp, *next_ref, *parent_ref; - size_t parent_decr, size; - uint64_t split_gen; - uint32_t i, j; - uint32_t deleted_entries, parent_entries, result_entries; - bool complete; - - parent = ref->home; - parent_ref = parent->pg_intl_parent_ref; - - alloc_index = pindex = NULL; - parent_decr = 0; - parent_entries = 0; - complete = false; - - /* - * We've locked the parent, which means it cannot split (which is the - * only reason to worry about split generation values). - */ - pindex = WT_INTL_INDEX_GET_SAFE(parent); - parent_entries = pindex->entries; - - /* - * Remove any refs to deleted pages while we are splitting, we have - * the internal page locked down, and are copying the refs into a new - * array anyway. Switch them to the special split state, so that any - * reading thread will restart. Include the ref we are splitting in - * the count to be deleted. - */ - for (deleted_entries = 1, i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); - if (next_ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, next_ref, true) && - __wt_atomic_casv32( - &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) - deleted_entries++; - } - - /* - * The final entry count consists of the original count, plus any new - * pages, less any WT_REFs we're removing. - */ - result_entries = (parent_entries + new_entries) - deleted_entries; - - /* - * If the entire (sub)tree is empty, give up: we can't leave an empty - * internal page. Mark it to be evicted soon and clean up any - * references that have changed state. - */ - if (result_entries == 0) { - __wt_page_evict_soon(parent); - goto err; - } - - /* - * Allocate and initialize a new page index array for the parent, then - * copy references from the original index array, plus references from - * the newly created split array, into place. - */ - size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); - WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); - parent_incr += size; - alloc_index->index = (WT_REF **)(alloc_index + 1); - alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref == ref) { - for (j = 0; j < new_entries; ++j) { - ref_new[j]->home = parent; - *alloc_refp++ = ref_new[j]; - - /* - * Clear the split reference as it moves to the - * allocated page index, so it never appears on - * both after an error. - */ - ref_new[j] = NULL; - } - - /* - * We detect append-style workloads to avoid repeatedly - * deepening parts of the tree where no work is being - * done by tracking if we're splitting after the slots - * created by the last split to deepen this parent. - * - * Note the calculation: i is a 0-based array offset and - * split-last is a count of entries, also either or both - * i and split-last might be unsigned 0, don't decrement - * either one. - */ - if (i > parent->pg_intl_deepen_split_last) - parent-> - pg_intl_deepen_split_append += new_entries; - } else if (next_ref->state != WT_REF_SPLIT) - /* Skip refs we have marked for deletion. */ - *alloc_refp++ = next_ref; - } - - /* Check that we filled in all the entries. */ - WT_ASSERT(session, - alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); - - /* - * Confirm the parent page's index hasn't moved then update it, which - * makes the split visible to threads descending the tree. - */ - WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); - WT_INTL_INDEX_SET(parent, alloc_index); - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - alloc_index = NULL; - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); -#endif - - /* - * Reset the page's original WT_REF field to split. Threads cursoring - * through the tree were blocked because that WT_REF state was set to - * locked. This update changes the locked state to split, unblocking - * those threads and causing them to re-calculate their position based - * on the updated parent page's index. - */ - WT_PUBLISH(ref->state, WT_REF_SPLIT); - - /* - * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by freeing allocated memory - * because the original page is unchanged, we can continue to use it - * and we have not yet modified the parent. Failures after we swap - * the new page index into the parent are also relatively benign, the - * split is OK and complete. For those reasons, we ignore errors past - * this point unless there's a panic. - */ - complete = true; - - WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%s split into parent %" PRIu32 " -> %" PRIu32 - " (%" PRIu32 ")", ref->page == NULL ? - "reverse" : __wt_page_type_string(ref->page->type), - parent_entries, result_entries, result_entries - parent_entries)); - - /* - * The new page index is in place, free the WT_REF we were splitting - * and any deleted WT_REFs we found, modulo the usual safe free - * semantics. - */ - for (i = 0; deleted_entries > 0 && i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref->state != WT_REF_SPLIT) - continue; - --deleted_entries; - - /* - * We set the WT_REF to split, discard it, freeing any resources - * it holds. - * - * Row-store trees where the old version of the page is being - * discarded: the previous parent page's key for this child page - * may have been an on-page overflow key. In that case, if the - * key hasn't been deleted, delete it now, including its backing - * blocks. We are exchanging the WT_REF that referenced it for - * the split page WT_REFs and their keys, and there's no longer - * any reference to it. Done after completing the split (if we - * failed, we'd leak the underlying blocks, but the parent page - * would be unaffected). - */ - if (parent->type == WT_PAGE_ROW_INT) { - WT_TRET(__split_ovfl_key_cleanup( - session, parent, next_ref)); - ikey = __wt_ref_key_instantiated(next_ref); - if (ikey != NULL) { - size = sizeof(WT_IKEY) + ikey->size; - WT_TRET(__split_safe_free( - session, split_gen, 0, ikey, size)); - parent_decr += size; - } - /* - * The page_del structure can be freed immediately: it - * is only read when the ref state is WT_REF_DELETED. - * The size of the structure wasn't added to the parent, - * don't decrement. - */ - if (next_ref->page_del != NULL) { - __wt_free(session, - next_ref->page_del->update_list); - __wt_free(session, next_ref->page_del); - } - } - - WT_TRET(__split_safe_free( - session, split_gen, 0, next_ref, sizeof(WT_REF))); - parent_decr += sizeof(WT_REF); - } - - /* We freed the reference that was split in the loop above. */ - ref = NULL; - - /* - * We can't free the previous page index, there may be threads using it. - * Add it to the session discard list, to be freed when it's safe. - */ - size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); - parent_decr += size; - - /* - * Adjust the parent's memory footprint. - */ - __wt_cache_page_inmem_incr(session, parent, parent_incr); - __wt_cache_page_inmem_decr(session, parent, parent_decr); - - /* - * Simple page splits trickle up the tree, that is, as leaf pages grow - * large enough and are evicted, they'll split into their parent. And, - * as that parent grows large enough and is evicted, it will split into - * its parent and so on. When the page split wave reaches the root, - * the tree will permanently deepen as multiple root pages are written. - * However, this only helps if first, the pages are evicted (and - * we resist evicting internal pages for obvious reasons), and second, - * if the tree is closed and re-opened from a disk image, which may be - * a rare event. - * To avoid the case of internal pages becoming too large when they - * aren't being evicted, check internal pages each time a leaf page is - * split into them. If it's big enough, deepen the tree at that point. - * Do the check here because we've just grown the parent page and - * are holding it locked. - */ - if (ret == 0 && !exclusive && - __split_should_deepen(session, parent_ref)) - ret = __split_deepen(session, parent); - -err: /* - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) { - for (i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref->state == WT_REF_SPLIT) - next_ref->state = WT_REF_DELETED; - } - - /* If we gave up on a reverse split, unlock the child. */ - if (ref_new == NULL) { - WT_ASSERT(session, ref->state == WT_REF_LOCKED); - ref->state = WT_REF_DELETED; - } - - __wt_free_ref_index(session, NULL, alloc_index, false); - } - - if (ret != 0 && ret != WT_PANIC) - __wt_err(session, ret, - "ignoring not-fatal error during parent page split"); - return (ret == WT_PANIC || !complete ? ret : 0); -} - -/* * __split_insert -- * Split a page's last insert list entries into a separate page. */ @@ -1279,6 +1615,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) size_t page_decr, parent_incr, right_incr; int i; + WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); + WT_STAT_FAST_DATA_INCR(session, cache_inmem_split); + page = ref->page; right = NULL; page_decr = parent_incr = right_incr = 0; @@ -1491,7 +1830,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, false)) != 0) { + session, ref, split_ref, 2, parent_incr, false, true)) != 0) { /* * Move the insert list element back to the original page list. * For simplicity, the previous skip list pointers originally @@ -1513,9 +1852,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_ERR(ret); } - WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); - WT_STAT_FAST_DATA_INCR(session, cache_inmem_split); - return (0); err: if (split_ref[0] != NULL) { @@ -1543,83 +1879,21 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_insert(session, ref); - WT_TRET(__split_parent_unlock(session, parent, hazard)); - return (ret); -} + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-insert", ref->page)); -/* - * __wt_split_reverse -- - * We have a locked ref that is empty and we want to rewrite the index in - * its parent. - */ -int -__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_PAGE *parent; - bool hazard; - - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_parent(session, ref, NULL, 0, 0, 0); - WT_TRET(__split_parent_unlock(session, parent, hazard)); - return (ret); -} - -/* - * __wt_split_rewrite -- - * Rewrite an in-memory page with a new version. - */ -int -__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_MODIFY *mod; - WT_REF new; - - page = ref->page; - mod = page->modify; - - /* - * This isn't a split: a reconciliation failed because we couldn't write - * something, and in the case of forced eviction, we need to stop this - * page from being such a problem. We have exclusive access, rewrite the - * page in memory. The code lives here because the split code knows how - * to re-create a page in memory after it's been reconciled, and that's - * exactly what we want to do. - * - * Build the new page. - */ - memset(&new, 0, sizeof(new)); - WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); - - /* - * The rewrite succeeded, we can no longer fail. - * - * Finalize the move, discarding moved update lists from the original - * page. - */ - __split_multi_inmem_final(page, &mod->mod_multi[0]); + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + if ((ret = __split_insert(session, ref)) != 0) { + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); + } /* - * Discard the original page. - * - * Pages with unresolved changes are not marked clean during - * reconciliation, do it now. + * Split up through the tree as necessary; we're holding the original + * parent page locked, note the functions we call are responsible for + * releasing that lock. */ - __wt_page_modify_clear(session, page); - __wt_ref_out(session, ref); - - /* Swap the new page into place. */ - ref->page = new.page; - WT_PUBLISH(ref->state, WT_REF_MEM); - - return (0); - -err: __split_multi_inmem_fail(session, &new); - return (ret); + return (__split_parent_climb(session, parent, hazard)); } /* @@ -1636,6 +1910,9 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) size_t parent_incr; uint32_t i, new_entries; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_leaf); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_leaf); + page = ref->page; mod = page->modify; new_entries = mod->mod_multi_entries; @@ -1656,10 +1933,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * exclusively. */ WT_ERR(__split_parent( - session, ref, ref_new, new_entries, parent_incr, closing)); - - WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); - WT_STAT_FAST_DATA_INCR(session, cache_eviction_split); + session, ref, ref_new, new_entries, parent_incr, closing, true)); /* * The split succeeded, we can no longer fail. @@ -1697,8 +1971,98 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_PAGE *parent; bool hazard; - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_multi(session, ref, closing); - WT_TRET(__split_parent_unlock(session, parent, hazard)); + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-multi", ref->page)); + + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); + } + + /* + * Split up through the tree as necessary; we're holding the original + * parent page locked, note the functions we call are responsible for + * releasing that lock. + */ + return (__split_parent_climb(session, parent, hazard)); +} + +/* + * __wt_split_reverse -- + * We have a locked ref that is empty and we want to rewrite the index in + * its parent. + */ +int +__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *parent; + bool hazard; + + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: reverse-split", ref->page)); + + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + ret = __split_parent(session, ref, NULL, 0, 0, false, true); + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); +} + +/* + * __wt_split_rewrite -- + * Rewrite an in-memory page with a new version. + */ +int +__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_REF new; + + page = ref->page; + mod = page->modify; + + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-rewrite", ref->page)); + + /* + * This isn't a split: a reconciliation failed because we couldn't write + * something, and in the case of forced eviction, we need to stop this + * page from being such a problem. We have exclusive access, rewrite the + * page in memory. The code lives here because the split code knows how + * to re-create a page in memory after it's been reconciled, and that's + * exactly what we want to do. + * + * Build the new page. + */ + memset(&new, 0, sizeof(new)); + WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); + + /* + * The rewrite succeeded, we can no longer fail. + * + * Finalize the move, discarding moved update lists from the original + * page. + */ + __split_multi_inmem_final(page, &mod->mod_multi[0]); + + /* + * Discard the original page. + * + * Pages with unresolved changes are not marked clean during + * reconciliation, do it now. + */ + __wt_page_modify_clear(session, page); + __wt_ref_out(session, ref); + + /* Swap the new page into place. */ + ref->page = new.page; + WT_PUBLISH(ref->state, WT_REF_MEM); + + return (0); + +err: __split_multi_inmem_fail(session, &new); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 7395cce11e1..07bb2eb3a01 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -191,7 +191,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, - WT_TIMEDIFF(end, start) / WT_MILLION)); + WT_TIMEDIFF_MS(end, start))); } err: /* On error, clear any left-over tree walk. */ diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index d02f23ed164..e9fa570f97b 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -22,7 +22,7 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_INSERT *ins; WT_INSERT_HEAD *ins_head; WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; uint32_t base, indx, limit; int depth; @@ -37,10 +37,12 @@ __wt_col_search(WT_SESSION_IMPL *session, goto leaf_only; } +restart_root: /* Search the internal pages of the tree. */ current = &btree->root; - for (depth = 2;; ++depth) { -restart: page = current->page; + for (depth = 2, pindex = NULL;; ++depth) { + parent_pindex = pindex; +restart_page: page = current->page; if (page->type != WT_PAGE_COL_INT) break; @@ -51,8 +53,19 @@ restart: page = current->page; descent = pindex->index[base - 1]; /* Fast path appends. */ - if (recno >= descent->key.recno) + if (recno >= descent->key.recno) { + /* + * If on the last slot (the key is larger than any key + * on the page), check for an internal page split race. + */ + if (parent_pindex != NULL && + __wt_split_intl_race( + session, current->home, parent_pindex)) { + WT_RET(__wt_page_release(session, current, 0)); + goto restart_root; + } goto descend; + } /* Binary search of internal pages. */ for (base = 0, @@ -90,15 +103,13 @@ descend: /* * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. */ - switch (ret = __wt_page_swap(session, current, descent, 0)) { - case 0: + if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { current = descent; - break; - case WT_RESTART: - goto restart; - default: - return (ret); + continue; } + if (ret == WT_RESTART) + goto restart_page; + return (ret); } /* Track how deep the tree gets. */ diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 7b21f1e40bb..d2d8a4640ca 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -144,7 +144,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_DECL_RET; WT_ITEM *item; WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; WT_ROW *rip; size_t match, skiphigh, skiplow; @@ -155,16 +155,16 @@ __wt_row_search(WT_SESSION_IMPL *session, btree = S2BT(session); collator = btree->collator; item = cbt->tmp; + current = NULL; __cursor_pos_clear(cbt); /* - * The row-store search routine uses a different comparison API. - * The assumption is we're comparing more than a few keys with - * matching prefixes, and it's a win to avoid the memory fetches - * by skipping over those prefixes. That's done by tracking the - * length of the prefix match for the lowest and highest keys we - * compare as we descend the tree. + * In some cases we expect we're comparing more than a few keys with + * matching prefixes, so it's faster to avoid the memory fetches by + * skipping over those prefixes. That's done by tracking the length of + * the prefix match for the lowest and highest keys we compare as we + * descend the tree. */ skiphigh = skiplow = 0; @@ -186,10 +186,11 @@ __wt_row_search(WT_SESSION_IMPL *session, } /* Search the internal pages of the tree. */ - cmp = -1; +restart_root: current = &btree->root; - for (depth = 2;; ++depth) { -restart: page = current->page; + for (depth = 2, pindex = NULL;; ++depth) { + parent_pindex = pindex; +restart_page: page = current->page; if (page->type != WT_PAGE_ROW_INT) break; @@ -211,7 +212,7 @@ restart: page = current->page; WT_ERR(__wt_compare( session, collator, srch_key, item, &cmp)); if (cmp >= 0) - goto descend; + goto append; /* A failed append check turns off append checks. */ append_check = false; @@ -252,7 +253,26 @@ restart: page = current->page; } else if (cmp == 0) goto descend; } - else if (collator == NULL) + else if (collator == NULL) { + /* + * Reset the skipped prefix counts; we'd normally expect + * the parent's skipped prefix values to be larger than + * the child's values and so we'd only increase them as + * we walk down the tree (in other words, if we can skip + * N bytes on the parent, we can skip at least N bytes + * on the child). However, if a child internal page was + * split up into the parent, the child page's key space + * will have been truncated, and the values from the + * parent's search may be wrong for the child. We only + * need to reset the high count because the split-page + * algorithm truncates the end of the internal page's + * key space, the low count is still correct. We also + * don't need to clear either count when transitioning + * to a leaf page, a leaf page's key space can't change + * in flight. + */ + skiphigh = 0; + for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); descent = pindex->index[indx]; @@ -271,7 +291,7 @@ restart: page = current->page; else goto descend; } - else + } else for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); descent = pindex->index[indx]; @@ -288,9 +308,10 @@ restart: page = current->page; } /* - * Set the slot to descend the tree: descent is already set if - * there was an exact match on the page, otherwise, base is - * the smallest index greater than key, possibly (last + 1). + * Set the slot to descend the tree: descent was already set if + * there was an exact match on the page, otherwise, base is the + * smallest index greater than key, possibly one past the last + * slot. */ descent = pindex->index[base - 1]; @@ -298,25 +319,41 @@ restart: page = current->page; * If we end up somewhere other than the last slot, it's not a * right-side descent. */ - if (pindex->entries != base - 1) + if (pindex->entries != base) descend_right = false; + /* + * If on the last slot (the key is larger than any key on the + * page), check for an internal page split race. + */ + if (pindex->entries == base) { +append: if (parent_pindex != NULL && + __wt_split_intl_race( + session, current->home, parent_pindex)) { + if ((ret = __wt_page_release( + session, current, 0)) != 0) + return (ret); + + skiplow = skiphigh = 0; + goto restart_root; + } + } + descend: /* * Swap the current page for the child page. If the page splits * while we're retrieving it, restart the search in the current * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. */ - switch (ret = __wt_page_swap(session, current, descent, 0)) { - case 0: + if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { current = descent; - break; - case WT_RESTART: + continue; + } + if (ret == WT_RESTART) { skiphigh = skiplow = 0; - goto restart; - default: - return (ret); + goto restart_page; } + return (ret); } /* Track how deep the tree gets. */ @@ -517,7 +554,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) __cursor_pos_clear(cbt); -restart: +restart_root: /* Walk the internal pages of the tree. */ current = &btree->root; for (;;) { @@ -544,7 +581,7 @@ restart: */ if (ret == WT_RESTART && (ret = __wt_page_release(session, current, 0)) == 0) - goto restart; + goto restart_root; return (ret); } |