summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree
diff options
context:
space:
mode:
authorRamon Fernandez <rfmnyc@gmail.com>2015-11-19 09:37:38 -0500
committerRamon Fernandez <rfmnyc@gmail.com>2015-11-19 09:41:39 -0500
commita0771ea5ec1b44537d3c409e3d712db24fd8e6bb (patch)
tree62517780ad0982ec80b8a6d968a72cf0474df617 /src/third_party/wiredtiger/src/btree
parent042d8fa2d252142489c5fa3009927bad20d77efb (diff)
downloadmongo-a0771ea5ec1b44537d3c409e3d712db24fd8e6bb.tar.gz
Import wiredtiger-wiredtiger-mongodb-3.2.0-rc3-177-g9d375e3.tar.gz from wiredtiger branch mongodb-3.2
ref: d9ec1ff..9d375e3 16c0a1a WT-1315 Fix some leaks with join cursors. 59857f9 WT-2222 Add statistics for named snapshots. 4368d39 WT-1315 Cursor join implementation a72ddb7 WT-2218 Add truncate stats fb9cebe WT-2224 Track which deleted refs are discarded by a split. e2f1130 WT-2220 Split WT_TIMEDIFF macro into unit specific macros. be412b5 WT-2182 when internal pages grow large enough, split them into their parents ce8c091 WT-2219 Enhancements to in-memory testing 347d922 WT-2220 time_t cleanup. 08c0fcd WT-2217 change WT_CURSOR.insert to clear "set" key/value on return d1b5e7f WT-2135 Fix log_only setting for backup cursor. Fix initialization. 78bd4ac WT-2210 raw compression fails if row-store recovery precedes column-store recovery c1b2634 WT-2182 fixes for splitting up the tree. 0a1ee34 WT-2199 Fix transaction sync inconsistency. ee31bb2 WT-2182 Simplify the split deepen logic. c360d53 WT-2212 Add a "use_environment" config to "wiredtiger_open" 3f132a4 WT-2182 detect internal page split races.
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c1
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_delete.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_handle.c17
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_read.c6
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c1808
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c33
-rw-r--r--src/third_party/wiredtiger/src/btree/row_srch.c89
9 files changed, 1194 insertions, 766 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 3290fd6374c..69512f45933 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -1093,6 +1093,7 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
cbt = (start != NULL) ? start : stop;
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = cbt->btree;
+ WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
/*
* We always delete in a forward direction because it's faster, assert
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 8edc40794e2..0f47c060daf 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -566,7 +566,7 @@ __debug_tree(
/* A NULL page starts at the top of the tree -- it's a convenience. */
if (page == NULL)
- page = S2BT(session)->root.page;
+ page = btree->root.page;
WT_WITH_BTREE(session, btree, ret = __debug_page(ds, page, flags));
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 757b7b51cdd..98c6390e0f4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -138,6 +138,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
WT_ERR(__wt_txn_modify_ref(session, ref));
*skipp = true;
+ WT_STAT_FAST_CONN_INCR(session, rec_page_delete_fast);
+ WT_STAT_FAST_DATA_INCR(session, rec_page_delete_fast);
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 3e611a107ab..dbdf94fc1b6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -643,11 +643,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
uint64_t cache_size;
uint32_t intl_split_size, leaf_split_size;
const char **cfg;
btree = S2BT(session);
+ conn = S2C(session);
cfg = btree->dhandle->cfg;
/*
@@ -688,8 +690,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
btree->maxmempage =
WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage);
- if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) {
- if ((cache_size = S2C(session)->cache_size) > 0)
+ if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) {
+ if ((cache_size = conn->cache_size) > 0)
btree->maxmempage =
WT_MIN(btree->maxmempage, cache_size / 4);
}
@@ -723,6 +725,17 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
/*
* Get the maximum internal/leaf page key/value sizes.
*
+ * In-memory configuration overrides any key/value sizes, there's no
+ * such thing as an overflow item in an in-memory configuration.
+ */
+ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
+ btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE;
+ btree->maxleafkey = WT_BTREE_MAX_OBJECT_SIZE;
+ btree->maxleafvalue = WT_BTREE_MAX_OBJECT_SIZE;
+ return (0);
+ }
+
+ /*
* In historic versions of WiredTiger, the maximum internal/leaf page
* key/value sizes were set by the internal_item_max and leaf_item_max
* configuration strings. Look for those strings if we don't find the
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index e60f7b3fb02..389ac761c5b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -586,8 +586,8 @@ skip_evict:
* CPU to no purpose.
*/
if (stalled)
- wait_cnt += 1000;
- else if (++wait_cnt < 1000) {
+ wait_cnt += WT_THOUSAND;
+ else if (++wait_cnt < WT_THOUSAND) {
__wt_yield();
continue;
}
@@ -603,7 +603,7 @@ skip_evict:
if (cache_work)
continue;
}
- sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
+ sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
__wt_sleep(0, sleep_cnt);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 9e45bf10a5c..caba12b78f1 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -169,54 +169,58 @@ __split_safe_free(WT_SESSION_IMPL *session,
return (__split_stash_add(session, split_gen, p, s));
}
+#ifdef HAVE_DIAGNOSTIC
/*
- * __split_should_deepen --
- * Return if we should deepen the tree.
+ * __split_verify_intl_key_order --
+ * Verify the key order on an internal page after a split, diagnostic only.
*/
-static bool
-__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
+static void
+__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
- WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_ITEM *next, _next, *last, _last, *tmp;
+ WT_REF *ref;
+ uint64_t recno;
+ int cmp;
+ bool first;
btree = S2BT(session);
- page = ref->page;
-
- /*
- * Our caller is holding the parent page locked to single-thread splits,
- * which means we can safely look at the page's index without setting a
- * split generation.
- */
- pindex = WT_INTL_INDEX_GET_SAFE(page);
-
- /*
- * Sanity check for a reasonable number of keys on-page keys. Splitting
- * with too few keys leads to excessively deep trees.
- */
- if (pindex->entries < 100)
- return (false);
-
- /*
- * Deepen the tree if the page's memory footprint is larger than the
- * maximum size for a page in memory (presumably putting eviction
- * pressure on the cache).
- */
- if (page->memory_footprint > btree->maxmempage)
- return (true);
- /*
- * Check if the page has enough keys to make it worth splitting. If
- * the number of keys is allowed to grow too large, the cost of
- * splitting into parent pages can become large enough to result
- * in slow operations.
- */
- if (!__wt_ref_is_root(ref) &&
- pindex->entries > btree->split_deepen_min_child)
- return (true);
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ recno = 0; /* Less than any valid record number. */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->key.recno > recno);
+ recno = ref->key.recno;
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ next = &_next;
+ WT_CLEAR(_next);
+ last = &_last;
+ WT_CLEAR(_last);
- return (false);
+ first = true;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &next->data, &next->size);
+ if (last->size == 0) {
+ if (first)
+ first = false;
+ else {
+ WT_ASSERT(session, __wt_compare(
+ session, btree->collator, last,
+ next, &cmp) == 0);
+ WT_ASSERT(session, cmp < 0);
+ }
+ }
+ tmp = last;
+ last = next;
+ next = tmp;
+ } WT_INTL_FOREACH_END;
+ break;
+ }
}
+#endif
/*
* __split_ovfl_key_cleanup --
@@ -267,47 +271,58 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
}
/*
- * __split_ref_deepen_move --
- * Move a WT_REF from a parent to a child in service of a split to deepen
- * the tree, including updating the accounting information.
+ * __split_ref_move --
+ * Move a WT_REF from one page to another, including updating accounting
+ * information.
*/
static int
-__split_ref_deepen_move(WT_SESSION_IMPL *session,
- WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
+__split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
+ WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp)
{
WT_ADDR *addr;
WT_CELL_UNPACK unpack;
WT_DECL_RET;
WT_IKEY *ikey;
+ WT_REF *ref;
size_t size;
void *key;
+ ref = *from_refp;
+
/*
+ * The from-home argument is the page into which the "from" WT_REF may
+ * point, for example, if there's an on-page key the "from" WT_REF
+ * references, it will be on the page "from-home".
+ *
* Instantiate row-store keys, and column- and row-store addresses in
- * the WT_REF structures referenced by a page that's being split (and
- * deepening the tree). The WT_REF structures aren't moving, but the
- * index references are moving from the page we're splitting to a set
- * of child pages, and so we can no longer reference the block image
- * that remains with the page being split.
+ * the WT_REF structures referenced by a page that's being split. The
+ * WT_REF structures aren't moving, but the index references are moving
+ * from the page we're splitting to a set of new pages, and so we can
+ * no longer reference the block image that remains with the page being
+ * split.
*
* No locking is required to update the WT_REF structure because we're
- * the only thread splitting the parent page, and there's no way for
- * readers to race with our updates of single pointers. The changes
- * have to be written before the page goes away, of course, our caller
- * owns that problem.
- *
- * Row-store keys, first.
+ * the only thread splitting the page, and there's no way for readers
+ * to race with our updates of single pointers. The changes have to be
+ * written before the page goes away, of course, our caller owns that
+ * problem.
*/
- if (parent->type == WT_PAGE_ROW_INT) {
+ if (from_home->type == WT_PAGE_ROW_INT) {
+ /*
+ * Row-store keys: if it's not yet instantiated, instantiate it.
+ * If already instantiated, check for overflow cleanup (overflow
+ * keys are always instantiated).
+ */
if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
- __wt_ref_key(parent, ref, &key, &size);
+ __wt_ref_key(from_home, ref, &key, &size);
WT_RET(__wt_row_ikey(session, 0, key, size, ref));
ikey = ref->key.ikey;
} else {
- WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
- *parent_decrp += sizeof(WT_IKEY) + ikey->size;
+ WT_RET(
+ __split_ovfl_key_cleanup(session, from_home, ref));
+ *decrp += sizeof(WT_IKEY) + ikey->size;
}
- *child_incrp += sizeof(WT_IKEY) + ikey->size;
+ *incrp += sizeof(WT_IKEY) + ikey->size;
}
/*
@@ -316,7 +331,7 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
* get the address from the on-page cell.
*/
addr = ref->addr;
- if (addr != NULL && !__wt_off_page(parent, addr)) {
+ if (addr != NULL && !__wt_off_page(from_home, addr)) {
__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
WT_RET(__wt_calloc_one(session, &addr));
if ((ret = __wt_strndup(
@@ -330,364 +345,1048 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
ref->addr = addr;
}
- /* And finally, the WT_REF itself. */
- WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF));
+ /* And finally, copy the WT_REF pointer itself. */
+ *to_refp = ref;
+ WT_MEM_TRANSFER(*decrp, *incrp, sizeof(WT_REF));
return (0);
}
-#ifdef HAVE_DIAGNOSTIC
/*
- * __split_verify_intl_key_order --
- * Verify the key order on an internal page after a split, diagnostic only.
+ * __split_child_block_evict_and_split --
+ * Ensure the newly created child isn't evicted or split for now.
*/
static void
-__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
+__split_child_block_evict_and_split(WT_PAGE *child)
{
- WT_BTREE *btree;
- WT_ITEM *next, _next, *last, _last, *tmp;
- WT_REF *ref;
- uint64_t recno;
- int cmp;
- bool first;
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page which previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * newly created page's modify structure has a field with a transaction
+ * ID that's checked before any internal page is evicted. Unfortunately,
+ * we don't know the correct value until we update the original page's
+ * index (we need a transaction ID from after that update), but the act
+ * of updating the original page's index is what allows the eviction to
+ * happen.
+ *
+ * Once the split is live, newly created internal pages might themselves
+ * split. The split itself is not the problem: if a page splits before
+ * we fix up its WT_REF (in other words, a WT_REF we move is then moved
+ * again, before we reset the underlying page's parent reference), it's
+ * OK because the test we use to find a WT_REF and WT_PAGE that require
+ * fixing up is only that the WT_REF points to the wrong parent, not it
+ * points to a specific wrong parent. The problem is our fix up of the
+ * WT_REFs in the created page could race with the subsequent fix of the
+ * same WT_REFs (in a different created page), we'd have to acquire some
+ * lock to prevent that race, and that's going to be difficult at best.
+ *
+ * For now, block eviction and splits in newly created pages until they
+ * have been fixed up.
+ */
+ F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+}
- btree = S2BT(session);
+/*
+ * __split_ref_move_final --
+ * Finalize the moved WT_REF structures after the split succeeds.
+ */
+static int
+__split_ref_move_final(
+ WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries)
+{
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_REF *ref, *child_ref;
+ uint64_t txn_new_id;
+ uint32_t i;
- switch (page->type) {
- case WT_PAGE_COL_INT:
- recno = 0; /* Less than any valid record number. */
- WT_INTL_FOREACH_BEGIN(session, page, ref) {
- WT_ASSERT(session, ref->key.recno > recno);
- recno = ref->key.recno;
- } WT_INTL_FOREACH_END;
- break;
- case WT_PAGE_ROW_INT:
- next = &_next;
- WT_CLEAR(_next);
- last = &_last;
- WT_CLEAR(_last);
+ /*
+ * When creating new internal pages as part of a split, we set a field
+ * in those pages modify structure to prevent them from being evicted
+ * until all threads are known to have exited the index of the page that
+ * previously "owned" the WT_REF. Set that field to a safe value.
+ */
+ txn_new_id = __wt_txn_new_id(session);
- first = true;
- WT_INTL_FOREACH_BEGIN(session, page, ref) {
- __wt_ref_key(page, ref, &next->data, &next->size);
- if (last->size == 0) {
- if (first)
- first = false;
- else {
- WT_ASSERT(session, __wt_compare(
- session, btree->collator, last,
- next, &cmp) == 0);
- WT_ASSERT(session, cmp < 0);
- }
+ /*
+ * The WT_REF structures moved to newly allocated child pages reference
+ * the wrong parent page and we have to fix that up. The problem is
+ * revealed when a thread of control searches for the child page's
+ * reference structure slot, and fails to find it because the parent
+ * page being searched no longer references the child. When that failure
+ * happens the thread waits for the reference's home page to be updated,
+ * which we do here: walk the children and fix them up.
+ */
+ for (i = 0; i < entries; ++i, ++refp) {
+ ref = *refp;
+
+ /*
+ * We don't hold hazard pointers on created pages, they cannot
+ * be evicted because the page-modify transaction value set as
+ * they were created prevents eviction. (See above, we reset
+ * that value as part of fixing up the page.) But, an eviction
+ * thread might be attempting to evict the page (the WT_REF may
+ * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF
+ * may be WT_REF_READING), or it may be in some other state.
+ * Acquire a hazard pointer for any in-memory pages so we know
+ * the state of the page. Ignore pages not in-memory (deleted,
+ * on-disk, being read), there's no in-memory structure to fix.
+ */
+ if ((ret = __wt_page_in(session,
+ ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
+ continue;
+ WT_ERR(ret);
+
+ child = ref->page;
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, child));
+#endif
+ /*
+ * We use a page flag to prevent the child from splitting from
+ * underneath us, but the split-generation error checks don't
+ * know about that flag; use the standard macros to ensure that
+ * reading the child's page index structure is safe.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ /*
+ * The page's home reference may not be wrong, as we
+ * opened up access from the top of the tree already,
+ * disk pages may have been read in since then, and
+ * those pages would have correct parent references.
+ */
+ if (child_ref->home != child) {
+ child_ref->home = child;
+ child_ref->pindex_hint = 0;
+
+ child->modify->mod_split_txn = txn_new_id;
}
- tmp = last;
- last = next;
- next = tmp;
} WT_INTL_FOREACH_END;
- break;
+ WT_LEAVE_PAGE_INDEX(session);
+
+ /* The child can now be evicted or split. */
+ F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
+ WT_ERR(__wt_hazard_clear(session, child));
}
+
+ /*
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+ return (0);
+
+err: /* Something really bad just happened. */
+ WT_PANIC_RET(session, ret, "fatal error resolving a split");
}
-#endif
/*
- * __split_deepen --
- * Split an internal page in-memory, deepening the tree.
+ * __split_root --
+ * Split the root page in-memory, deepening the tree.
*/
static int
-__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
+__split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *child;
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
WT_REF **alloc_refp;
- WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
- size_t child_incr, parent_decr, parent_incr, size;
+ WT_REF **child_refp, *ref, **root_refp;
+ size_t child_incr, root_decr, root_incr, size;
uint64_t split_gen;
- uint32_t children, chunk, i, j, moved_entries, new_entries, remain;
- uint32_t skip_leading, slots;
+ uint32_t children, chunk, i, j, remain;
+ uint32_t slots;
bool complete;
void *p;
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
btree = S2BT(session);
alloc_index = NULL;
- parent_incr = parent_decr = 0;
+ root_decr = root_incr = 0;
complete = false;
+ /* The root page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, root));
+
/*
- * Our caller is holding the parent page locked to single-thread splits,
+ * Our caller is holding the root page locked to single-thread splits,
* which means we can safely look at the page's index without setting a
* split generation.
*/
- pindex = WT_INTL_INDEX_GET_SAFE(parent);
+ pindex = WT_INTL_INDEX_GET_SAFE(root);
/*
- * A prepending/appending workload will repeatedly deepen parts of the
- * tree that aren't changing, and appending workloads are not uncommon.
- * First, keep the first/last pages of the tree at their current level,
- * to catch simple workloads. Second, track the number of entries which
- * resulted from the last time we deepened this page, and if we refilled
- * this page without splitting into those slots, ignore them for this
- * split. It's not exact because an eviction might split into any part
- * of the page: if 80% of the splits are at the end of the page, assume
- * an append-style workload. Of course, the plan eventually fails: when
- * repeatedly deepening this page for an append-only workload, we will
- * progressively ignore more and more of the slots. When ignoring 90% of
- * the slots, deepen the entire page again.
- *
- * Figure out how many slots we're leaving at this level and how many
- * child pages we're creating.
+ * Decide how many child pages to create, then calculate the standard
+ * chunk and whatever remains. Sanity check the number of children:
+ * the decision to split matched to the deepen-per-child configuration
+ * might get it wrong.
*/
-#undef skip_trailing
-#define skip_trailing 1
- skip_leading = 1;
- new_entries = pindex->entries - parent->pg_intl_deepen_split_last;
- if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10)
- skip_leading = parent->pg_intl_deepen_split_last;
- if (skip_leading > (pindex->entries * 9) * 10)
- skip_leading = 1;
-
- /*
- * In a few (rare) cases we split pages with only a few entries, and in
- * those cases we keep it simple, 10 children, skip only first and last
- * entries. Otherwise, split into a lot of child pages.
- */
- moved_entries = pindex->entries - (skip_leading + skip_trailing);
- children = moved_entries / btree->split_deepen_per_child;
+ children = pindex->entries / btree->split_deepen_per_child;
if (children < 10) {
+ if (pindex->entries < 100)
+ return (EBUSY);
children = 10;
- skip_leading = 1;
- moved_entries =
- pindex->entries - (skip_leading + skip_trailing);
}
+ chunk = pindex->entries / children;
+ remain = pindex->entries - chunk * (children - 1);
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
- parent, pindex->entries, children));
+ "%p: %" PRIu32 " root page elements, splitting into %" PRIu32
+ " children",
+ root, pindex->entries, children));
/*
- * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
- * the slots of the allocated WT_PAGE_INDEX to point to the pages we're
- * keeping at the current level, and the rest of the slots to point to
- * new WT_REF objects.
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted
+ * into the root page, replacing the root's page-index.
*/
- size = sizeof(WT_PAGE_INDEX) +
- (children + skip_leading + skip_trailing) * sizeof(WT_REF *);
+ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
- parent_incr += size;
+ root_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
- alloc_index->entries = children + skip_leading + skip_trailing;
- for (alloc_refp = alloc_index->index,
- i = 0; i < skip_leading; ++alloc_refp, ++i)
- alloc_index->index[i] = pindex->index[i];
- for (i = 0; i < children; ++alloc_refp, ++i)
+ alloc_index->entries = children;
+ alloc_refp = alloc_index->index;
+ for (i = 0; i < children; alloc_refp++, ++i)
WT_ERR(__wt_calloc_one(session, alloc_refp));
- parent_incr += children * sizeof(WT_REF);
- alloc_index->index[alloc_index->entries - 1] =
- pindex->index[pindex->entries - 1];
+ root_incr += children * sizeof(WT_REF);
/* Allocate child pages, and connect them into the new page index. */
- chunk = moved_entries / children;
- remain = moved_entries - chunk * (children - 1);
- for (parent_refp = pindex->index + skip_leading,
- alloc_refp = alloc_index->index + skip_leading,
- i = 0; i < children; ++i) {
+ for (root_refp = pindex->index,
+ alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
WT_ERR(__wt_page_alloc(
- session, parent->type, 0, slots, false, &child));
+ session, root->type, 0, slots, false, &child));
/*
- * Initialize the parent page's child reference; we need a copy
- * of the page's key.
+ * Initialize the page's child reference; we need a copy of the
+ * page's key.
*/
ref = *alloc_refp++;
- ref->home = parent;
+ ref->home = root;
ref->page = child;
ref->addr = NULL;
- if (parent->type == WT_PAGE_ROW_INT) {
- __wt_ref_key(parent, *parent_refp, &p, &size);
+ if (root->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(root, *root_refp, &p, &size);
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
- parent_incr += sizeof(WT_IKEY) + size;
+ root_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = (*parent_refp)->key.recno;
+ ref->key.recno = (*root_refp)->key.recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
- if (parent->type == WT_PAGE_COL_INT)
- child->pg_intl_recno = (*parent_refp)->key.recno;
+ if (root->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*root_refp)->key.recno;
child->pg_intl_parent_ref = ref;
/* Mark it dirty. */
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /*
- * Once the split goes live, the newly created internal pages
- * might be evicted and their WT_REF structures freed. If those
- * pages are evicted before threads exit the previous page index
- * array, a thread might see a freed WT_REF. Set the eviction
- * transaction requirement for the newly created internal pages.
- */
- child->modify->mod_split_txn = __wt_txn_new_id(session);
+ /* Ensure the page isn't evicted or split for now. */
+ __split_child_block_evict_and_split(child);
/*
* The newly allocated child's page index references the same
- * structures as the parent. (We cannot move WT_REF structures,
+ * structures as the root. (We cannot move WT_REF structures,
* threads may be underneath us right now changing the structure
* state.) However, if the WT_REF structures reference on-page
* information, we have to fix that, because the disk image for
* the page that has an page index entry for the WT_REF is about
* to change.
*/
- child_incr = 0;
child_pindex = WT_INTL_INDEX_GET_SAFE(child);
- for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
- WT_ERR(__split_ref_deepen_move(session,
- parent, *parent_refp, &parent_decr, &child_incr));
- *child_refp++ = *parent_refp++;
- }
+ child_incr = 0;
+ for (child_refp = child_pindex->index,
+ j = 0; j < slots; ++child_refp, ++root_refp, ++j)
+ WT_ERR(__split_ref_move(session, root,
+ root_refp, &root_decr, child_refp, &child_incr));
+
__wt_cache_page_inmem_incr(session, child, child_incr);
}
WT_ASSERT(session,
- alloc_refp - alloc_index->index ==
- (ptrdiff_t)(alloc_index->entries - skip_trailing));
- WT_ASSERT(session, parent_refp - pindex->index ==
- (ptrdiff_t)(pindex->entries - skip_trailing));
+ alloc_refp - alloc_index->index == (ptrdiff_t)alloc_index->entries);
+ WT_ASSERT(session,
+ root_refp - pindex->index == (ptrdiff_t)pindex->entries);
/*
- * Confirm the parent page's index hasn't moved, then update it, which
+ * Confirm the root page's index hasn't moved, then update it, which
* makes the split visible to threads descending the tree. From this
* point on, we're committed to the split.
*
* A note on error handling: until this point, there's no problem with
* unwinding on error. We allocated a new page index, a new set of
* WT_REFs and a new set of child pages -- if an error occurred, the
- * parent remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the parent page, attention
+ * root remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the root page, attention
* needs to be paid. However, subsequent failures are relatively benign,
* the split is OK and complete. For that reason, we ignore errors past
* this point unless there's a panic.
*/
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
+ WT_INTL_INDEX_SET(root, alloc_index);
+ complete = true;
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, root));
+#endif
+ /* Fix up the moved WT_REF structures. */
+ WT_ERR(__split_ref_move_final(
+ session, alloc_index->index, alloc_index->entries));
+
+ /* We've installed the allocated page-index, ensure error handling. */
+ alloc_index = NULL;
+
+ /*
+ * We can't free the previous root's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling: we have already
+ * updated the page with a new index. Even if stashing the old value
+ * fails, we don't roll back that change, because threads may already
+ * be using the new index.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
+ root_decr += size;
+
+ /* Adjust the root's memory footprint and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, root, root_incr);
+ __wt_cache_page_inmem_decr(session, root, root_decr);
+ __wt_page_modify_set(session, root);
+
+err: /*
+ * If complete is true, we saw an error after opening up the tree to
+ * descent through the root page's new index. There is nothing we
+ * can do, there are threads potentially active in both versions of
+ * the tree.
+ *
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened, and our caller has to proceed
+ * with the split.
+ */
+ if (!complete)
+ __wt_free_ref_index(session, root, alloc_index, true);
+
+ if (ret != 0 && ret != WT_PANIC)
+ __wt_err(session, ret,
+ "ignoring not-fatal error during root page split to "
+ "deepen the tree");
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_parent --
+ * Resolve a multi-page split, inserting new information into the parent.
+ */
+static int
+__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
+ uint32_t new_entries, size_t parent_incr, bool exclusive, bool discard)
+{
+ WT_DECL_ITEM(scr);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_PAGE *parent;
+ WT_PAGE_INDEX *alloc_index, *pindex;
+ WT_REF **alloc_refp, *next_ref;
+ size_t parent_decr, size;
+ uint64_t split_gen;
+ uint32_t i, j;
+ uint32_t deleted_entries, parent_entries, result_entries;
+ uint32_t *deleted_refs;
+ bool complete, empty_parent;
+
+ parent = ref->home;
+
+ alloc_index = pindex = NULL;
+ parent_decr = 0;
+ parent_entries = 0;
+ complete = empty_parent = false;
+
+ /* The parent page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, parent));
+
+ /*
+ * We've locked the parent, which means it cannot split (which is the
+ * only reason to worry about split generation values).
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(parent);
+ parent_entries = pindex->entries;
+
+ /*
+ * Remove any refs to deleted pages while we are splitting, we have
+ * the internal page locked down, and are copying the refs into a new
+ * array anyway. Switch them to the special split state, so that any
+ * reading thread will restart.
+ */
+ WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
+ for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
+ if ((discard && next_ref == ref) ||
+ (next_ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, next_ref, true) &&
+ __wt_atomic_casv32(
+ &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))) {
+ WT_ERR(__wt_buf_grow(session, scr,
+ (deleted_entries + 1) * sizeof(uint32_t)));
+ deleted_refs = scr->mem;
+ deleted_refs[deleted_entries++] = i;
+ }
+ }
+
+ /*
+ * The final entry count consists of the original count, plus any new
+ * pages, less any WT_REFs we're removing (deleted entries plus the
+ * entry we're replacing).
+ */
+ result_entries = (parent_entries + new_entries) - deleted_entries;
+ if (!discard)
+ --result_entries;
+
+ /*
+ * If there are no remaining entries on the parent, give up, we can't
+ * leave an empty internal page. Mark it to be evicted soon and clean
+ * up any references that have changed state.
+ */
+ if (result_entries == 0) {
+ empty_parent = true;
+ __wt_page_evict_soon(parent);
+ goto err;
+ }
+
+ /*
+ * Allocate and initialize a new page index array for the parent, then
+ * copy references from the original index array, plus references from
+ * the newly created split array, into place.
+ */
+ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ parent_incr += size;
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = result_entries;
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref == ref)
+ for (j = 0; j < new_entries; ++j) {
+ ref_new[j]->home = parent;
+ *alloc_refp++ = ref_new[j];
+ }
+ else if (next_ref->state != WT_REF_SPLIT)
+ /* Skip refs we have marked for deletion. */
+ *alloc_refp++ = next_ref;
+ }
+
+ /* Check that we filled in all the entries. */
+ WT_ASSERT(session,
+ alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
+
+ /*
+ * Confirm the parent page's index hasn't moved then update it, which
+ * makes the split visible to threads descending the tree.
+ */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- complete = true;
+ alloc_index = NULL;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, parent));
#endif
+
/*
- * Save the number of entries created by deepening the tree and reset
- * the count of splits into this page after that point.
+ * If discarding the page's original WT_REF field, reset it to split.
+ * Threads cursoring through the tree were blocked because that WT_REF
+ * state was set to locked. Changing the locked state to split unblocks
+ * those threads and causes them to re-calculate their position based
+ * on the just-updated parent page's index.
*/
- parent->pg_intl_deepen_split_append = 0;
- parent->pg_intl_deepen_split_last = alloc_index->entries;
+ if (discard)
+ WT_PUBLISH(ref->state, WT_REF_SPLIT);
/*
- * The moved reference structures now reference the wrong parent page,
- * and we have to fix that up. The problem is revealed when a thread
- * of control searches for a page's reference structure slot, and fails
- * to find it because the page it's searching no longer references it.
- * When that failure happens, the thread waits for the reference's home
- * page to be updated, which we do here: walk the children and fix them
- * up.
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+
+ /*
+ * A note on error handling: failures before we swapped the new page
+ * index into the parent can be resolved by freeing allocated memory
+ * because the original page is unchanged, we can continue to use it
+ * and we have not yet modified the parent. Failures after we swap
+ * the new page index into the parent are also relatively benign, the
+ * split is OK and complete. For those reasons, we ignore errors past
+ * this point unless there's a panic.
+ */
+ complete = true;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32
+ " (%s%" PRIu32 ")",
+ ref->page, ref->page == NULL ?
+ "unknown page type" : __wt_page_type_string(ref->page->type),
+ ref->page == NULL ? "reverse " : "", parent,
+ parent_entries, result_entries,
+ ref->page == NULL ? "-" : "+",
+ ref->page == NULL ?
+ parent_entries - result_entries : result_entries - parent_entries));
+
+ /*
+ * The new page index is in place, free the WT_REF we were splitting and
+ * any deleted WT_REFs we found, modulo the usual safe free semantics.
*
- * We're not acquiring hazard pointers on these pages, they cannot be
- * evicted because of the eviction transaction value set above.
- */
- for (parent_refp = alloc_index->index,
- i = alloc_index->entries; i > 0; ++parent_refp, --i) {
- parent_ref = *parent_refp;
- WT_ASSERT(session, parent_ref->home == parent);
- if (parent_ref->state != WT_REF_MEM)
- continue;
+ * Acquire a new split generation.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
+ next_ref = pindex->index[deleted_refs[i]];
+ WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
/*
- * We left the first/last children of the parent at the current
- * level to avoid bad split patterns, they might be leaf pages;
- * check the page type before we continue.
- */
- child = parent_ref->page;
- if (!WT_PAGE_IS_INTERNAL(child))
- continue;
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, child));
-#endif
- /*
- * We have the parent locked, but there's nothing to prevent
- * this child from splitting beneath us; ensure that reading
- * the child's page index structure is safe.
+ * We set the WT_REF to split, discard it, freeing any resources
+ * it holds.
+ *
+ * Row-store trees where the old version of the page is being
+ * discarded: the previous parent page's key for this child page
+ * may have been an on-page overflow key. In that case, if the
+ * key hasn't been deleted, delete it now, including its backing
+ * blocks. We are exchanging the WT_REF that referenced it for
+ * the split page WT_REFs and their keys, and there's no longer
+ * any reference to it. Done after completing the split (if we
+ * failed, we'd leak the underlying blocks, but the parent page
+ * would be unaffected).
*/
- WT_ENTER_PAGE_INDEX(session);
- WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ if (parent->type == WT_PAGE_ROW_INT) {
+ WT_TRET(__split_ovfl_key_cleanup(
+ session, parent, next_ref));
+ ikey = __wt_ref_key_instantiated(next_ref);
+ if (ikey != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(
+ session, split_gen, exclusive, ikey, size));
+ parent_decr += size;
+ }
/*
- * The page's parent reference may not be wrong, as we
- * opened up access from the top of the tree already,
- * pages may have been read in since then. Check and
- * only update pages that reference the original page,
- * they must be wrong.
+ * The page_del structure can be freed immediately: it
+ * is only read when the ref state is WT_REF_DELETED.
+ * The size of the structure wasn't added to the parent,
+ * don't decrement.
*/
- if (child_ref->home == parent) {
- child_ref->home = child;
- child_ref->pindex_hint = 0;
+ if (next_ref->page_del != NULL) {
+ __wt_free(session,
+ next_ref->page_del->update_list);
+ __wt_free(session, next_ref->page_del);
}
- } WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
+ }
+
+ WT_TRET(__split_safe_free(
+ session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
+ parent_decr += sizeof(WT_REF);
}
+ /* We freed the reference that was split in the loop above. */
+ ref = NULL;
+
/*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
+ * We can't free the previous page index, there may be threads using it.
+ * Add it to the session discard list, to be freed when it's safe.
*/
- WT_FULL_BARRIER();
- alloc_index = NULL;
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
+ parent_decr += size;
+ /* Adjust the parent's memory footprint and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+ __wt_page_modify_set(session, parent);
+
+err: __wt_scr_free(session, &scr);
/*
- * We can't free the previous parent's index, there may be threads using
- * it. Add to the session's discard list, to be freed once we know no
- * threads can still be using it.
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened, and our caller has to proceed
+ * with the split.
+ */
+ if (!complete) {
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref->state == WT_REF_SPLIT)
+ next_ref->state = WT_REF_DELETED;
+ }
+
+ __wt_free_ref_index(session, NULL, alloc_index, false);
+
+ /*
+ * The split couldn't proceed because the parent would be empty,
+ * return EBUSY so our caller knows to unlock the WT_REF that's
+ * being deleted, but don't be noisy, there's nothing wrong.
+ */
+ if (empty_parent)
+ return (EBUSY);
+ }
+
+ if (ret != 0 && ret != WT_PANIC)
+ __wt_err(session, ret,
+ "ignoring not-fatal error during parent page split");
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_internal --
+ * Split an internal page into its parent.
+ */
+static int
+__split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index;
+ WT_REF **alloc_refp;
+ WT_REF **child_refp, *page_ref, **page_refp, *ref;
+ size_t child_incr, page_decr, page_incr, parent_incr, size;
+ uint64_t split_gen;
+ uint32_t children, chunk, i, j, remain;
+ uint32_t slots;
+ bool complete;
+ void *p;
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
+
+ /* The page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ btree = S2BT(session);
+ alloc_index = replace_index = NULL;
+ page_ref = page->pg_intl_parent_ref;
+ page_decr = page_incr = parent_incr = 0;
+ complete = false;
+
+ /*
+ * Our caller is holding the page locked to single-thread splits, which
+ * means we can safely look at the page's index without setting a split
+ * generation.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
+
+ /*
+ * Decide how many child pages to create, then calculate the standard
+ * chunk and whatever remains. Sanity check the number of children:
+ * the decision to split matched to the deepen-per-child configuration
+ * might get it wrong.
+ */
+ children = pindex->entries / btree->split_deepen_per_child;
+ if (children < 10) {
+ if (pindex->entries < 100)
+ return (EBUSY);
+ children = 10;
+ }
+ chunk = pindex->entries / children;
+ remain = pindex->entries - chunk * (children - 1);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %" PRIu32 " internal page elements, splitting %" PRIu32
+ " children into parent %p",
+ page, pindex->entries, children, parent));
+
+ /*
+ * Ideally, we'd discard the original page, but that's hard since other
+ * threads of control are using it (for example, if eviction is walking
+ * the tree and looking at the page.) Instead, perform a right-split,
+ * moving all except the first chunk of the page's WT_REF objects to new
+ * pages.
*
- * This change requires care with error handling: we have already
- * updated the page with a new index. Even if stashing the old value
- * fails, we don't roll back that change, because threads may already
- * be using the new index.
+ * Create and initialize a replacement WT_PAGE_INDEX for the original
+ * page.
*/
- size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session, split_gen, 0, pindex, size));
- parent_decr += size;
+ size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &replace_index));
+ page_incr += size;
+ replace_index->index = (WT_REF **)(replace_index + 1);
+ replace_index->entries = chunk;
+ for (page_refp = pindex->index, i = 0; i < chunk; ++i)
+ replace_index->index[i] = *page_refp++;
/*
- * Adjust the parent's memory footprint.
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted
+ * into the page's parent, replacing the page's page-index.
+ *
+ * The first slot of the new WT_PAGE_INDEX is the original page WT_REF.
+ * The remainder of the slots are allocated WT_REFs.
*/
- __wt_cache_page_inmem_incr(session, parent, parent_incr);
- __wt_cache_page_inmem_decr(session, parent, parent_decr);
+ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ parent_incr += size;
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = children;
+ alloc_refp = alloc_index->index;
+ *alloc_refp++ = page_ref;
+ for (i = 1; i < children; ++alloc_refp, ++i)
+ WT_ERR(__wt_calloc_one(session, alloc_refp));
+ parent_incr += children * sizeof(WT_REF);
+
+ /* Allocate child pages, and connect them into the new page index. */
+ WT_ASSERT(session, page_refp == pindex->index + chunk);
+ for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
+ slots = i == children - 1 ? remain : chunk;
+ WT_ERR(__wt_page_alloc(
+ session, page->type, 0, slots, false, &child));
+
+ /*
+ * Initialize the page's child reference; we need a copy of the
+ * page's key.
+ */
+ ref = *alloc_refp++;
+ ref->home = parent;
+ ref->page = child;
+ ref->addr = NULL;
+ if (page->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(page, *page_refp, &p, &size);
+ WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
+ parent_incr += sizeof(WT_IKEY) + size;
+ } else
+ ref->key.recno = (*page_refp)->key.recno;
+ ref->state = WT_REF_MEM;
+
+ /* Initialize the child page. */
+ if (page->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*page_refp)->key.recno;
+ child->pg_intl_parent_ref = ref;
+
+ /* Mark it dirty. */
+ WT_ERR(__wt_page_modify_init(session, child));
+ __wt_page_modify_set(session, child);
+
+ /* Ensure the page isn't evicted or split for now. */
+ __split_child_block_evict_and_split(child);
+
+ /*
+ * The newly allocated child's page index references the same
+ * structures as the parent. (We cannot move WT_REF structures,
+ * threads may be underneath us right now changing the structure
+ * state.) However, if the WT_REF structures reference on-page
+ * information, we have to fix that, because the disk image for
+	 * the page that has a page index entry for the WT_REF is about
+ * to be discarded.
+ */
+ child_pindex = WT_INTL_INDEX_GET_SAFE(child);
+ child_incr = 0;
+ for (child_refp = child_pindex->index,
+ j = 0; j < slots; ++child_refp, ++page_refp, ++j)
+ WT_ERR(__split_ref_move(session, page,
+ page_refp, &page_decr, child_refp, &child_incr));
+
+ __wt_cache_page_inmem_incr(session, child, child_incr);
+ }
+ WT_ASSERT(session, alloc_refp -
+ alloc_index->index == (ptrdiff_t)alloc_index->entries);
+ WT_ASSERT(session,
+ page_refp - pindex->index == (ptrdiff_t)pindex->entries);
+
+ /* Split into the parent. */
+ WT_ERR(__split_parent(session, page_ref, alloc_index->index,
+ alloc_index->entries, parent_incr, false, false));
+
+ /*
+ * A note on error handling: until this point, there's no problem with
+ * unwinding on error. We allocated a new page index, a new set of
+ * WT_REFs and a new set of child pages -- if an error occurred, the
+ * page remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the parent page, attention
+ * needs to be paid. However, subsequent failures are relatively benign,
+ * the split is OK and complete. For that reason, we ignore errors past
+ * this point unless there's a panic.
+ */
+ complete = true;
+
+ /* Confirm the page's index hasn't moved, then update it. */
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+ WT_INTL_INDEX_SET(page, replace_index);
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, page));
+#endif
+
+ /* Fix up the moved WT_REF structures. */
+ WT_ERR(__split_ref_move_final(
+ session, alloc_index->index + 1, alloc_index->entries - 1));
+
+ /*
+ * We don't care about the page-index we allocated, all we needed was
+ * the array of WT_REF structures, which has now been split into the
+ * parent page.
+ */
+ __wt_free(session, alloc_index);
+
+ /*
+ * We can't free the previous page's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling, we've already updated
+ * the parent page. Even if stashing the old value fails, we don't roll
+ * back that change, because threads may already be using the new parent
+ * page.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
+ page_decr += size;
+
+ /* Adjust the page's memory footprint, and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, page, page_incr);
+ __wt_cache_page_inmem_decr(session, page, page_decr);
+ __wt_page_modify_set(session, page);
err: /*
* If complete is true, we saw an error after opening up the tree to
- * descent through the parent page's new index. There is nothing we
- * can do, there are threads potentially active in both versions of
- * the tree.
+	 * descend through the page's new index. There is nothing we can do,
+ * there are threads potentially active in both versions of the tree.
*
* A note on error handling: if we completed the split, return success,
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
- if (!complete)
- __wt_free_ref_index(session, parent, alloc_index, true);
+ if (!complete) {
+ __wt_free_ref_index(session, page, alloc_index, true);
+ __wt_free_ref_index(session, page, replace_index, false);
+ }
if (ret != 0 && ret != WT_PANIC)
__wt_err(session, ret,
- "ignoring not-fatal error during parent page split to "
- "deepen the tree");
+ "ignoring not-fatal error during internal page split");
return (ret == WT_PANIC || !complete ? ret : 0);
}
/*
+ * __split_internal_lock --
+ * Lock an internal page.
+ */
+static int
+__split_internal_lock(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ WT_REF *parent_ref;
+
+ *hazardp = false;
+ *parentp = NULL;
+
+ /*
+ * A checkpoint reconciling this parent page can deadlock with
+ * our split. We have an exclusive page lock on the child before
+ * we acquire the page's reconciliation lock, and reconciliation
+ * acquires the page's reconciliation lock before it encounters
+ * the child's exclusive lock (which causes reconciliation to
+ * loop until the exclusive lock is resolved). If we want to split
+ * the parent, give up to avoid that deadlock.
+ */
+ if (S2BT(session)->checkpointing != WT_CKPT_OFF)
+ return (EBUSY);
+
+ /*
+ * Get a page-level lock on the parent to single-thread splits into the
+ * page because we need to single-thread sizing/growing the page index.
+ * It's OK to queue up multiple splits as the child pages split, but the
+ * actual split into the parent has to be serialized. Note we allocate
+ * memory inside of the lock and may want to invest effort in making the
+ * locked period shorter.
+ *
+ * We use the reconciliation lock here because not only do we have to
+ * single-thread the split, we have to lock out reconciliation of the
+ * parent because reconciliation of the parent can't deal with finding
+ * a split child during internal page traversal. Basically, there's no
+ * reason to use a different lock if we have to block reconciliation
+ * anyway.
+ */
+ for (;;) {
+ parent = ref->home;
+
+ /* Skip pages that aren't ready to split. */
+ if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK))
+ return (EBUSY);
+
+ WT_RET(__wt_fair_lock(session, &parent->page_lock));
+ if (parent == ref->home)
+ break;
+ WT_RET(__wt_fair_unlock(session, &parent->page_lock));
+ }
+
+ /*
+ * We have exclusive access to split the parent, and at this point, the
+ * child prevents the parent from being evicted. However, once we
+ * update the parent's index, it may no longer refer to the child, and
+ * could conceivably be evicted. Get a hazard pointer on the parent
+ * now, so that we can safely access it after updating the index.
+ *
+ * Take care getting the page doesn't trigger eviction work: we could
+ * block trying to split a different child of our parent and deadlock
+ * or we could be the eviction server relied upon by other threads to
+ * populate the eviction queue.
+ */
+ if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
+ WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
+ *hazardp = true;
+ }
+
+ *parentp = parent;
+ return (0);
+
+err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
+ return (ret);
+}
+
+/*
+ * __split_internal_unlock --
+ * Unlock the parent page.
+ */
+static int
+__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
+{
+ WT_DECL_RET;
+
+ if (hazard)
+ ret = __wt_hazard_clear(session, parent);
+
+ WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
+ return (ret);
+}
+
+/*
+ * __split_internal_should_split --
+ * Return if we should split an internal page.
+ */
+static bool
+__split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /*
+ * Our caller is holding the parent page locked to single-thread splits,
+ * which means we can safely look at the page's index without setting a
+ * split generation.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
+
+ /* Sanity check for a reasonable number of on-page keys. */
+ if (pindex->entries < 100)
+ return (false);
+
+ /*
+ * Deepen the tree if the page's memory footprint is larger than the
+ * maximum size for a page in memory (presumably putting eviction
+ * pressure on the cache).
+ */
+ if (page->memory_footprint > btree->maxmempage)
+ return (true);
+
+ /*
+ * Check if the page has enough keys to make it worth splitting. If
+ * the number of keys is allowed to grow too large, the cost of
+ * splitting into parent pages can become large enough to result
+ * in slow operations.
+ */
+ if (pindex->entries > btree->split_deepen_min_child)
+ return (true);
+
+ return (false);
+}
+
+/*
+ * __split_parent_climb --
+ * Check if we should split up the tree.
+ */
+static int
+__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ WT_REF *ref;
+ bool parent_hazard;
+
+ /*
+ * Page splits trickle up the tree, that is, as leaf pages grow large
+ * enough and are evicted, they'll split into their parent. And, as
+ * that parent page grows large enough and is evicted, it splits into
+ * its parent and so on. When the page split wave reaches the root,
+ * the tree will permanently deepen as multiple root pages are written.
+ *
+ * However, this only helps if internal pages are evicted (and we resist
+ * evicting internal pages for obvious reasons), or if the tree were to
+ * be closed and re-opened from a disk image, which may be a rare event.
+ *
+ * To avoid internal pages becoming too large absent eviction, check
+ * parent pages each time pages are split into them. If the page is big
+ * enough, either split the page into its parent or, in the case of the
+ * root, deepen the tree.
+ *
+ * Split up the tree.
+ */
+ for (;;) {
+ parent = NULL;
+ parent_hazard = false;
+ ref = page->pg_intl_parent_ref;
+
+ /* If we don't need to split the page, we're done. */
+ if (!__split_internal_should_split(session, ref))
+ break;
+
+ /*
+ * If we've reached the root page, there are no subsequent pages
+ * to review, deepen the tree and quit.
+ */
+ if (__wt_ref_is_root(ref)) {
+ ret = __split_root(session, page);
+ break;
+ }
+
+ /*
+ * Lock the parent and split into it, then swap the parent/page
+ * locks, lock-coupling up the tree.
+ */
+ WT_ERR(__split_internal_lock(
+ session, ref, &parent, &parent_hazard));
+ ret = __split_internal(session, parent, page);
+ WT_TRET(__split_internal_unlock(session, page, page_hazard));
+
+ page = parent;
+ page_hazard = parent_hazard;
+ parent = NULL;
+ parent_hazard = false;
+ WT_ERR(ret);
+ }
+
+err: if (parent != NULL)
+ WT_TRET(
+ __split_internal_unlock(session, parent, parent_hazard));
+ WT_TRET(__split_internal_unlock(session, page, page_hazard));
+
+ /* A page may have been busy, in which case return without error. */
+ WT_RET_BUSY_OK(ret);
+ return (0);
+}
+
+/*
* __split_multi_inmem --
* Instantiate a page in a multi-block set.
*/
@@ -901,369 +1600,6 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
}
/*
- * __split_parent_lock --
- * Lock the parent page.
- */
-static int
-__split_parent_lock(
- WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp)
-{
- WT_DECL_RET;
- WT_PAGE *parent;
- WT_REF *parent_ref;
-
- *hazardp = false;
- *parentp = NULL;
-
- /*
- * A checkpoint reconciling this parent page can deadlock with
- * our split. We have an exclusive page lock on the child before
- * we acquire the page's reconciliation lock, and reconciliation
- * acquires the page's reconciliation lock before it encounters
- * the child's exclusive lock (which causes reconciliation to
- * loop until the exclusive lock is resolved). If we want to split
- * the parent, give up to avoid that deadlock.
- */
- if (S2BT(session)->checkpointing != WT_CKPT_OFF)
- return (EBUSY);
-
- /*
- * Get a page-level lock on the parent to single-thread splits into the
- * page because we need to single-thread sizing/growing the page index.
- * It's OK to queue up multiple splits as the child pages split, but the
- * actual split into the parent has to be serialized. Note we allocate
- * memory inside of the lock and may want to invest effort in making the
- * locked period shorter.
- *
- * We use the reconciliation lock here because not only do we have to
- * single-thread the split, we have to lock out reconciliation of the
- * parent because reconciliation of the parent can't deal with finding
- * a split child during internal page traversal. Basically, there's no
- * reason to use a different lock if we have to block reconciliation
- * anyway.
- */
- for (;;) {
- parent = ref->home;
- WT_RET(__wt_fair_lock(session, &parent->page_lock));
- if (parent == ref->home)
- break;
- /* Try again if the page deepened while we were waiting */
- WT_RET(__wt_fair_unlock(session, &parent->page_lock));
- }
-
- /*
- * We have exclusive access to split the parent, and at this point, the
- * child prevents the parent from being evicted. However, once we
- * update the parent's index, it will no longer refer to the child, and
- * could conceivably be evicted. Get a hazard pointer on the parent
- * now, so that we can safely access it after updating the index.
- *
- * Take care getting the page doesn't trigger eviction work: we could
- * block trying to split a different child of our parent and deadlock
- * or we could be the eviction server relied upon by other threads to
- * populate the eviction queue.
- */
- if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
- WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
- *hazardp = true;
- }
-
- *parentp = parent;
- return (0);
-
-err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
- return (ret);
-}
-
-/*
- * __split_parent_unlock --
- * Unlock the parent page.
- */
-static int
-__split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
-{
- WT_DECL_RET;
-
- if (hazard)
- ret = __wt_hazard_clear(session, parent);
-
- WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
- return (ret);
-}
-
-/*
- * __split_parent --
- * Resolve a multi-page split, inserting new information into the parent.
- */
-static int
-__split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
- WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive)
-{
- WT_DECL_RET;
- WT_IKEY *ikey;
- WT_PAGE *parent;
- WT_PAGE_INDEX *alloc_index, *pindex;
- WT_REF **alloc_refp, *next_ref, *parent_ref;
- size_t parent_decr, size;
- uint64_t split_gen;
- uint32_t i, j;
- uint32_t deleted_entries, parent_entries, result_entries;
- bool complete;
-
- parent = ref->home;
- parent_ref = parent->pg_intl_parent_ref;
-
- alloc_index = pindex = NULL;
- parent_decr = 0;
- parent_entries = 0;
- complete = false;
-
- /*
- * We've locked the parent, which means it cannot split (which is the
- * only reason to worry about split generation values).
- */
- pindex = WT_INTL_INDEX_GET_SAFE(parent);
- parent_entries = pindex->entries;
-
- /*
- * Remove any refs to deleted pages while we are splitting, we have
- * the internal page locked down, and are copying the refs into a new
- * array anyway. Switch them to the special split state, so that any
- * reading thread will restart. Include the ref we are splitting in
- * the count to be deleted.
- */
- for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
- if (next_ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, next_ref, true) &&
- __wt_atomic_casv32(
- &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
- deleted_entries++;
- }
-
- /*
- * The final entry count consists of the original count, plus any new
- * pages, less any WT_REFs we're removing.
- */
- result_entries = (parent_entries + new_entries) - deleted_entries;
-
- /*
- * If the entire (sub)tree is empty, give up: we can't leave an empty
- * internal page. Mark it to be evicted soon and clean up any
- * references that have changed state.
- */
- if (result_entries == 0) {
- __wt_page_evict_soon(parent);
- goto err;
- }
-
- /*
- * Allocate and initialize a new page index array for the parent, then
- * copy references from the original index array, plus references from
- * the newly created split array, into place.
- */
- size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
- WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
- parent_incr += size;
- alloc_index->index = (WT_REF **)(alloc_index + 1);
- alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref == ref) {
- for (j = 0; j < new_entries; ++j) {
- ref_new[j]->home = parent;
- *alloc_refp++ = ref_new[j];
-
- /*
- * Clear the split reference as it moves to the
- * allocated page index, so it never appears on
- * both after an error.
- */
- ref_new[j] = NULL;
- }
-
- /*
- * We detect append-style workloads to avoid repeatedly
- * deepening parts of the tree where no work is being
- * done by tracking if we're splitting after the slots
- * created by the last split to deepen this parent.
- *
- * Note the calculation: i is a 0-based array offset and
- * split-last is a count of entries, also either or both
- * i and split-last might be unsigned 0, don't decrement
- * either one.
- */
- if (i > parent->pg_intl_deepen_split_last)
- parent->
- pg_intl_deepen_split_append += new_entries;
- } else if (next_ref->state != WT_REF_SPLIT)
- /* Skip refs we have marked for deletion. */
- *alloc_refp++ = next_ref;
- }
-
- /* Check that we filled in all the entries. */
- WT_ASSERT(session,
- alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
-
- /*
- * Confirm the parent page's index hasn't moved then update it, which
- * makes the split visible to threads descending the tree.
- */
- WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
- WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- alloc_index = NULL;
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
-#endif
-
- /*
- * Reset the page's original WT_REF field to split. Threads cursoring
- * through the tree were blocked because that WT_REF state was set to
- * locked. This update changes the locked state to split, unblocking
- * those threads and causing them to re-calculate their position based
- * on the updated parent page's index.
- */
- WT_PUBLISH(ref->state, WT_REF_SPLIT);
-
- /*
- * A note on error handling: failures before we swapped the new page
- * index into the parent can be resolved by freeing allocated memory
- * because the original page is unchanged, we can continue to use it
- * and we have not yet modified the parent. Failures after we swap
- * the new page index into the parent are also relatively benign, the
- * split is OK and complete. For those reasons, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
-
- WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "%s split into parent %" PRIu32 " -> %" PRIu32
- " (%" PRIu32 ")", ref->page == NULL ?
- "reverse" : __wt_page_type_string(ref->page->type),
- parent_entries, result_entries, result_entries - parent_entries));
-
- /*
- * The new page index is in place, free the WT_REF we were splitting
- * and any deleted WT_REFs we found, modulo the usual safe free
- * semantics.
- */
- for (i = 0; deleted_entries > 0 && i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref->state != WT_REF_SPLIT)
- continue;
- --deleted_entries;
-
- /*
- * We set the WT_REF to split, discard it, freeing any resources
- * it holds.
- *
- * Row-store trees where the old version of the page is being
- * discarded: the previous parent page's key for this child page
- * may have been an on-page overflow key. In that case, if the
- * key hasn't been deleted, delete it now, including its backing
- * blocks. We are exchanging the WT_REF that referenced it for
- * the split page WT_REFs and their keys, and there's no longer
- * any reference to it. Done after completing the split (if we
- * failed, we'd leak the underlying blocks, but the parent page
- * would be unaffected).
- */
- if (parent->type == WT_PAGE_ROW_INT) {
- WT_TRET(__split_ovfl_key_cleanup(
- session, parent, next_ref));
- ikey = __wt_ref_key_instantiated(next_ref);
- if (ikey != NULL) {
- size = sizeof(WT_IKEY) + ikey->size;
- WT_TRET(__split_safe_free(
- session, split_gen, 0, ikey, size));
- parent_decr += size;
- }
- /*
- * The page_del structure can be freed immediately: it
- * is only read when the ref state is WT_REF_DELETED.
- * The size of the structure wasn't added to the parent,
- * don't decrement.
- */
- if (next_ref->page_del != NULL) {
- __wt_free(session,
- next_ref->page_del->update_list);
- __wt_free(session, next_ref->page_del);
- }
- }
-
- WT_TRET(__split_safe_free(
- session, split_gen, 0, next_ref, sizeof(WT_REF)));
- parent_decr += sizeof(WT_REF);
- }
-
- /* We freed the reference that was split in the loop above. */
- ref = NULL;
-
- /*
- * We can't free the previous page index, there may be threads using it.
- * Add it to the session discard list, to be freed when it's safe.
- */
- size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
- parent_decr += size;
-
- /*
- * Adjust the parent's memory footprint.
- */
- __wt_cache_page_inmem_incr(session, parent, parent_incr);
- __wt_cache_page_inmem_decr(session, parent, parent_decr);
-
- /*
- * Simple page splits trickle up the tree, that is, as leaf pages grow
- * large enough and are evicted, they'll split into their parent. And,
- * as that parent grows large enough and is evicted, it will split into
- * its parent and so on. When the page split wave reaches the root,
- * the tree will permanently deepen as multiple root pages are written.
- * However, this only helps if first, the pages are evicted (and
- * we resist evicting internal pages for obvious reasons), and second,
- * if the tree is closed and re-opened from a disk image, which may be
- * a rare event.
- * To avoid the case of internal pages becoming too large when they
- * aren't being evicted, check internal pages each time a leaf page is
- * split into them. If it's big enough, deepen the tree at that point.
- * Do the check here because we've just grown the parent page and
- * are holding it locked.
- */
- if (ret == 0 && !exclusive &&
- __split_should_deepen(session, parent_ref))
- ret = __split_deepen(session, parent);
-
-err: /*
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete) {
- for (i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref->state == WT_REF_SPLIT)
- next_ref->state = WT_REF_DELETED;
- }
-
- /* If we gave up on a reverse split, unlock the child. */
- if (ref_new == NULL) {
- WT_ASSERT(session, ref->state == WT_REF_LOCKED);
- ref->state = WT_REF_DELETED;
- }
-
- __wt_free_ref_index(session, NULL, alloc_index, false);
- }
-
- if (ret != 0 && ret != WT_PANIC)
- __wt_err(session, ret,
- "ignoring not-fatal error during parent page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
-}
-
-/*
* __split_insert --
* Split a page's last insert list entries into a separate page.
*/
@@ -1279,6 +1615,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
size_t page_decr, parent_incr, right_incr;
int i;
+ WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
+ WT_STAT_FAST_DATA_INCR(session, cache_inmem_split);
+
page = ref->page;
right = NULL;
page_decr = parent_incr = right_incr = 0;
@@ -1491,7 +1830,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, false)) != 0) {
+ session, ref, split_ref, 2, parent_incr, false, true)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1513,9 +1852,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ERR(ret);
}
- WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
- WT_STAT_FAST_DATA_INCR(session, cache_inmem_split);
-
return (0);
err: if (split_ref[0] != NULL) {
@@ -1543,83 +1879,21 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_PAGE *parent;
bool hazard;
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_insert(session, ref);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
- return (ret);
-}
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-insert", ref->page));
-/*
- * __wt_split_reverse --
- * We have a locked ref that is empty and we want to rewrite the index in
- * its parent.
- */
-int
-__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_PAGE *parent;
- bool hazard;
-
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_parent(session, ref, NULL, 0, 0, 0);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
- return (ret);
-}
-
-/*
- * __wt_split_rewrite --
- * Rewrite an in-memory page with a new version.
- */
-int
-__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
- WT_REF new;
-
- page = ref->page;
- mod = page->modify;
-
- /*
- * This isn't a split: a reconciliation failed because we couldn't write
- * something, and in the case of forced eviction, we need to stop this
- * page from being such a problem. We have exclusive access, rewrite the
- * page in memory. The code lives here because the split code knows how
- * to re-create a page in memory after it's been reconciled, and that's
- * exactly what we want to do.
- *
- * Build the new page.
- */
- memset(&new, 0, sizeof(new));
- WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0]));
-
- /*
- * The rewrite succeeded, we can no longer fail.
- *
- * Finalize the move, discarding moved update lists from the original
- * page.
- */
- __split_multi_inmem_final(page, &mod->mod_multi[0]);
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ if ((ret = __split_insert(session, ref)) != 0) {
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+ }
/*
- * Discard the original page.
- *
- * Pages with unresolved changes are not marked clean during
- * reconciliation, do it now.
+ * Split up through the tree as necessary; we're holding the original
+ * parent page locked, note the functions we call are responsible for
+ * releasing that lock.
*/
- __wt_page_modify_clear(session, page);
- __wt_ref_out(session, ref);
-
- /* Swap the new page into place. */
- ref->page = new.page;
- WT_PUBLISH(ref->state, WT_REF_MEM);
-
- return (0);
-
-err: __split_multi_inmem_fail(session, &new);
- return (ret);
+ return (__split_parent_climb(session, parent, hazard));
}
/*
@@ -1636,6 +1910,9 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
size_t parent_incr;
uint32_t i, new_entries;
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_leaf);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_leaf);
+
page = ref->page;
mod = page->modify;
new_entries = mod->mod_multi_entries;
@@ -1656,10 +1933,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* exclusively.
*/
WT_ERR(__split_parent(
- session, ref, ref_new, new_entries, parent_incr, closing));
-
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
- WT_STAT_FAST_DATA_INCR(session, cache_eviction_split);
+ session, ref, ref_new, new_entries, parent_incr, closing, true));
/*
* The split succeeded, we can no longer fail.
@@ -1697,8 +1971,98 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_PAGE *parent;
bool hazard;
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_multi(session, ref, closing);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-multi", ref->page));
+
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+ }
+
+ /*
+ * Split up through the tree as necessary; we're holding the original
+ * parent page locked, note the functions we call are responsible for
+ * releasing that lock.
+ */
+ return (__split_parent_climb(session, parent, hazard));
+}
+
+/*
+ * __wt_split_reverse --
+ * We have a locked ref that is empty and we want to rewrite the index in
+ * its parent.
+ */
+int
+__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ bool hazard;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: reverse-split", ref->page));
+
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ ret = __split_parent(session, ref, NULL, 0, 0, false, true);
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+}
+
+/*
+ * __wt_split_rewrite --
+ * Rewrite an in-memory page with a new version.
+ */
+int
+__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF new;
+
+ page = ref->page;
+ mod = page->modify;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-rewrite", ref->page));
+
+ /*
+ * This isn't a split: a reconciliation failed because we couldn't write
+ * something, and in the case of forced eviction, we need to stop this
+ * page from being such a problem. We have exclusive access, rewrite the
+ * page in memory. The code lives here because the split code knows how
+ * to re-create a page in memory after it's been reconciled, and that's
+ * exactly what we want to do.
+ *
+ * Build the new page.
+ */
+ memset(&new, 0, sizeof(new));
+ WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0]));
+
+ /*
+ * The rewrite succeeded, we can no longer fail.
+ *
+ * Finalize the move, discarding moved update lists from the original
+ * page.
+ */
+ __split_multi_inmem_final(page, &mod->mod_multi[0]);
+
+ /*
+ * Discard the original page.
+ *
+ * Pages with unresolved changes are not marked clean during
+ * reconciliation, do it now.
+ */
+ __wt_page_modify_clear(session, page);
+ __wt_ref_out(session, ref);
+
+ /* Swap the new page into place. */
+ ref->page = new.page;
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+
+ return (0);
+
+err: __split_multi_inmem_fail(session, &new);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index 7395cce11e1..07bb2eb3a01 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -191,7 +191,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
syncop == WT_SYNC_WRITE_LEAVES ?
"WRITE_LEAVES" : "CHECKPOINT",
leaf_bytes, leaf_pages, internal_bytes, internal_pages,
- WT_TIMEDIFF(end, start) / WT_MILLION));
+ WT_TIMEDIFF_MS(end, start)));
}
err: /* On error, clear any left-over tree walk. */
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index d02f23ed164..e9fa570f97b 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -22,7 +22,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
WT_INSERT *ins;
WT_INSERT_HEAD *ins_head;
WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
uint32_t base, indx, limit;
int depth;
@@ -37,10 +37,12 @@ __wt_col_search(WT_SESSION_IMPL *session,
goto leaf_only;
}
+restart_root:
/* Search the internal pages of the tree. */
current = &btree->root;
- for (depth = 2;; ++depth) {
-restart: page = current->page;
+ for (depth = 2, pindex = NULL;; ++depth) {
+ parent_pindex = pindex;
+restart_page: page = current->page;
if (page->type != WT_PAGE_COL_INT)
break;
@@ -51,8 +53,19 @@ restart: page = current->page;
descent = pindex->index[base - 1];
/* Fast path appends. */
- if (recno >= descent->key.recno)
+ if (recno >= descent->key.recno) {
+ /*
+ * If on the last slot (the key is larger than any key
+ * on the page), check for an internal page split race.
+ */
+ if (parent_pindex != NULL &&
+ __wt_split_intl_race(
+ session, current->home, parent_pindex)) {
+ WT_RET(__wt_page_release(session, current, 0));
+ goto restart_root;
+ }
goto descend;
+ }
/* Binary search of internal pages. */
for (base = 0,
@@ -90,15 +103,13 @@ descend: /*
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- switch (ret = __wt_page_swap(session, current, descent, 0)) {
- case 0:
+ if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
current = descent;
- break;
- case WT_RESTART:
- goto restart;
- default:
- return (ret);
+ continue;
}
+ if (ret == WT_RESTART)
+ goto restart_page;
+ return (ret);
}
/* Track how deep the tree gets. */
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 7b21f1e40bb..d2d8a4640ca 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -144,7 +144,7 @@ __wt_row_search(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_ITEM *item;
WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
WT_ROW *rip;
size_t match, skiphigh, skiplow;
@@ -155,16 +155,16 @@ __wt_row_search(WT_SESSION_IMPL *session,
btree = S2BT(session);
collator = btree->collator;
item = cbt->tmp;
+ current = NULL;
__cursor_pos_clear(cbt);
/*
- * The row-store search routine uses a different comparison API.
- * The assumption is we're comparing more than a few keys with
- * matching prefixes, and it's a win to avoid the memory fetches
- * by skipping over those prefixes. That's done by tracking the
- * length of the prefix match for the lowest and highest keys we
- * compare as we descend the tree.
+ * In some cases we expect we're comparing more than a few keys with
+ * matching prefixes, so it's faster to avoid the memory fetches by
+ * skipping over those prefixes. That's done by tracking the length of
+ * the prefix match for the lowest and highest keys we compare as we
+ * descend the tree.
*/
skiphigh = skiplow = 0;
@@ -186,10 +186,11 @@ __wt_row_search(WT_SESSION_IMPL *session,
}
/* Search the internal pages of the tree. */
- cmp = -1;
+restart_root:
current = &btree->root;
- for (depth = 2;; ++depth) {
-restart: page = current->page;
+ for (depth = 2, pindex = NULL;; ++depth) {
+ parent_pindex = pindex;
+restart_page: page = current->page;
if (page->type != WT_PAGE_ROW_INT)
break;
@@ -211,7 +212,7 @@ restart: page = current->page;
WT_ERR(__wt_compare(
session, collator, srch_key, item, &cmp));
if (cmp >= 0)
- goto descend;
+ goto append;
/* A failed append check turns off append checks. */
append_check = false;
@@ -252,7 +253,26 @@ restart: page = current->page;
} else if (cmp == 0)
goto descend;
}
- else if (collator == NULL)
+ else if (collator == NULL) {
+ /*
+ * Reset the skipped prefix counts; we'd normally expect
+ * the parent's skipped prefix values to be larger than
+ * the child's values and so we'd only increase them as
+ * we walk down the tree (in other words, if we can skip
+ * N bytes on the parent, we can skip at least N bytes
+ * on the child). However, if a child internal page was
+ * split up into the parent, the child page's key space
+ * will have been truncated, and the values from the
+ * parent's search may be wrong for the child. We only
+ * need to reset the high count because the split-page
+ * algorithm truncates the end of the internal page's
+ * key space, the low count is still correct. We also
+ * don't need to clear either count when transitioning
+ * to a leaf page, a leaf page's key space can't change
+ * in flight.
+ */
+ skiphigh = 0;
+
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
descent = pindex->index[indx];
@@ -271,7 +291,7 @@ restart: page = current->page;
else
goto descend;
}
- else
+ } else
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
descent = pindex->index[indx];
@@ -288,9 +308,10 @@ restart: page = current->page;
}
/*
- * Set the slot to descend the tree: descent is already set if
- * there was an exact match on the page, otherwise, base is
- * the smallest index greater than key, possibly (last + 1).
+ * Set the slot to descend the tree: descent was already set if
+ * there was an exact match on the page, otherwise, base is the
+ * smallest index greater than key, possibly one past the last
+ * slot.
*/
descent = pindex->index[base - 1];
@@ -298,25 +319,41 @@ restart: page = current->page;
* If we end up somewhere other than the last slot, it's not a
* right-side descent.
*/
- if (pindex->entries != base - 1)
+ if (pindex->entries != base)
descend_right = false;
+ /*
+ * If on the last slot (the key is larger than any key on the
+ * page), check for an internal page split race.
+ */
+ if (pindex->entries == base) {
+append: if (parent_pindex != NULL &&
+ __wt_split_intl_race(
+ session, current->home, parent_pindex)) {
+ if ((ret = __wt_page_release(
+ session, current, 0)) != 0)
+ return (ret);
+
+ skiplow = skiphigh = 0;
+ goto restart_root;
+ }
+ }
+
descend: /*
* Swap the current page for the child page. If the page splits
* while we're retrieving it, restart the search in the current
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- switch (ret = __wt_page_swap(session, current, descent, 0)) {
- case 0:
+ if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
current = descent;
- break;
- case WT_RESTART:
+ continue;
+ }
+ if (ret == WT_RESTART) {
skiphigh = skiplow = 0;
- goto restart;
- default:
- return (ret);
+ goto restart_page;
}
+ return (ret);
}
/* Track how deep the tree gets. */
@@ -517,7 +554,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
__cursor_pos_clear(cbt);
-restart:
+restart_root:
/* Walk the internal pages of the tree. */
current = &btree->root;
for (;;) {
@@ -544,7 +581,7 @@ restart:
*/
if (ret == WT_RESTART &&
(ret = __wt_page_release(session, current, 0)) == 0)
- goto restart;
+ goto restart_root;
return (ret);
}