summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_split.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_split.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c320
1 files changed, 178 insertions, 142 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 631aca0d5c0..12f4197e9e7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -190,6 +190,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
case WT_PAGE_COL_INT:
recno = 0; /* Less than any valid record number. */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->home == page);
+
WT_ASSERT(session, ref->key.recno > recno);
recno = ref->key.recno;
} WT_INTL_FOREACH_END;
@@ -202,6 +204,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
first = true;
WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->home == page);
+
__wt_ref_key(page, ref, &next->data, &next->size);
if (last->size == 0) {
if (first)
@@ -328,7 +332,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
/*
* If there's no address (the page has never been written), or the
* address has been instantiated, there's no work to do. Otherwise,
- * get the address from the on-page cell.
+ * instantiate the address in-memory, from the on-page cell.
*/
addr = ref->addr;
if (addr != NULL && !__wt_off_page(from_home, addr)) {
@@ -363,65 +367,101 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
- * __split_child_block_evict_and_split --
- * Ensure the newly created child isn't evicted or split for now.
+ * __split_ref_step1 --
+ * Prepare a set of WT_REFs for a move.
*/
static void
-__split_child_block_evict_and_split(WT_PAGE *child)
+__split_ref_step1(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
+ WT_PAGE *child;
+ WT_REF *child_ref, *ref;
+ uint32_t i, j;
+
+ /* The newly created subtree is complete. */
+ WT_WRITE_BARRIER();
+
/*
- * Once the split is live, newly created internal pages might be evicted
- * and their WT_REF structures freed. If that happens before all threads
- * exit the index of the page which previously "owned" the WT_REF, a
- * thread might see a freed WT_REF. To ensure that doesn't happen, the
- * newly created page's modify structure has a field with a transaction
- * ID that's checked before any internal page is evicted. Unfortunately,
- * we don't know the correct value until we update the original page's
- * index (we need a transaction ID from after that update), but the act
- * of updating the original page's index is what allows the eviction to
- * happen.
- *
- * Once the split is live, newly created internal pages might themselves
- * split. The split itself is not the problem: if a page splits before
- * we fix up its WT_REF (in other words, a WT_REF we move is then moved
- * again, before we reset the underlying page's parent reference), it's
- * OK because the test we use to find a WT_REF and WT_PAGE that require
- * fixing up is only that the WT_REF points to the wrong parent, not it
- * points to a specific wrong parent. The problem is our fix up of the
- * WT_REFs in the created page could race with the subsequent fix of the
- * same WT_REFs (in a different created page), we'd have to acquire some
- * lock to prevent that race, and that's going to be difficult at best.
- *
- * For now, block eviction and splits in newly created pages until they
- * have been fixed up.
+ * Update the moved WT_REFs so threads moving through them start looking
+ * at the created children's page index information. Because we've not
+ * yet updated the page index of the parent page into which we are going
+ * to split this subtree, a cursor moving through these WT_REFs will
+ * ascend into the created children, but eventually fail as that parent
+ * page won't yet know about the created children pages. That's OK, we
+ * spin there until the parent's page index is updated.
*/
- F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+ for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
+ ref = pindex->index[i];
+ child = ref->page;
+
+ /*
+ * Block eviction and splits in newly created pages.
+ *
+ * Once the split is live, newly created internal pages might be
+ * evicted and their WT_REF structures freed. If that happened
+ * before all threads exit the index of the page that previously
+ * "owned" the WT_REF, a thread might see a freed WT_REF. To
+ * ensure that doesn't happen, the newly created page's modify
+ * structure has a field with a transaction ID that's checked
+ * before any internal page is evicted. Unfortunately, we don't
+ * know the correct value until we update the original page's
+ * index (we need a transaction ID from after that update), but
+ * the act of updating the original page's index is what allows
+ * the eviction to happen.
+ *
+ * Splits are blocked for historical reasons: earlier versions of
+ * the split code didn't update the WT_REF.home field until after
+ * the split was live, so a page whose WT_REF.home fields were
+ * being updated could split again before the update, and the two
+ * splits would race to update those fields. The current code
+ * updates the WT_REF.home fields before going live (in this
+ * function), so this shouldn't be an issue, but for now splits
+ * remain turned off.
+ */
+ F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
+ /*
+ * We use a page flag to prevent the child from splitting from
+ * underneath us, but the split-generation error checks don't
+ * know about that flag; use the standard macros to ensure that
+ * reading the child's page index structure is safe.
+ */
+ j = 0;
+ WT_ENTER_PAGE_INDEX(session);
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ child_ref->home = child;
+ child_ref->pindex_hint = j++;
+ } WT_INTL_FOREACH_END;
+ WT_LEAVE_PAGE_INDEX(session);
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, child));
+#endif
+ }
}
/*
- * __split_ref_move_final --
- * Finalize the moved WT_REF structures after the split succeeds.
+ * __split_ref_step2 --
+ * Allow the newly created children to be evicted or split.
*/
static int
-__split_ref_move_final(
- WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries)
+__split_ref_step2(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_DECL_RET;
WT_PAGE *child;
- WT_REF *ref, *child_ref;
+ WT_REF *ref;
uint32_t i;
/*
- * The WT_REF structures moved to newly allocated child pages reference
- * the wrong parent page and we have to fix that up. The problem is
- * revealed when a thread of control searches for the child page's
- * reference structure slot, and fails to find it because the parent
- * page being searched no longer references the child. When that failure
- * happens the thread waits for the reference's home page to be updated,
- * which we do here: walk the children and fix them up.
+ * The split has gone live, enable eviction and splits on the newly
+ * created internal pages.
*/
- for (i = 0; i < entries; ++i, ++refp) {
- ref = *refp;
+ WT_WRITE_BARRIER();
+
+ for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
+ ref = pindex->index[i];
/*
* We don't hold hazard pointers on created pages, they cannot
@@ -441,42 +481,18 @@ __split_ref_move_final(
WT_ERR(ret);
child = ref->page;
+
+ /* The child can now be evicted or split. */
+ F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, child));
#endif
- /*
- * We use a page flag to prevent the child from splitting from
- * underneath us, but the split-generation error checks don't
- * know about that flag; use the standard macros to ensure that
- * reading the child's page index structure is safe.
- */
- WT_ENTER_PAGE_INDEX(session);
- WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
- /*
- * The page's home reference may not be wrong, as we
- * opened up access from the top of the tree already,
- * disk pages may have been read in since then, and
- * those pages would have correct parent references.
- */
- if (child_ref->home != child) {
- child_ref->home = child;
- child_ref->pindex_hint = 0;
- }
- } WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
-
- /* The child can now be evicted or split. */
- F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
WT_ERR(__wt_hazard_clear(session, child));
}
- /*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
- */
- WT_FULL_BARRIER();
return (0);
err: /* Something really bad just happened. */
@@ -500,9 +516,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
- bool complete;
void *p;
+ /*
+ * A note on error handling: this function works in three phases.
+ * First, it allocates/initializes new structures; failures in that
+ * phase are handled by discarding the memory and returning an error
+ * code, so our caller knows the split didn't happen and proceeds
+ * accordingly. Second, it updates the tree; a failure in that phase
+ * is catastrophic: we can't recover from a partial update to the
+ * tree, so it requires a panic. Third, once the split is complete
+ * and the tree has been fully updated, we ignore most errors because
+ * the split is complete and correct; callers proceed accordingly.
+ */
+ enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
+
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
@@ -511,7 +539,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
btree = S2BT(session);
alloc_index = NULL;
root_decr = root_incr = 0;
- complete = false;
+ complete = ERR_RETURN;
/* The root page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, root));
@@ -589,9 +617,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /* Ensure the page isn't evicted or split for now. */
- __split_child_block_evict_and_split(child);
-
/*
* The newly allocated child's page index references the same
* structures as the root. (We cannot move WT_REF structures,
@@ -615,31 +640,28 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ASSERT(session,
root_refp - pindex->index == (ptrdiff_t)pindex->entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = ERR_PANIC;
+
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_step1(session, alloc_index, false);
+
/*
* Confirm the root page's index hasn't moved, then update it, which
- * makes the split visible to threads descending the tree. From this
- * point on, we're committed to the split.
- *
- * A note on error handling: until this point, there's no problem with
- * unwinding on error. We allocated a new page index, a new set of
- * WT_REFs and a new set of child pages -- if an error occurred, the
- * root remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the root page, attention
- * needs to be paid. However, subsequent failures are relatively benign,
- * the split is OK and complete. For that reason, we ignore errors past
- * this point unless there's a panic.
+ * makes the split visible to threads descending the tree.
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
WT_INTL_INDEX_SET(root, alloc_index);
- complete = true;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, root));
#endif
- /* Fix up the moved WT_REF structures. */
- WT_ERR(__split_ref_move_final(
- session, alloc_index->index, alloc_index->entries));
+ /* Finalize the WT_REFs we moved. */
+ WT_ERR(__split_ref_step2(session, alloc_index, false));
+
+ /* The split is complete and correct, ignore benign errors. */
+ complete = ERR_IGNORE;
/* We've installed the allocated page-index, ensure error handling. */
alloc_index = NULL;
@@ -664,24 +686,25 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__wt_cache_page_inmem_decr(session, root, root_decr);
__wt_page_modify_set(session, root);
-err: /*
- * If complete is true, we saw an error after opening up the tree to
- * descent through the root page's new index. There is nothing we
- * can do, there are threads potentially active in both versions of
- * the tree.
- *
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete)
+err: switch (complete) {
+ case ERR_RETURN:
__wt_free_ref_index(session, root, alloc_index, true);
-
- if (ret != 0 && ret != WT_PANIC)
+ break;
+ case ERR_PANIC:
__wt_err(session, ret,
- "ignoring not-fatal error during root page split to "
- "deepen the tree");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ "fatal error during root page split to deepen the tree");
+ ret = WT_PANIC;
+ break;
+ case ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during root page split "
+ "to deepen the tree");
+ ret = 0;
+ }
+ break;
+ }
+ return (ret);
}
/*
@@ -964,9 +987,21 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
- bool complete;
void *p;
+ /*
+ * A note on error handling: this function works in three phases.
+ * First, it allocates/initializes new structures; failures in that
+ * phase are handled by discarding the memory and returning an error
+ * code, so our caller knows the split didn't happen and proceeds
+ * accordingly. Second, it updates the tree; a failure in that phase
+ * is catastrophic: we can't recover from a partial update to the
+ * tree, so it requires a panic. Third, once the split is complete
+ * and the tree has been fully updated, we ignore most errors because
+ * the split is complete and correct; callers proceed accordingly.
+ */
+ enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
+
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
@@ -977,7 +1012,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
alloc_index = replace_index = NULL;
page_ref = page->pg_intl_parent_ref;
page_decr = page_incr = parent_incr = 0;
- complete = false;
+ complete = ERR_RETURN;
/*
* Our caller is holding the page locked to single-thread splits, which
@@ -1074,9 +1109,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /* Ensure the page isn't evicted or split for now. */
- __split_child_block_evict_and_split(child);
-
/*
* The newly allocated child's page index references the same
* structures as the parent. (We cannot move WT_REF structures,
@@ -1100,22 +1132,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ASSERT(session,
page_refp - pindex->index == (ptrdiff_t)pindex->entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = ERR_PANIC;
+
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_step1(session, alloc_index, true);
+
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
alloc_index->entries, parent_incr, false, false));
- /*
- * A note on error handling: until this point, there's no problem with
- * unwinding on error. We allocated a new page index, a new set of
- * WT_REFs and a new set of child pages -- if an error occurred, the
- * page remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the parent page, attention
- * needs to be paid. However, subsequent failures are relatively benign,
- * the split is OK and complete. For that reason, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
-
/* Confirm the page's index hasn't moved, then update it. */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
WT_INTL_INDEX_SET(page, replace_index);
@@ -1127,9 +1153,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__split_verify_intl_key_order(session, page));
#endif
- /* Fix up the moved WT_REF structures. */
- WT_ERR(__split_ref_move_final(
- session, alloc_index->index + 1, alloc_index->entries - 1));
+ /* Finalize the WT_REFs we moved. */
+ WT_ERR(__split_ref_step2(session, alloc_index, true));
+
+ /* The split is complete and correct, ignore benign errors. */
+ complete = ERR_IGNORE;
+
+ /*
+ * Push out the changes: not required for correctness, but no reason
+ * to wait.
+ */
+ WT_FULL_BARRIER();
/*
* We don't care about the page-index we allocated, all we needed was
@@ -1158,24 +1192,26 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_cache_page_inmem_decr(session, page, page_decr);
__wt_page_modify_set(session, page);
-err: /*
- * If complete is true, we saw an error after opening up the tree to
- * descent through the page's new index. There is nothing we can do,
- * there are threads potentially active in both versions of the tree.
- *
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete) {
+err: switch (complete) {
+ case ERR_RETURN:
__wt_free_ref_index(session, page, alloc_index, true);
__wt_free_ref_index(session, page, replace_index, false);
- }
-
- if (ret != 0 && ret != WT_PANIC)
+ break;
+ case ERR_PANIC:
__wt_err(session, ret,
- "ignoring not-fatal error during internal page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ "fatal error during internal page split");
+ ret = WT_PANIC;
+ break;
+ case ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during internal page "
+ "split");
+ ret = 0;
+ }
+ break;
+ }
+ return (ret);
}
/*