summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/third_party/wiredtiger/dist/s_string.ok1
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c320
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c98
3 files changed, 248 insertions, 171 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 7de139f6a40..3669fa4d608 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -420,6 +420,7 @@ checkpointer
checkpointing
checksum
checksums
+children's
chk
chongo
cip
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 631aca0d5c0..12f4197e9e7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -190,6 +190,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
case WT_PAGE_COL_INT:
recno = 0; /* Less than any valid record number. */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->home == page);
+
WT_ASSERT(session, ref->key.recno > recno);
recno = ref->key.recno;
} WT_INTL_FOREACH_END;
@@ -202,6 +204,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
first = true;
WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->home == page);
+
__wt_ref_key(page, ref, &next->data, &next->size);
if (last->size == 0) {
if (first)
@@ -328,7 +332,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
/*
* If there's no address (the page has never been written), or the
* address has been instantiated, there's no work to do. Otherwise,
- * get the address from the on-page cell.
+ * instantiate the address in-memory, from the on-page cell.
*/
addr = ref->addr;
if (addr != NULL && !__wt_off_page(from_home, addr)) {
@@ -363,65 +367,101 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
- * __split_child_block_evict_and_split --
- * Ensure the newly created child isn't evicted or split for now.
+ * __split_ref_step1 --
+ * Prepare a set of WT_REFs for a move.
*/
static void
-__split_child_block_evict_and_split(WT_PAGE *child)
+__split_ref_step1(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
+ WT_PAGE *child;
+ WT_REF *child_ref, *ref;
+ uint32_t i, j;
+
+ /* The newly created subtree is complete. */
+ WT_WRITE_BARRIER();
+
/*
- * Once the split is live, newly created internal pages might be evicted
- * and their WT_REF structures freed. If that happens before all threads
- * exit the index of the page which previously "owned" the WT_REF, a
- * thread might see a freed WT_REF. To ensure that doesn't happen, the
- * newly created page's modify structure has a field with a transaction
- * ID that's checked before any internal page is evicted. Unfortunately,
- * we don't know the correct value until we update the original page's
- * index (we need a transaction ID from after that update), but the act
- * of updating the original page's index is what allows the eviction to
- * happen.
- *
- * Once the split is live, newly created internal pages might themselves
- * split. The split itself is not the problem: if a page splits before
- * we fix up its WT_REF (in other words, a WT_REF we move is then moved
- * again, before we reset the underlying page's parent reference), it's
- * OK because the test we use to find a WT_REF and WT_PAGE that require
- * fixing up is only that the WT_REF points to the wrong parent, not it
- * points to a specific wrong parent. The problem is our fix up of the
- * WT_REFs in the created page could race with the subsequent fix of the
- * same WT_REFs (in a different created page), we'd have to acquire some
- * lock to prevent that race, and that's going to be difficult at best.
- *
- * For now, block eviction and splits in newly created pages until they
- * have been fixed up.
+ * Update the moved WT_REFs so threads moving through them start looking
+ * at the created children's page index information. Because we've not
+ * yet updated the page index of the parent page into which we are going
+ * to split this subtree, a cursor moving through these WT_REFs will
+ * ascend into the created children, but eventually fail as that parent
+ * page won't yet know about the created children pages. That's OK, we
+ * spin there until the parent's page index is updated.
*/
- F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+ for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
+ ref = pindex->index[i];
+ child = ref->page;
+
+ /*
+ * Block eviction and splits in newly created pages.
+ *
+ * Once the split is live, newly created internal pages might be
+ * evicted and their WT_REF structures freed. If that happened
+ * before all threads exit the index of the page that previously
+ * "owned" the WT_REF, a thread might see a freed WT_REF. To
+ * ensure that doesn't happen, the newly created page's modify
+ * structure has a field with a transaction ID that's checked
+ * before any internal page is evicted. Unfortunately, we don't
+ * know the correct value until we update the original page's
+ * index (we need a transaction ID from after that update), but
+ * the act of updating the original page's index is what allows
+ * the eviction to happen.
+ *
+ * Split blocking was because historic versions of the split
+ * code didn't update the WT_REF.home field until after the
+ * split was live, so the WT_REF.home fields being updated could
+ * split again before the update, there's a race between splits
+ * as to which would update them first. The current code updates
+ * the WT_REF.home fields before going live (in this function),
+ * this shouldn't be an issue, but for now splits remain turned
+ * off.
+ */
+ F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
+ /*
+ * We use a page flag to prevent the child from splitting from
+ * underneath us, but the split-generation error checks don't
+ * know about that flag; use the standard macros to ensure that
+ * reading the child's page index structure is safe.
+ */
+ j = 0;
+ WT_ENTER_PAGE_INDEX(session);
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ child_ref->home = child;
+ child_ref->pindex_hint = j++;
+ } WT_INTL_FOREACH_END;
+ WT_LEAVE_PAGE_INDEX(session);
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, child));
+#endif
+ }
}
/*
- * __split_ref_move_final --
- * Finalize the moved WT_REF structures after the split succeeds.
+ * __split_ref_step2 --
+ * Allow the newly created children to be evicted or split.
*/
static int
-__split_ref_move_final(
- WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries)
+__split_ref_step2(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_DECL_RET;
WT_PAGE *child;
- WT_REF *ref, *child_ref;
+ WT_REF *ref;
uint32_t i;
/*
- * The WT_REF structures moved to newly allocated child pages reference
- * the wrong parent page and we have to fix that up. The problem is
- * revealed when a thread of control searches for the child page's
- * reference structure slot, and fails to find it because the parent
- * page being searched no longer references the child. When that failure
- * happens the thread waits for the reference's home page to be updated,
- * which we do here: walk the children and fix them up.
+ * The split has gone live, enable eviction and splits on the newly
+ * created internal pages.
*/
- for (i = 0; i < entries; ++i, ++refp) {
- ref = *refp;
+ WT_WRITE_BARRIER();
+
+ for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
+ ref = pindex->index[i];
/*
* We don't hold hazard pointers on created pages, they cannot
@@ -441,42 +481,18 @@ __split_ref_move_final(
WT_ERR(ret);
child = ref->page;
+
+ /* The child can now be evicted or split. */
+ F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, child));
#endif
- /*
- * We use a page flag to prevent the child from splitting from
- * underneath us, but the split-generation error checks don't
- * know about that flag; use the standard macros to ensure that
- * reading the child's page index structure is safe.
- */
- WT_ENTER_PAGE_INDEX(session);
- WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
- /*
- * The page's home reference may not be wrong, as we
- * opened up access from the top of the tree already,
- * disk pages may have been read in since then, and
- * those pages would have correct parent references.
- */
- if (child_ref->home != child) {
- child_ref->home = child;
- child_ref->pindex_hint = 0;
- }
- } WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
-
- /* The child can now be evicted or split. */
- F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
WT_ERR(__wt_hazard_clear(session, child));
}
- /*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
- */
- WT_FULL_BARRIER();
return (0);
err: /* Something really bad just happened. */
@@ -500,9 +516,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
- bool complete;
void *p;
+ /*
+ * A note on error handling: this function first allocates/initializes
+ * new structures; failures during that period are handled by discarding
+ * the memory and returning an error code, our caller knows the split
+ * didn't happen and proceeds accordingly. Second, this function updates
+ * the tree, and a failure in that period is catastrophic, any partial
+ * update to the tree requires a panic, we can't recover. Third, once
+ * the split is complete and the tree has been fully updated, we have to
+ * ignore most errors because the split is complete and correct, callers
+ * have to proceed accordingly.
+ */
+ enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
+
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
@@ -511,7 +539,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
btree = S2BT(session);
alloc_index = NULL;
root_decr = root_incr = 0;
- complete = false;
+ complete = ERR_RETURN;
/* The root page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, root));
@@ -589,9 +617,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /* Ensure the page isn't evicted or split for now. */
- __split_child_block_evict_and_split(child);
-
/*
* The newly allocated child's page index references the same
* structures as the root. (We cannot move WT_REF structures,
@@ -615,31 +640,28 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ASSERT(session,
root_refp - pindex->index == (ptrdiff_t)pindex->entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = ERR_PANIC;
+
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_step1(session, alloc_index, false);
+
/*
* Confirm the root page's index hasn't moved, then update it, which
- * makes the split visible to threads descending the tree. From this
- * point on, we're committed to the split.
- *
- * A note on error handling: until this point, there's no problem with
- * unwinding on error. We allocated a new page index, a new set of
- * WT_REFs and a new set of child pages -- if an error occurred, the
- * root remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the root page, attention
- * needs to be paid. However, subsequent failures are relatively benign,
- * the split is OK and complete. For that reason, we ignore errors past
- * this point unless there's a panic.
+ * makes the split visible to threads descending the tree.
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
WT_INTL_INDEX_SET(root, alloc_index);
- complete = true;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, root));
#endif
- /* Fix up the moved WT_REF structures. */
- WT_ERR(__split_ref_move_final(
- session, alloc_index->index, alloc_index->entries));
+ /* Finalize the WT_REFs we moved. */
+ WT_ERR(__split_ref_step2(session, alloc_index, false));
+
+ /* The split is complete and correct, ignore benign errors. */
+ complete = ERR_IGNORE;
/* We've installed the allocated page-index, ensure error handling. */
alloc_index = NULL;
@@ -664,24 +686,25 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__wt_cache_page_inmem_decr(session, root, root_decr);
__wt_page_modify_set(session, root);
-err: /*
- * If complete is true, we saw an error after opening up the tree to
- * descent through the root page's new index. There is nothing we
- * can do, there are threads potentially active in both versions of
- * the tree.
- *
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete)
+err: switch (complete) {
+ case ERR_RETURN:
__wt_free_ref_index(session, root, alloc_index, true);
-
- if (ret != 0 && ret != WT_PANIC)
+ break;
+ case ERR_PANIC:
__wt_err(session, ret,
- "ignoring not-fatal error during root page split to "
- "deepen the tree");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ "fatal error during root page split to deepen the tree");
+ ret = WT_PANIC;
+ break;
+ case ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during root page split "
+ "to deepen the tree");
+ ret = 0;
+ }
+ break;
+ }
+ return (ret);
}
/*
@@ -964,9 +987,21 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
- bool complete;
void *p;
+ /*
+ * A note on error handling: this function first allocates/initializes
+ * new structures; failures during that period are handled by discarding
+ * the memory and returning an error code, our caller knows the split
+ * didn't happen and proceeds accordingly. Second, this function updates
+ * the tree, and a failure in that period is catastrophic, any partial
+ * update to the tree requires a panic, we can't recover. Third, once
+ * the split is complete and the tree has been fully updated, we have to
+ * ignore most errors because the split is complete and correct, callers
+ * have to proceed accordingly.
+ */
+ enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
+
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
@@ -977,7 +1012,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
alloc_index = replace_index = NULL;
page_ref = page->pg_intl_parent_ref;
page_decr = page_incr = parent_incr = 0;
- complete = false;
+ complete = ERR_RETURN;
/*
* Our caller is holding the page locked to single-thread splits, which
@@ -1074,9 +1109,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /* Ensure the page isn't evicted or split for now. */
- __split_child_block_evict_and_split(child);
-
/*
* The newly allocated child's page index references the same
* structures as the parent. (We cannot move WT_REF structures,
@@ -1100,22 +1132,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ASSERT(session,
page_refp - pindex->index == (ptrdiff_t)pindex->entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = ERR_PANIC;
+
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_step1(session, alloc_index, true);
+
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
alloc_index->entries, parent_incr, false, false));
- /*
- * A note on error handling: until this point, there's no problem with
- * unwinding on error. We allocated a new page index, a new set of
- * WT_REFs and a new set of child pages -- if an error occurred, the
- * page remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the parent page, attention
- * needs to be paid. However, subsequent failures are relatively benign,
- * the split is OK and complete. For that reason, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
-
/* Confirm the page's index hasn't moved, then update it. */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
WT_INTL_INDEX_SET(page, replace_index);
@@ -1127,9 +1153,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__split_verify_intl_key_order(session, page));
#endif
- /* Fix up the moved WT_REF structures. */
- WT_ERR(__split_ref_move_final(
- session, alloc_index->index + 1, alloc_index->entries - 1));
+ /* Finalize the WT_REFs we moved. */
+ WT_ERR(__split_ref_step2(session, alloc_index, true));
+
+ /* The split is complete and correct, ignore benign errors. */
+ complete = ERR_IGNORE;
+
+ /*
+ * Push out the changes: not required for correctness, but no reason
+ * to wait.
+ */
+ WT_FULL_BARRIER();
/*
* We don't care about the page-index we allocated, all we needed was
@@ -1158,24 +1192,26 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_cache_page_inmem_decr(session, page, page_decr);
__wt_page_modify_set(session, page);
-err: /*
- * If complete is true, we saw an error after opening up the tree to
- * descent through the page's new index. There is nothing we can do,
- * there are threads potentially active in both versions of the tree.
- *
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete) {
+err: switch (complete) {
+ case ERR_RETURN:
__wt_free_ref_index(session, page, alloc_index, true);
__wt_free_ref_index(session, page, replace_index, false);
- }
-
- if (ret != 0 && ret != WT_PANIC)
+ break;
+ case ERR_PANIC:
__wt_err(session, ret,
- "ignoring not-fatal error during internal page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ "fatal error during internal page split");
+ ret = WT_PANIC;
+ break;
+ case ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during internal page "
+ "split");
+ ret = 0;
+ }
+ break;
+ }
+ return (ret);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index b46c9a03dcf..abb18529041 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -90,6 +90,48 @@ __ref_is_leaf(WT_REF *ref)
}
/*
+ * __page_ascend --
+ * Ascend the tree one level.
+ */
+static void
+__page_ascend(WT_SESSION_IMPL *session,
+ WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
+{
+ WT_REF *parent_ref, *ref;
+
+ /*
+ * Ref points to the first/last slot on an internal page from which we
+ * are ascending the tree, moving to the parent page. This is tricky
+ * because the internal page we're on may be splitting into its parent.
+ * Find a stable configuration where the page we start from and the
+ * page we're moving to are connected. The tree eventually stabilizes
+ * into that configuration, keep trying until we succeed.
+ */
+ for (ref = *refp;;) {
+ /*
+ * Find our parent slot on the next higher internal page, the
+ * slot from which we move to a next/prev slot, checking that
+ * we haven't reached the root.
+ */
+ parent_ref = ref->home->pg_intl_parent_ref;
+ if (__wt_ref_is_root(parent_ref))
+ break;
+ __page_refp(session, parent_ref, pindexp, slotp);
+
+ /*
+ * When internal pages split, the WT_REF structures being moved
+ * are updated first. If the WT_REF we started with references
+ * the same page as we found on our search of the parent, there
+ * is a consistent view.
+ */
+ if (ref->home == parent_ref->page)
+ break;
+ }
+
+ *refp = parent_ref;
+}
+
+/*
* __tree_walk_internal --
* Move to the next/previous page in the tree.
*/
@@ -173,7 +215,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
goto descend;
}
-ascend: /*
+ /*
* If the active page was the root, we've reached the walk's end.
* Release any hazard-pointer we're holding.
*/
@@ -187,13 +229,14 @@ ascend: /*
for (;;) {
/*
- * If we're at the last/first slot on the page, return this page
- * in post-order traversal. Otherwise we move to the next/prev
- * slot and left/right-most element in its subtree.
+ * If we're at the last/first slot on the internal page, return
+ * it in post-order traversal. Otherwise move to the next/prev
+ * slot and left/right-most element in that subtree.
*/
- if ((prev && slot == 0) ||
+ while ((prev && slot == 0) ||
(!prev && slot == pindex->entries - 1)) {
- ref = ref->home->pg_intl_parent_ref;
+ /* Ascend to the parent. */
+ __page_ascend(session, &ref, &pindex, &slot);
/*
* If we got all the way through an internal page and
@@ -205,40 +248,37 @@ ascend: /*
empty_internal = false;
}
- /* Optionally skip internal pages. */
- if (LF_ISSET(WT_READ_SKIP_INTL))
- goto ascend;
-
/*
- * We've ascended the tree and are returning an internal
- * page. If it's the root, discard our hazard pointer,
- * otherwise, swap our hazard pointer for the page we'll
- * return.
+ * If at the root and returning internal pages, return
+ * the root page, otherwise we're done. Regardless, no
+ * hazard pointer is required, release the one we hold.
*/
- if (__wt_ref_is_root(ref))
+ if (__wt_ref_is_root(ref)) {
WT_ERR(__wt_page_release(
session, couple, flags));
- else {
- /*
- * Locate the reference to our parent page then
- * swap our child hazard pointer for the parent.
- * We don't handle restart or not-found returns.
- * It would require additional complexity and is
- * not a possible return: we're moving to the
- * parent of the current child page, our parent
- * reference can't have split or been evicted.
- */
- __page_refp(session, ref, &pindex, &slot);
+ if (!LF_ISSET(WT_READ_SKIP_INTL))
+ *refp = ref;
+ goto done;
+ }
+
+ /*
+ * Optionally return internal pages. Swap our previous
+ * hazard pointer for the page we'll return. We don't
+ * handle restart or not-found returns, it would require
+ * additional complexity and is not a possible return:
+ * we're moving to the parent of the current child page,
+ * the parent can't have been evicted.
+ */
+ if (!LF_ISSET(WT_READ_SKIP_INTL)) {
if ((ret = __wt_page_swap(
session, couple, ref, flags)) != 0) {
WT_TRET(__wt_page_release(
session, couple, flags));
WT_ERR(ret);
}
+ *refp = ref;
+ goto done;
}
-
- *refp = ref;
- goto done;
}
if (prev)