summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_split.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_split.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c166
1 files changed, 111 insertions, 55 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 92be2125b88..355bc09871b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -440,17 +440,49 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
+ * __split_ref_final --
+ * Finalize the WT_REF move.
+ */
+static void
+__split_ref_final(WT_SESSION_IMPL *session, WT_PAGE ***lockedp)
+{
+ WT_PAGE **locked;
+ size_t i;
+
+ /* The parent page's page index has been updated. */
+ WT_WRITE_BARRIER();
+
+ if ((locked = *lockedp) == NULL)
+ return;
+ *lockedp = NULL;
+
+ /*
+ * The moved child pages are locked to prevent them from splitting
+ * before the parent move completes, unlock them as the final step.
+ */
+ for (i = 0; locked[i] != NULL; ++i)
+ WT_PAGE_UNLOCK(session, locked[i]);
+ __wt_free(session, locked);
+}
+
+/*
* __split_ref_prepare --
* Prepare a set of WT_REFs for a move.
*/
-static void
+static int
__split_ref_prepare(WT_SESSION_IMPL *session,
- WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
+ WT_PAGE_INDEX *pindex, WT_PAGE ***lockedp, bool skip_first)
{
- WT_PAGE *child;
+ WT_DECL_RET;
+ WT_PAGE *child, **locked;
WT_REF *child_ref, *ref;
+ size_t alloc, cnt;
uint32_t i, j;
+ *lockedp = NULL;
+
+ locked = NULL;
+
/* The newly created subtree is complete. */
WT_WRITE_BARRIER();
@@ -462,51 +494,44 @@ __split_ref_prepare(WT_SESSION_IMPL *session,
* ascend into the created children, but eventually fail as that parent
* page won't yet know about the created children pages. That's OK, we
* spin there until the parent's page index is updated.
- */
+ *
+ * Lock the newly created page to ensure none of its children can split.
+ * First, to ensure all of the child pages are updated before any pages
+ * can split. Second, to ensure the original split completes before any
+ * of the children can split. The latter involves split generations:
+ * the original split page has references to these children. If they
+ * split immediately, they could free WT_REF structures based on split
+ * generations earlier than the split generation we'll eventually choose
+ * to protect the original split page's previous page index.
+ */
+ alloc = cnt = 0;
for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
ref = pindex->index[i];
child = ref->page;
- /*
- * Block eviction in newly created pages.
- *
- * Once the split is live, newly created internal pages might be
- * evicted and their WT_REF structures freed. If that happened
- * before all threads exit the index of the page that previously
- * "owned" the WT_REF, a thread might see a freed WT_REF. To
- * ensure that doesn't happen, the newly created page contains
- * the current split generation and can't be evicted until
- * all readers have left the old generation.
- *
- * Historic, we also blocked splits in newly created pages
- * because we didn't update the WT_REF.home field until after
- * the split was live, so the WT_REF.home fields being updated
- * could split again before the update, there's a race between
- * splits as to which would update them first. The current code
- * updates the WT_REF.home fields before going live (in this
- * function), this isn't an issue.
- */
- child->pg_intl_split_gen = split_gen;
+ WT_PAGE_LOCK(session, child);
- /*
- * We use a page flag to prevent the child from splitting from
- * underneath us, but the split-generation error checks don't
- * know about that flag; use the standard macros to ensure that
- * reading the child's page index structure is safe.
- */
+ /* Track the locked pages for cleanup. */
+ WT_ERR(__wt_realloc_def(session, &alloc, cnt + 2, &locked));
+ locked[cnt++] = child;
+
+ /* Switch the WT_REF's to their new page. */
j = 0;
- WT_ENTER_PAGE_INDEX(session);
WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
child_ref->home = child;
child_ref->pindex_hint = j++;
} WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, child));
#endif
}
+ *lockedp = locked;
+ return (0);
+
+err: __split_ref_final(session, &locked);
+ return (ret);
}
/*
@@ -518,10 +543,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
{
WT_BTREE *btree;
WT_DECL_RET;
- WT_PAGE *child;
+ WT_PAGE *child, **locked;
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
- WT_REF **alloc_refp;
- WT_REF **child_refp, *ref, **root_refp;
+ WT_REF **alloc_refp, **child_refp, *ref, **root_refp;
WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, root_decr, root_incr, size;
uint64_t split_gen;
@@ -536,11 +560,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
btree = S2BT(session);
alloc_index = NULL;
+ locked = NULL;
root_decr = root_incr = 0;
complete = WT_ERR_RETURN;
- /* The root page will be marked dirty, make sure that will succeed. */
+ /* Mark the root page dirty. */
WT_RET(__wt_page_modify_init(session, root));
+ __wt_page_modify_set(session, root);
/*
* Our caller is holding the root page locked to single-thread splits,
@@ -583,6 +609,17 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_calloc_one(session, alloc_refp));
root_incr += children * sizeof(WT_REF);
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page that previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * created pages are set to the current split generation and so can't be
+ * evicted until all readers have left the old generation.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ WT_ASSERT(session, root->pg_intl_split_gen < split_gen);
+
/* Allocate child pages, and connect them into the new page index. */
for (root_refp = pindex->index,
alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
@@ -609,6 +646,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Initialize the child page. */
child->pg_intl_parent_ref = ref;
+ child->pg_intl_split_gen = split_gen;
/* Mark it dirty. */
WT_ERR(__wt_page_modify_init(session, child));
@@ -640,12 +678,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /*
- * Prepare the WT_REFs for the move: this requires a stable split
- * generation to block splits in newly created pages, so get one.
- */
+ /* Prepare the WT_REFs for the move. */
WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index, session->split_gen, false);
+ WT_ERR(__split_ref_prepare(session, alloc_index, &locked, false));
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -665,6 +700,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
root->pg_intl_split_gen = split_gen;
+ /* Finalize the WT_REF move. */
+ __split_ref_final(session, &locked);
+
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
ret = __split_verify_root(session, root));
@@ -688,12 +726,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
root_decr += size;
- /* Adjust the root's memory footprint and mark it dirty. */
+ /* Adjust the root's memory footprint. */
__wt_cache_page_inmem_incr(session, root, root_incr);
__wt_cache_page_inmem_decr(session, root, root_decr);
- __wt_page_modify_set(session, root);
-err: switch (complete) {
+err: __split_ref_final(session, &locked);
+
+ switch (complete) {
case WT_ERR_RETURN:
__wt_free_ref_index(session, root, alloc_index, true);
break;
@@ -743,8 +782,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
empty_parent = false;
complete = WT_ERR_RETURN;
- /* The parent page will be marked dirty, make sure that will succeed. */
+ /* Mark the page dirty. */
WT_RET(__wt_page_modify_init(session, parent));
+ __wt_page_modify_set(session, parent);
/*
* We've locked the parent, which means it cannot split (which is the
@@ -972,10 +1012,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
parent_decr += size;
- /* Adjust the parent's memory footprint and mark it dirty. */
+ /* Adjust the parent's memory footprint. */
__wt_cache_page_inmem_incr(session, parent, parent_incr);
__wt_cache_page_inmem_decr(session, parent, parent_decr);
- __wt_page_modify_set(session, parent);
err: __wt_scr_free(session, &scr);
/*
@@ -1025,10 +1064,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
{
WT_BTREE *btree;
WT_DECL_RET;
- WT_PAGE *child;
+ WT_PAGE *child, **locked;
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index;
- WT_REF **alloc_refp;
- WT_REF **child_refp, *page_ref, **page_refp, *ref;
+ WT_REF **alloc_refp, **child_refp, *page_ref, **page_refp, *ref;
WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, page_decr, page_incr, parent_incr, size;
uint64_t split_gen;
@@ -1039,12 +1077,14 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_STAT_CONN_INCR(session, cache_eviction_split_internal);
WT_STAT_DATA_INCR(session, cache_eviction_split_internal);
- /* The page will be marked dirty, make sure that will succeed. */
+ /* Mark the page dirty. */
WT_RET(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
btree = S2BT(session);
alloc_index = replace_index = NULL;
page_ref = page->pg_intl_parent_ref;
+ locked = NULL;
page_decr = page_incr = parent_incr = 0;
complete = WT_ERR_RETURN;
@@ -1111,6 +1151,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_calloc_one(session, alloc_refp));
parent_incr += children * sizeof(WT_REF);
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page that previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * created pages are set to the current split generation and so can't be
+ * evicted until all readers have left the old generation.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ WT_ASSERT(session, page->pg_intl_split_gen < split_gen);
+
/* Allocate child pages, and connect them into the new page index. */
WT_ASSERT(session, page_refp == pindex->index + chunk);
for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
@@ -1137,6 +1188,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Initialize the child page. */
child->pg_intl_parent_ref = ref;
+ child->pg_intl_split_gen = split_gen;
/* Mark it dirty. */
WT_ERR(__wt_page_modify_init(session, child));
@@ -1173,7 +1225,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
* generation to block splits in newly created pages, so get one.
*/
WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index, session->split_gen, true);
+ WT_ERR(__split_ref_prepare(session, alloc_index, &locked, true));
/* Split into the parent. */
if ((ret = __split_parent(session, page_ref, alloc_index->index,
@@ -1197,6 +1249,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
page->pg_intl_split_gen = split_gen;
+ /* Finalize the WT_REF move. */
+ __split_ref_final(session, &locked);
+
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, parent));
@@ -1228,12 +1283,13 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
page_decr += size;
- /* Adjust the page's memory footprint, and mark it dirty. */
+ /* Adjust the page's memory footprint. */
__wt_cache_page_inmem_incr(session, page, page_incr);
__wt_cache_page_inmem_decr(session, page, page_decr);
- __wt_page_modify_set(session, page);
-err: switch (complete) {
+err: __split_ref_final(session, &locked);
+
+ switch (complete) {
case WT_ERR_RETURN:
__wt_free_ref_index(session, page, alloc_index, true);
__wt_free_ref_index(session, page, replace_index, false);