summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@mongodb.com>2017-07-28 02:42:46 -0400
committerMichael Cahill <michael.cahill@mongodb.com>2017-07-28 16:42:46 +1000
commitbfc8a1e83d236e60a0b5b5215957f583dd5a5ee4 (patch)
tree0432fc6cec7bde599fbbfcb64600e0f5544993af
parenteb01ff0d21f2ec64f37ed54f867c13958fc1d16a (diff)
downloadmongo-bfc8a1e83d236e60a0b5b5215957f583dd5a5ee4.tar.gz
WT-3389 Restructure split code to hold a split generation for the entire operation. (#3479)
* Change insert, multi-page and reverse splits to acquire and hold a split generation for the entire operation. This is intended to ensure internal pages split by other threads cannot be evicted from underneath the current split. This is intended to avoid problems like the one in WT-3373, where an internal page was evicted during an attempt to split into it, leading to an access violation. These functions will already acquire and hold a split generation at some point, so there's no performance cost with holding it for a longer period (other than potentially failing to evict internal pages as soon as would otherwise be possible). * Now that we're holding a split-generation across the insert operations, simplify some underlying code. We continue to need to set the split generation on newly created pages, but move that set from `__split_ref_prepare()` into the calling code where the other fields of the newly created pages are initialized. * We no longer need to call `WT_ENTER_PAGE_INDEX/WT_LEAVE_PAGE_INDEX` in `__split_ref_prepare()`, `__split_root()` or `__split_internal()`. * `__split_internal_lock()` gets a hazard pointer on the child's parent page to avoid parent eviction. The problem that solved was that while the existence of the child page prevents parent page eviction initially, once the parent's page index is updated, the child may no longer be a child of that parent, the parent might have no children, and so could potentially be evicted. Now we hold a page lock on the parent page that prevents eviction if the parent page is dirty, because eviction would require the page be reconciled/cleaned. That should be safe: if the child page is moved so the parent could be evicted, then the parent page must also have been marked dirty.
-rw-r--r--src/btree/bt_split.c335
1 files changed, 167 insertions, 168 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 2c0e247ce06..99416c1be63 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -328,8 +328,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
* Prepare a set of WT_REFs for a move.
*/
static void
-__split_ref_prepare(WT_SESSION_IMPL *session,
- WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
+__split_ref_prepare(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_PAGE *child;
WT_REF *child_ref, *ref;
@@ -351,40 +351,12 @@ __split_ref_prepare(WT_SESSION_IMPL *session,
ref = pindex->index[i];
child = ref->page;
- /*
- * Block eviction in newly created pages.
- *
- * Once the split is live, newly created internal pages might be
- * evicted and their WT_REF structures freed. If that happened
- * before all threads exit the index of the page that previously
- * "owned" the WT_REF, a thread might see a freed WT_REF. To
- * ensure that doesn't happen, the newly created page contains
- * the current split generation and can't be evicted until
- * all readers have left the old generation.
- *
- * Historic, we also blocked splits in newly created pages
- * because we didn't update the WT_REF.home field until after
- * the split was live, so the WT_REF.home fields being updated
- * could split again before the update, there's a race between
- * splits as to which would update them first. The current code
- * updates the WT_REF.home fields before going live (in this
- * function), this isn't an issue.
- */
- child->pg_intl_split_gen = split_gen;
-
- /*
- * We use a page flag to prevent the child from splitting from
- * underneath us, but the split-generation error checks don't
- * know about that flag; use the standard macros to ensure that
- * reading the child's page index structure is safe.
- */
+ /* Switch the WT_REF's to their new page. */
j = 0;
- WT_ENTER_PAGE_INDEX(session);
WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
child_ref->home = child;
child_ref->pindex_hint = j++;
} WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
@@ -467,6 +439,18 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_calloc_one(session, alloc_refp));
root_incr += children * sizeof(WT_REF);
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page that previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * created pages are set to the current split generation and so can't be
+ * evicted until all readers have left the old generation.
+ *
+ * Our thread has a stable split generation, get a copy.
+ */
+ split_gen = __wt_session_gen(session, WT_GEN_SPLIT);
+
/* Allocate child pages, and connect them into the new page index. */
for (root_refp = pindex->index,
alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
@@ -491,10 +475,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
ref->ref_recno = (*root_refp)->ref_recno;
ref->state = WT_REF_MEM;
- /* Initialize the child page. */
+ /*
+ * Initialize the child page.
+ * Block eviction in newly created pages and mark them dirty.
+ */
child->pg_intl_parent_ref = ref;
-
- /* Mark it dirty. */
+ child->pg_intl_split_gen = split_gen;
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
@@ -524,13 +510,8 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /*
- * Prepare the WT_REFs for the move: this requires a stable split
- * generation to block splits in newly created pages, so get one.
- */
- WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index,
- __wt_session_gen(session, WT_GEN_SPLIT), false);
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_prepare(session, alloc_index, false);
/* Encourage a race */
__page_split_timing_stress(session,
@@ -548,12 +529,17 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__page_split_timing_stress(session,
WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
- WT_LEAVE_PAGE_INDEX(session);
-
/*
* Get a generation for this split, mark the root page. This must be
* after the new index is swapped into place in order to know that no
* readers are looking at the old index.
+ *
+ * Note: as the root page cannot currently be evicted, the root split
+ * generation isn't ever used. That said, it future proofs eviction
+ * and isn't expensive enough to special-case.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
root->pg_intl_split_gen = split_gen;
@@ -748,6 +734,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Get a generation for this split, mark the page. This must be after
* the new index is swapped into place in order to know that no readers
* are looking at the old index.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
parent->pg_intl_split_gen = split_gen;
@@ -796,7 +785,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Swapping in the new page index released the page for eviction, we can
* no longer look inside the page.
*/
-
if (ref->page == NULL)
__wt_verbose(session, WT_VERB_SPLIT,
"%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32
@@ -815,8 +803,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/*
* The new page index is in place, free the WT_REF we were splitting and
* any deleted WT_REFs we found, modulo the usual safe free semantics.
- *
- * Acquire a new split generation.
*/
for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
next_ref = pindex->index[deleted_refs[i]];
@@ -1012,6 +998,18 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_calloc_one(session, alloc_refp));
parent_incr += children * sizeof(WT_REF);
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page that previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * created pages are set to the current split generation and so can't be
+ * evicted until all readers have left the old generation.
+ *
+ * Our thread has a stable split generation, get a copy.
+ */
+ split_gen = __wt_session_gen(session, WT_GEN_SPLIT);
+
/* Allocate child pages, and connect them into the new page index. */
WT_ASSERT(session, page_refp == pindex->index + chunk);
for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
@@ -1036,10 +1034,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
ref->ref_recno = (*page_refp)->ref_recno;
ref->state = WT_REF_MEM;
- /* Initialize the child page. */
+ /*
+ * Initialize the child page.
+ * Block eviction in newly created pages and mark them dirty.
+ */
child->pg_intl_parent_ref = ref;
-
- /* Mark it dirty. */
+ child->pg_intl_split_gen = split_gen;
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
@@ -1069,31 +1069,23 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /*
- * Prepare the WT_REFs for the move: this requires a stable split
- * generation to block splits in newly created pages, so get one.
- */
- WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index,
- __wt_session_gen(session, WT_GEN_SPLIT), true);
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_prepare(session, alloc_index, true);
/* Encourage a race */
__page_split_timing_stress(session,
WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/* Split into the parent. */
- if ((ret = __split_parent(session, page_ref, alloc_index->index,
- alloc_index->entries, parent_incr, false, false)) == 0) {
- /*
- * Confirm the page's index hasn't moved, then update it, which
- * makes the split visible to threads descending the tree.
- */
- WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
- WT_INTL_INDEX_SET(page, replace_index);
- }
+ WT_ERR(__split_parent(session, page_ref, alloc_index->index,
+ alloc_index->entries, parent_incr, false, false));
- WT_LEAVE_PAGE_INDEX(session);
- WT_ERR(ret);
+ /*
+ * Confirm the page's index hasn't moved, then update it, which
+ * makes the split visible to threads descending the tree.
+ */
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+ WT_INTL_INDEX_SET(page, replace_index);
/* Encourage a race */
__page_split_timing_stress(session,
@@ -1103,6 +1095,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
* Get a generation for this split, mark the parent page. This must be
* after the new index is swapped into place in order to know that no
* readers are looking at the old index.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
page->pg_intl_split_gen = split_gen;
@@ -1166,18 +1161,15 @@ err: switch (complete) {
}
/*
- * __split_internal_lock_worker --
+ * __split_internal_lock --
* Lock an internal page.
*/
static int
-__split_internal_lock_worker(WT_SESSION_IMPL *session,
- WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp)
+__split_internal_lock(
+ WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, WT_PAGE **parentp)
{
- WT_DECL_RET;
WT_PAGE *parent;
- WT_REF *parent_ref;
- *hazardp = false;
*parentp = NULL;
/*
@@ -1210,12 +1202,11 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session,
for (;;) {
parent = ref->home;
+ /* Encourage race */
__page_split_timing_stress(session,
WT_TIMING_STRESS_PAGE_SPLIT_RACE, WT_THOUSAND);
- /*
- * The page will be marked dirty, and we can only lock a page
- * with a modify structure.
- */
+
+ /* Page locks live in the modify structure. */
WT_RET(__wt_page_modify_init(session, parent));
if (trylock)
@@ -1228,69 +1219,28 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session,
}
/*
- * We have exclusive access to split the parent, and at this point, the
- * child prevents the parent from being evicted. However, once we
+ * This child has exclusive access to split its parent and the child's
+ * existence prevents the parent from being evicted. However, once we
* update the parent's index, it may no longer refer to the child, and
- * could conceivably be evicted. Get a hazard pointer on the parent
- * now, so that we can safely access it after updating the index.
- *
- * Take care getting the page doesn't trigger eviction work: we could
- * block trying to split a different child of our parent and deadlock
- * or we could be the eviction server relied upon by other threads to
- * populate the eviction queue.
- */
- if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
- WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
- *hazardp = true;
- }
+ * could conceivably be evicted. If the parent page is dirty, our page
+ * lock prevents eviction because reconciliation is blocked. However,
+ * if the page were clean, it could be evicted without encountering our
+ * page lock. That isn't possible because you cannot move a child page
+ * and still leave the parent page clean.
+ */
*parentp = parent;
return (0);
-
-err: WT_PAGE_UNLOCK(session, parent);
- return (ret);
-}
-
-/*
- * __split_internal_lock --
- * Lock an internal page.
- */
-static int
-__split_internal_lock(WT_SESSION_IMPL *session,
- WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp)
-{
- WT_DECL_RET;
-
- /*
- * There's no lock on our parent page and we're about to acquire one,
- * which implies using the WT_REF.home field to reference our parent
- * page. As a child of the parent page, we prevent its eviction, but
- * that's a weak guarantee. If the parent page splits, and our WT_REF
- * were to move with the split, the WT_REF.home field might change
- * underneath us and we could race, and end up attempting to access
- * an evicted page. Set the session page-index generation so if the
- * parent splits, it still can't be evicted.
- */
- WT_WITH_PAGE_INDEX(session,
- ret = __split_internal_lock_worker(
- session, ref, trylock, parentp, hazardp));
- return (ret);
}
/*
* __split_internal_unlock --
* Unlock the parent page.
*/
-static int
-__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
+static void
+__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent)
{
- WT_DECL_RET;
-
- if (hazard)
- ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref);
-
WT_PAGE_UNLOCK(session, parent);
- return (ret);
}
/*
@@ -1343,13 +1293,12 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref)
* Check if we should split up the tree.
*/
static int
-__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
+__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *parent;
WT_REF *ref;
- bool parent_hazard;
btree = S2BT(session);
@@ -1363,8 +1312,10 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
* split chunk, but we'll write it upon finding it in a different part
* of the tree.
*/
- if (btree->checkpointing != WT_CKPT_OFF)
- return (__split_internal_unlock(session, page, page_hazard));
+ if (btree->checkpointing != WT_CKPT_OFF) {
+ __split_internal_unlock(session, page);
+ return (0);
+ }
/*
* Page splits trickle up the tree, that is, as leaf pages grow large
@@ -1386,7 +1337,6 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
*/
for (;;) {
parent = NULL;
- parent_hazard = false;
ref = page->pg_intl_parent_ref;
/* If we don't need to split the page, we're done. */
@@ -1406,22 +1356,18 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
* Lock the parent and split into it, then swap the parent/page
* locks, lock-coupling up the tree.
*/
- WT_ERR(__split_internal_lock(
- session, ref, true, &parent, &parent_hazard));
+ WT_ERR(__split_internal_lock(session, ref, true, &parent));
ret = __split_internal(session, parent, page);
- WT_TRET(__split_internal_unlock(session, page, page_hazard));
+ __split_internal_unlock(session, page);
page = parent;
- page_hazard = parent_hazard;
parent = NULL;
- parent_hazard = false;
WT_ERR(ret);
}
err: if (parent != NULL)
- WT_TRET(
- __split_internal_unlock(session, parent, parent_hazard));
- WT_TRET(__split_internal_unlock(session, page, page_hazard));
+ __split_internal_unlock(session, parent);
+ __split_internal_unlock(session, page);
/* A page may have been busy, in which case return without error. */
WT_RET_BUSY_OK(ret);
@@ -2032,21 +1978,19 @@ err: if (split_ref[0] != NULL) {
}
/*
- * __wt_split_insert --
- * Lock, then split.
+ * __split_insert_lock --
+ * Split a page's last insert list entries into a separate page.
*/
-int
-__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
+static int
+__split_insert_lock(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
WT_PAGE *parent;
- bool hazard;
-
- __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
- WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard));
+ /* Lock the parent page, then proceed with the insert split. */
+ WT_RET(__split_internal_lock(session, ref, true, &parent));
if ((ret = __split_insert(session, ref)) != 0) {
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ __split_internal_unlock(session, parent);
return (ret);
}
@@ -2055,7 +1999,27 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* parent page locked, note the functions we call are responsible for
* releasing that lock.
*/
- return (__split_parent_climb(session, parent, hazard));
+ return (__split_parent_climb(session, parent));
+}
+
+/*
+ * __wt_split_insert --
+ * Split a page's last insert list entries into a separate page.
+ */
+int
+__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
+
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the insert
+ * split.
+ */
+ WT_WITH_PAGE_INDEX(session, ret = __split_insert_lock(session, ref));
+ return (ret);
}
/*
@@ -2123,21 +2087,19 @@ err: for (i = 0; i < new_entries; ++i)
}
/*
- * __wt_split_multi --
- * Lock, then split.
+ * __split_multi_lock --
+ * Split a page into multiple pages.
*/
-int
-__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
+static int
+__split_multi_lock(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
{
WT_DECL_RET;
WT_PAGE *parent;
- bool hazard;
- __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
-
- WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
+ /* Lock the parent page, then proceed with the split. */
+ WT_RET(__split_internal_lock(session, ref, false, &parent));
if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ __split_internal_unlock(session, parent);
return (ret);
}
@@ -2146,26 +2108,63 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* parent page locked, note the functions we call are responsible for
* releasing that lock.
*/
- return (__split_parent_climb(session, parent, hazard));
+ return (__split_parent_climb(session, parent));
+}
+
+/*
+ * __wt_split_multi --
+ * Split a page into multiple pages.
+ */
+int
+__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
+{
+ WT_DECL_RET;
+
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
+
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the split.
+ */
+ WT_WITH_PAGE_INDEX(session,
+ ret = __split_multi_lock(session, ref, closing));
+ return (ret);
+}
+
+/*
+ * __split_reverse --
+ * Reverse split (rewrite a parent page's index to reflect an empty page).
+ */
+static int
+__split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+
+ /* Lock the parent page, then proceed with the reverse split. */
+ WT_RET(__split_internal_lock(session, ref, false, &parent));
+ ret = __split_parent(session, ref, NULL, 0, 0, false, true);
+ __split_internal_unlock(session, parent);
+ return (ret);
}
/*
* __wt_split_reverse --
- * We have a locked ref that is empty and we want to rewrite the index in
- * its parent.
+ * Reverse split (rewrite a parent page's index to reflect an empty page).
*/
int
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
- WT_PAGE *parent;
- bool hazard;
__wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref);
- WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
- ret = __split_parent(session, ref, NULL, 0, 0, false, true);
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the reverse
+ * split.
+ */
+ WT_WITH_PAGE_INDEX(session, ret = __split_reverse(session, ref));
return (ret);
}