summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@mongodb.com>2017-03-16 16:26:49 +1100
committerAlex Gorrod <alexander.gorrod@mongodb.com>2017-03-16 16:26:49 +1100
commit51d22616094e0a0d34997d26aec925adf949fbdf (patch)
treee8edf353bfbafea6dc4b9c632d3bfebb5d6650da
parentb77f9cc3b7fe7c15445c13df9bef74f1dd39b991 (diff)
downloadmongo-51d22616094e0a0d34997d26aec925adf949fbdf.tar.gz
WT-3206 Fix a race allocating split generations. (#3332)
We use split generations to detect when readers may be looking at structures that are replaced by a split. For correctness, we should only increment the global split generation *after* a split becomes public. Only then can we safely check that no thread is still reading with the old generation. Previously, a split could increment the global split generation, then a thread could start reading with the new split generation but see the old index structure. This issue was introduced by WT 3088, where we wanted a way to ensure that newly-allocated pages don't split until it is safe. That is solved here by having the split code pin a split generation in the ordinary way (without allocating a new one) for the duration that splits of new pages need to be prevented.
-rw-r--r--src/btree/bt_split.c71
1 files changed, 48 insertions, 23 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 45550ff627f..6b2100ec7e3 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -187,7 +187,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
exclusive = true;
if (exclusive) {
- __wt_free(session, p);
+ __wt_overwrite_and_free_len(session, p, s);
return (0);
}
@@ -640,12 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /* Get a generation for this split, mark the root page. */
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- root->pg_intl_split_gen = split_gen;
-
- /* Prepare the WT_REFs for the move. */
- __split_ref_prepare(session, alloc_index, split_gen, false);
+ /*
+ * Prepare the WT_REFs for the move: this requires a stable split
+ * generation to block splits in newly created pages, so get one.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+ __split_ref_prepare(session, alloc_index, session->split_gen, false);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -655,6 +655,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_INTL_INDEX_SET(root, alloc_index);
alloc_index = NULL;
+ WT_LEAVE_PAGE_INDEX(session);
+
+ /*
+ * Get a generation for this split, mark the root page. This must be
+ * after the new index is swapped into place in order to know that no
+ * readers are looking at the old index.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ root->pg_intl_split_gen = split_gen;
+
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
ret = __split_verify_root(session, root));
@@ -825,10 +835,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /* Get a generation for this split, mark the parent page. */
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- parent->pg_intl_split_gen = split_gen;
-
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -838,6 +844,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
alloc_index = NULL;
/*
+ * Get a generation for this split, mark the page. This must be after
+ * the new index is swapped into place in order to know that no readers
+ * are looking at the old index.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ parent->pg_intl_split_gen = split_gen;
+
+ /*
* If discarding the page's original WT_REF field, reset it to split.
* Threads cursoring through the tree were blocked because that WT_REF
* state was set to locked. Changing the locked state to split unblocks
@@ -1154,23 +1168,34 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /* Get a generation for this split, mark the page. */
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- page->pg_intl_split_gen = split_gen;
-
- /* Prepare the WT_REFs for the move. */
- __split_ref_prepare(session, alloc_index, split_gen, true);
+ /*
+ * Prepare the WT_REFs for the move: this requires a stable split
+ * generation to block splits in newly created pages, so get one.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+ __split_ref_prepare(session, alloc_index, session->split_gen, true);
/* Split into the parent. */
- WT_ERR(__split_parent(session, page_ref, alloc_index->index,
- alloc_index->entries, parent_incr, false, false));
+ if ((ret = __split_parent(session, page_ref, alloc_index->index,
+ alloc_index->entries, parent_incr, false, false)) == 0) {
+ /*
+ * Confirm the page's index hasn't moved, then update it, which
+ * makes the split visible to threads descending the tree.
+ */
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+ WT_INTL_INDEX_SET(page, replace_index);
+ }
+
+ WT_LEAVE_PAGE_INDEX(session);
+ WT_ERR(ret);
/*
- * Confirm the page's index hasn't moved, then update it, which makes
- * the split visible to threads descending the tree.
+ * Get a generation for this split, mark the parent page. This must be
+ * after the new index is swapped into place in order to know that no
+ * readers are looking at the old index.
*/
- WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
- WT_INTL_INDEX_SET(page, replace_index);
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ page->pg_intl_split_gen = split_gen;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,