diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2017-03-16 16:26:49 +1100 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-03-16 16:26:49 +1100 |
commit | 51d22616094e0a0d34997d26aec925adf949fbdf (patch) | |
tree | e8edf353bfbafea6dc4b9c632d3bfebb5d6650da | |
parent | b77f9cc3b7fe7c15445c13df9bef74f1dd39b991 (diff) | |
download | mongo-51d22616094e0a0d34997d26aec925adf949fbdf.tar.gz |
WT-3206 Fix a race allocating split generations. (#3332)
We use split generations to detect when readers may be looking at
structures that are replaced by a split. For correctness, we should
only increment the global split generation *after* a split becomes
public. Only then can we safely check that no thread is still reading
with the old generation.
Previously, a split could increment the global split generation, then a
thread could start reading with the new split generation but see the old
index structure.
This issue was introduced by WT 3088, where we wanted a way to ensure
that newly-allocated pages don't split until it is safe. That is solved
here by having the split code pin a split generation in the ordinary way
(without allocating a new one) for the duration that splits of new
pages need to be prevented.
-rw-r--r-- | src/btree/bt_split.c | 71 |
1 files changed, 48 insertions, 23 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 45550ff627f..6b2100ec7e3 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -187,7 +187,7 @@ __split_safe_free(WT_SESSION_IMPL *session, exclusive = true; if (exclusive) { - __wt_free(session, p); + __wt_overwrite_and_free_len(session, p, s); return (0); } @@ -640,12 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the root page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - root->pg_intl_split_gen = split_gen; - - /* Prepare the WT_REFs for the move. */ - __split_ref_prepare(session, alloc_index, split_gen, false); + /* + * Prepare the WT_REFs for the move: this requires a stable split + * generation to block splits in newly created pages, so get one. + */ + WT_ENTER_PAGE_INDEX(session); + __split_ref_prepare(session, alloc_index, session->split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -655,6 +655,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_INTL_INDEX_SET(root, alloc_index); alloc_index = NULL; + WT_LEAVE_PAGE_INDEX(session); + + /* + * Get a generation for this split, mark the root page. This must be + * after the new index is swapped into place in order to know that no + * readers are looking at the old index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + root->pg_intl_split_gen = split_gen; + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_root(session, root)); @@ -825,10 +835,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the parent page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - parent->pg_intl_split_gen = split_gen; - /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -838,6 +844,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, alloc_index = NULL; /* + * Get a generation for this split, mark the page. This must be after + * the new index is swapped into place in order to know that no readers + * are looking at the old index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + parent->pg_intl_split_gen = split_gen; + + /* * If discarding the page's original WT_REF field, reset it to split. * Threads cursoring through the tree were blocked because that WT_REF * state was set to locked. Changing the locked state to split unblocks @@ -1154,23 +1168,34 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - page->pg_intl_split_gen = split_gen; - - /* Prepare the WT_REFs for the move. */ - __split_ref_prepare(session, alloc_index, split_gen, true); + /* + * Prepare the WT_REFs for the move: this requires a stable split + * generation to block splits in newly created pages, so get one. + */ + WT_ENTER_PAGE_INDEX(session); + __split_ref_prepare(session, alloc_index, session->split_gen, true); /* Split into the parent. */ - WT_ERR(__split_parent(session, page_ref, alloc_index->index, - alloc_index->entries, parent_incr, false, false)); + if ((ret = __split_parent(session, page_ref, alloc_index->index, + alloc_index->entries, parent_incr, false, false)) == 0) { + /* + * Confirm the page's index hasn't moved, then update it, which + * makes the split visible to threads descending the tree. + */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); + WT_INTL_INDEX_SET(page, replace_index); + } + + WT_LEAVE_PAGE_INDEX(session); + WT_ERR(ret); /* - * Confirm the page's index hasn't moved, then update it, which makes - * the split visible to threads descending the tree. + * Get a generation for this split, mark the parent page. This must be + * after the new index is swapped into place in order to know that no + * readers are looking at the old index. */ - WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); - WT_INTL_INDEX_SET(page, replace_index); + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + page->pg_intl_split_gen = split_gen; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, |