diff options
author | Dan Pasette <dan@mongodb.com> | 2015-12-29 11:19:56 -0500 |
---|---|---|
committer | Dan Pasette <dan@mongodb.com> | 2015-12-29 11:19:56 -0500 |
commit | 0d3d4aacc239c507f660bb4295217df92d9cfe79 (patch) | |
tree | 953a520ea8db9f90bb4903776155add3683ea969 /src/third_party/wiredtiger | |
parent | eedd8c3f2f75c4678d65373f2f75aa72bc58b590 (diff) | |
download | mongo-0d3d4aacc239c507f660bb4295217df92d9cfe79.tar.gz |
Import wiredtiger-wiredtiger-mongodb-3.2-rc4-80-g3c2ad56.tar.gz from wiredtiger branch mongodb-3.2r3.2.1-rc2
ref: decd916..3c2ad56
3c2ad56 WT-2307: Fix for cursor iteration bug when pages are splitting
b1768d0 The change wasn't ready for back port into 3.2.1
This reverts commit 21b5f9951e233c835ec5f531458b6ff3c36fcca0.
21b5f99 WT-2291: fix for sampling in newly created trees.
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r-- | src/third_party/wiredtiger/dist/s_string.ok | 1 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 320 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_walk.c | 98 |
3 files changed, 248 insertions, 171 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 7de139f6a40..3669fa4d608 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -420,6 +420,7 @@ checkpointer checkpointing checksum checksums +children's chk chongo cip diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 631aca0d5c0..12f4197e9e7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -190,6 +190,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) case WT_PAGE_COL_INT: recno = 0; /* Less than any valid record number. */ WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->home == page); + WT_ASSERT(session, ref->key.recno > recno); recno = ref->key.recno; } WT_INTL_FOREACH_END; @@ -202,6 +204,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) first = true; WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->home == page); + __wt_ref_key(page, ref, &next->data, &next->size); if (last->size == 0) { if (first) @@ -328,7 +332,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, /* * If there's no address (the page has never been written), or the * address has been instantiated, there's no work to do. Otherwise, - * get the address from the on-page cell. + * instantiate the address in-memory, from the on-page cell. */ addr = ref->addr; if (addr != NULL && !__wt_off_page(from_home, addr)) { @@ -363,65 +367,101 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* - * __split_child_block_evict_and_split -- - * Ensure the newly created child isn't evicted or split for now. + * __split_ref_step1 -- + * Prepare a set of WT_REFs for a move. */ static void -__split_child_block_evict_and_split(WT_PAGE *child) +__split_ref_step1( + WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { + WT_PAGE *child; + WT_REF *child_ref, *ref; + uint32_t i, j; + + /* The newly created subtree is complete. */ + WT_WRITE_BARRIER(); + /* - * Once the split is live, newly created internal pages might be evicted - * and their WT_REF structures freed. If that happens before all threads - * exit the index of the page which previously "owned" the WT_REF, a - * thread might see a freed WT_REF. To ensure that doesn't happen, the - * newly created page's modify structure has a field with a transaction - * ID that's checked before any internal page is evicted. Unfortunately, - * we don't know the correct value until we update the original page's - * index (we need a transaction ID from after that update), but the act - * of updating the original page's index is what allows the eviction to - * happen. - * - * Once the split is live, newly created internal pages might themselves - * split. The split itself is not the problem: if a page splits before - * we fix up its WT_REF (in other words, a WT_REF we move is then moved - * again, before we reset the underlying page's parent reference), it's - * OK because the test we use to find a WT_REF and WT_PAGE that require - * fixing up is only that the WT_REF points to the wrong parent, not it - * points to a specific wrong parent. The problem is our fix up of the - * WT_REFs in the created page could race with the subsequent fix of the - * same WT_REFs (in a different created page), we'd have to acquire some - * lock to prevent that race, and that's going to be difficult at best. - * - * For now, block eviction and splits in newly created pages until they - * have been fixed up. + * Update the moved WT_REFs so threads moving through them start looking + * at the created children's page index information. Because we've not + * yet updated the page index of the parent page into which we are going + * to split this subtree, a cursor moving through these WT_REFs will + * ascend into the created children, but eventually fail as that parent + * page won't yet know about the created children pages. That's OK, we + * spin there until the parent's page index is updated. */ - F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { + ref = pindex->index[i]; + child = ref->page; + + /* + * Block eviction and splits in newly created pages. + * + * Once the split is live, newly created internal pages might be + * evicted and their WT_REF structures freed. If that happened + * before all threads exit the index of the page that previously + * "owned" the WT_REF, a thread might see a freed WT_REF. To + * ensure that doesn't happen, the newly created page's modify + * structure has a field with a transaction ID that's checked + * before any internal page is evicted. Unfortunately, we don't + * know the correct value until we update the original page's + * index (we need a transaction ID from after that update), but + * the act of updating the original page's index is what allows + * the eviction to happen. + * + * Split blocking was because historic versions of the split + * code didn't update the WT_REF.home field until after the + * split was live, so the WT_REF.home fields being updated could + * split again before the update, there's a race between splits + * as to which would update them first. The current code updates + * the WT_REF.home fields before going live (in this function), + * this shouldn't be an issue, but for now splits remain turned + * off. + */ + F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + + /* + * We use a page flag to prevent the child from splitting from + * underneath us, but the split-generation error checks don't + * know about that flag; use the standard macros to ensure that + * reading the child's page index structure is safe. + */ + j = 0; + WT_ENTER_PAGE_INDEX(session); + WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + child_ref->home = child; + child_ref->pindex_hint = j++; + } WT_INTL_FOREACH_END; + WT_LEAVE_PAGE_INDEX(session); + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, child)); +#endif + } } /* - * __split_ref_move_final -- - * Finalize the moved WT_REF structures after the split succeeds. + * __split_ref_step2 -- + * Allow the newly created children to be evicted or split. */ static int -__split_ref_move_final( - WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries) +__split_ref_step2( + WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_DECL_RET; WT_PAGE *child; - WT_REF *ref, *child_ref; + WT_REF *ref; uint32_t i; /* - * The WT_REF structures moved to newly allocated child pages reference - * the wrong parent page and we have to fix that up. The problem is - * revealed when a thread of control searches for the child page's - * reference structure slot, and fails to find it because the parent - * page being searched no longer references the child. When that failure - * happens the thread waits for the reference's home page to be updated, - * which we do here: walk the children and fix them up. + * The split has gone live, enable eviction and splits on the newly + * created internal pages. */ - for (i = 0; i < entries; ++i, ++refp) { - ref = *refp; + WT_WRITE_BARRIER(); + + for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { + ref = pindex->index[i]; /* * We don't hold hazard pointers on created pages, they cannot @@ -441,42 +481,18 @@ __split_ref_move_final( WT_ERR(ret); child = ref->page; + + /* The child can now be evicted or split. */ + F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, child)); #endif - /* - * We use a page flag to prevent the child from splitting from - * underneath us, but the split-generation error checks don't - * know about that flag; use the standard macros to ensure that - * reading the child's page index structure is safe. - */ - WT_ENTER_PAGE_INDEX(session); - WT_INTL_FOREACH_BEGIN(session, child, child_ref) { - /* - * The page's home reference may not be wrong, as we - * opened up access from the top of the tree already, - * disk pages may have been read in since then, and - * those pages would have correct parent references. - */ - if (child_ref->home != child) { - child_ref->home = child; - child_ref->pindex_hint = 0; - } - } WT_INTL_FOREACH_END; - WT_LEAVE_PAGE_INDEX(session); - - /* The child can now be evicted or split. */ - F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); WT_ERR(__wt_hazard_clear(session, child)); } - /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. - */ - WT_FULL_BARRIER(); return (0); err: /* Something really bad just happened. */ @@ -500,9 +516,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; - bool complete; void *p; + /* + * A note on error handling: this function first allocates/initializes + * new structures; failures during that period are handled by discarding + * the memory and returning an error code, our caller knows the split + * didn't happen and proceeds accordingly. Second, this function updates + * the tree, and a failure in that period is catastrophic, any partial + * update to the tree requires a panic, we can't recover. Third, once + * the split is complete and the tree has been fully updated, we have to + * ignore most errors because the split is complete and correct, callers + * have to proceed accordingly. + */ + enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); @@ -511,7 +539,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; - complete = false; + complete = ERR_RETURN; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); @@ -589,9 +617,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* Ensure the page isn't evicted or split for now. */ - __split_child_block_evict_and_split(child); - /* * The newly allocated child's page index references the same * structures as the root. (We cannot move WT_REF structures, @@ -615,31 +640,28 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ASSERT(session, root_refp - pindex->index == (ptrdiff_t)pindex->entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = ERR_PANIC; + + /* Prepare the WT_REFs for the move. */ + __split_ref_step1(session, alloc_index, false); + /* * Confirm the root page's index hasn't moved, then update it, which - * makes the split visible to threads descending the tree. From this - * point on, we're committed to the split. - * - * A note on error handling: until this point, there's no problem with - * unwinding on error. We allocated a new page index, a new set of - * WT_REFs and a new set of child pages -- if an error occurred, the - * root remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the root page, attention - * needs to be paid. However, subsequent failures are relatively benign, - * the split is OK and complete. For that reason, we ignore errors past - * this point unless there's a panic. + * makes the split visible to threads descending the tree. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); WT_INTL_INDEX_SET(root, alloc_index); - complete = true; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, root)); #endif - /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final( - session, alloc_index->index, alloc_index->entries)); + /* Finalize the WT_REFs we moved. */ + WT_ERR(__split_ref_step2(session, alloc_index, false)); + + /* The split is complete and correct, ignore benign errors. */ + complete = ERR_IGNORE; /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -664,24 +686,25 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_cache_page_inmem_decr(session, root, root_decr); __wt_page_modify_set(session, root); -err: /* - * If complete is true, we saw an error after opening up the tree to - * descent through the root page's new index. There is nothing we - * can do, there are threads potentially active in both versions of - * the tree. - * - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) +err: switch (complete) { + case ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); - - if (ret != 0 && ret != WT_PANIC) + break; + case ERR_PANIC: __wt_err(session, ret, - "ignoring not-fatal error during root page split to " - "deepen the tree"); - return (ret == WT_PANIC || !complete ? ret : 0); + "fatal error during root page split to deepen the tree"); + ret = WT_PANIC; + break; + case ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during root page split " + "to deepen the tree"); + ret = 0; + } + break; + } + return (ret); } /* @@ -964,9 +987,21 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; - bool complete; void *p; + /* + * A note on error handling: this function first allocates/initializes + * new structures; failures during that period are handled by discarding + * the memory and returning an error code, our caller knows the split + * didn't happen and proceeds accordingly. Second, this function updates + * the tree, and a failure in that period is catastrophic, any partial + * update to the tree requires a panic, we can't recover. Third, once + * the split is complete and the tree has been fully updated, we have to + * ignore most errors because the split is complete and correct, callers + * have to proceed accordingly. + */ + enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); @@ -977,7 +1012,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; - complete = false; + complete = ERR_RETURN; /* * Our caller is holding the page locked to single-thread splits, which @@ -1074,9 +1109,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* Ensure the page isn't evicted or split for now. */ - __split_child_block_evict_and_split(child); - /* * The newly allocated child's page index references the same * structures as the parent. (We cannot move WT_REF structures, @@ -1100,22 +1132,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ASSERT(session, page_refp - pindex->index == (ptrdiff_t)pindex->entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = ERR_PANIC; + + /* Prepare the WT_REFs for the move. */ + __split_ref_step1(session, alloc_index, true); + /* Split into the parent. */ WT_ERR(__split_parent(session, page_ref, alloc_index->index, alloc_index->entries, parent_incr, false, false)); - /* - * A note on error handling: until this point, there's no problem with - * unwinding on error. We allocated a new page index, a new set of - * WT_REFs and a new set of child pages -- if an error occurred, the - * page remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the parent page, attention - * needs to be paid. However, subsequent failures are relatively benign, - * the split is OK and complete. For that reason, we ignore errors past - * this point unless there's a panic. - */ - complete = true; - /* Confirm the page's index hasn't moved, then update it. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); WT_INTL_INDEX_SET(page, replace_index); @@ -1127,9 +1153,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __split_verify_intl_key_order(session, page)); #endif - /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final( - session, alloc_index->index + 1, alloc_index->entries - 1)); + /* Finalize the WT_REFs we moved. */ + WT_ERR(__split_ref_step2(session, alloc_index, true)); + + /* The split is complete and correct, ignore benign errors. */ + complete = ERR_IGNORE; + + /* + * Push out the changes: not required for correctness, but no reason + * to wait. + */ + WT_FULL_BARRIER(); /* * We don't care about the page-index we allocated, all we needed was @@ -1158,24 +1192,26 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_cache_page_inmem_decr(session, page, page_decr); __wt_page_modify_set(session, page); -err: /* - * If complete is true, we saw an error after opening up the tree to - * descent through the page's new index. There is nothing we can do, - * there are threads potentially active in both versions of the tree. - * - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) { +err: switch (complete) { + case ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); - } - - if (ret != 0 && ret != WT_PANIC) + break; + case ERR_PANIC: __wt_err(session, ret, - "ignoring not-fatal error during internal page split"); - return (ret == WT_PANIC || !complete ? ret : 0); + "fatal error during internal page split"); + ret = WT_PANIC; + break; + case ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during internal page " + "split"); + ret = 0; + } + break; + } + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index b46c9a03dcf..abb18529041 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -90,6 +90,48 @@ __ref_is_leaf(WT_REF *ref) } /* + * __page_ascend -- + * Ascend the tree one level. + */ +static void +__page_ascend(WT_SESSION_IMPL *session, + WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp) +{ + WT_REF *parent_ref, *ref; + + /* + * Ref points to the first/last slot on an internal page from which we + * are ascending the tree, moving to the parent page. This is tricky + * because the internal page we're on may be splitting into its parent. + * Find a stable configuration where the page we start from and the + * page we're moving to are connected. The tree eventually stabilizes + * into that configuration, keep trying until we succeed. + */ + for (ref = *refp;;) { + /* + * Find our parent slot on the next higher internal page, the + * slot from which we move to a next/prev slot, checking that + * we haven't reached the root. + */ + parent_ref = ref->home->pg_intl_parent_ref; + if (__wt_ref_is_root(parent_ref)) + break; + __page_refp(session, parent_ref, pindexp, slotp); + + /* + * When internal pages split, the WT_REF structures being moved + * are updated first. If the WT_REF we started with references + * the same page as we found on our search of the parent, there + * is a consistent view. + */ + if (ref->home == parent_ref->page) + break; + } + + *refp = parent_ref; +} + +/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ @@ -173,7 +215,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, goto descend; } -ascend: /* + /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ @@ -187,13 +229,14 @@ ascend: /* for (;;) { /* - * If we're at the last/first slot on the page, return this page - * in post-order traversal. Otherwise we move to the next/prev - * slot and left/right-most element in its subtree. + * If we're at the last/first slot on the internal page, return + * it in post-order traversal. Otherwise move to the next/prev + * slot and left/right-most element in that subtree. */ - if ((prev && slot == 0) || + while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { - ref = ref->home->pg_intl_parent_ref; + /* Ascend to the parent. */ + __page_ascend(session, &ref, &pindex, &slot); /* * If we got all the way through an internal page and @@ -205,40 +248,37 @@ ascend: /* empty_internal = false; } - /* Optionally skip internal pages. */ - if (LF_ISSET(WT_READ_SKIP_INTL)) - goto ascend; - /* - * We've ascended the tree and are returning an internal - * page. If it's the root, discard our hazard pointer, - * otherwise, swap our hazard pointer for the page we'll - * return. + * If at the root and returning internal pages, return + * the root page, otherwise we're done. Regardless, no + * hazard pointer is required, release the one we hold. */ - if (__wt_ref_is_root(ref)) + if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release( session, couple, flags)); - else { - /* - * Locate the reference to our parent page then - * swap our child hazard pointer for the parent. - * We don't handle restart or not-found returns. - * It would require additional complexity and is - * not a possible return: we're moving to the - * parent of the current child page, our parent - * reference can't have split or been evicted. - */ - __page_refp(session, ref, &pindex, &slot); + if (!LF_ISSET(WT_READ_SKIP_INTL)) + *refp = ref; + goto done; + } + + /* + * Optionally return internal pages. Swap our previous + * hazard pointer for the page we'll return. We don't + * handle restart or not-found returns, it would require + * additional complexity and is not a possible return: + * we're moving to the parent of the current child page, + * the parent can't have been evicted. + */ + if (!LF_ISSET(WT_READ_SKIP_INTL)) { if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } + *refp = ref; + goto done; } - - *refp = ref; - goto done; } if (prev) |