diff options
author | Ramon Fernandez <ramon@mongodb.com> | 2016-02-24 11:31:55 -0500 |
---|---|---|
committer | Ramon Fernandez <ramon@mongodb.com> | 2016-02-24 11:32:02 -0500 |
commit | f77630a9e971cae1f921292ea31d9d40a4b096b8 (patch) | |
tree | 5c14f42716ba59ff352413f6888ad099e7568294 /src/third_party/wiredtiger/src/btree/bt_walk.c | |
parent | 68f07ff3cc8b7bcbd37bf298d28156ad77adaa32 (diff) | |
download | mongo-f77630a9e971cae1f921292ea31d9d40a4b096b8.tar.gz |
Import wiredtiger-wiredtiger-2.7.0-650-g5cdd3e3.tar.gz from wiredtiger branch mongodb-3.2
ref: 07966a4..5cdd3e3
SERVER-22437 Coverity analysis defect 77704: Redundant test
SERVER-22438 Coverity analysis defect 77705: Dereference before null check
SERVER-22676 WiredTiger fails to open databases created by 3.0.0 or 3.0.1
WT-2130 Improve on-disk page utlilization with random workloads
WT-2215 WT_LSN needs to support atomic reads and updates
WT-2295 WT_SESSION.create does a full-scan of the main table
WT-2352 Allow build and test without requiring lz4
WT-2356 log scan advances to next log file on partially written record
WT-2363 Remove built in support for bzip2
WT-2368 row-store can pass garbage keys to collator functions
WT-2369 Use C compiler to detect headers instead of C++ compiler
WT-2371 parent split cannot access the page after page-index swap
WT-2372 WiredTiger windows builder fails with C4005 against the "inline" macro
WT-2377 WTPERF doesn't compile in Windows under MSVC
WT-2378 Tasks time out on LSM builder
WT-2397 Cursor traversal from end of the tree skips records.
WT-60 Big endian port
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_walk.c')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_walk.c | 157 |
1 files changed, 117 insertions, 40 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 49a59b89552..55b11d7b2d1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -89,11 +89,11 @@ __ref_is_leaf(WT_REF *ref) } /* - * __page_ascend -- + * __ref_ascend -- * Ascend the tree one level. */ -static void -__page_ascend(WT_SESSION_IMPL *session, +static inline void +__ref_ascend(WT_SESSION_IMPL *session, WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp) { WT_REF *parent_ref, *ref; @@ -163,23 +163,20 @@ __page_ascend(WT_SESSION_IMPL *session, } /* - * __page_descend -- - * Descend the tree one level. + * __ref_descend_prev -- + * Descend the tree one level, during a previous-cursor walk. */ -static void -__page_descend(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev) +static inline void +__ref_descend_prev( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) { WT_PAGE_INDEX *pindex; /* - * Ref is a child page into which we're descending, and on which we - * have a hazard pointer. + * We're passed a child page into which we're descending, and on which + * we have a hazard pointer. */ for (;; __wt_yield()) { - WT_INTL_INDEX_GET(session, page, pindex); - *slotp = prev ? pindex->entries - 1 : 0; - /* * There's a split race when a cursor moving backwards through * the tree descends the tree. If we're splitting an internal @@ -233,21 +230,41 @@ __page_descend(WT_SESSION_IMPL *session, * being split and part of its namespace moved. We have the * correct page and we don't have to move, all we have to do is * wait until the split page's page index is updated. - * - * No test is necessary for a next-cursor movement because we - * do right-hand splits on internal pages and the initial part - * of the page's namespace won't change as part of a split. - * Instead of testing the direction boolean, do the test the - * previous cursor movement requires in all cases, even though - * it will always succeed for a next-cursor movement. */ - if (pindex->index[*slotp]->home == page) + WT_INTL_INDEX_GET(session, ref->page, pindex); + if (pindex->index[pindex->entries - 1]->home == ref->page) break; } *pindexp = pindex; } /* + * __ref_initial_descent_prev -- + * Descend the tree one level, when setting up the initial cursor position + * for a previous-cursor walk. + */ +static inline bool +__ref_initial_descent_prev( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) +{ + WT_PAGE_INDEX *pindex; + + /* + * We're passed a child page into which we're descending, and on which + * we have a hazard pointer. + * + * Acquire a page index for the child page and then confirm we haven't + * raced with a parent split. + */ + WT_INTL_INDEX_GET(session, ref->page, pindex); + if (__wt_split_descent_race(session, ref, *pindexp)) + return (false); + + *pindexp = pindex; + return (true); +} + +/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ @@ -259,11 +276,12 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; - bool empty_internal, prev, skip; + bool empty_internal, initial_descent, prev, skip; uint32_t slot; btree = S2BT(session); - empty_internal = false; + pindex = NULL; + empty_internal = initial_descent = false; /* * Tree walks are special: they look inside page structures that splits @@ -323,22 +341,30 @@ __tree_walk_internal(WT_SESSION_IMPL *session, couple = couple_orig = ref = *refp; *refp = NULL; - /* If no page is active, begin a walk from the start of the tree. */ + /* If no page is active, begin a walk from the start/end of the tree. */ if (ref == NULL) { - ref = &btree->root; +restart: /* + * We can reach here with a NULL or root reference; the release + * function handles them internally, don't complicate this code + * by calling them out. + */ + WT_ERR(__wt_page_release(session, couple, flags)); + + couple = couple_orig = ref = &btree->root; if (ref->page == NULL) goto done; + + initial_descent = true; goto descend; } /* - * If the active page was the root, we've reached the walk's end. - * Release any hazard-pointer we're holding. + * If the active page was the root, we've reached the walk's end; we + * only get here if we've returned the root to our caller, so we're + * holding no hazard pointers. */ - if (__wt_ref_is_root(ref)) { - WT_ERR(__wt_page_release(session, couple, flags)); + if (__wt_ref_is_root(ref)) goto done; - } /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); @@ -352,7 +378,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ - __page_ascend(session, &ref, &pindex, &slot); + __ref_ascend(session, &ref, &pindex, &slot); /* * If we got all the way through an internal page and @@ -521,16 +547,21 @@ __tree_walk_internal(WT_SESSION_IMPL *session, ret = 0; /* + * If a cursor is setting up at the end of the + * tree, we can't use our parent page's index, + * because it may have already split; restart + * the walk. + */ + if (prev && initial_descent) + goto restart; + + /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ - if (couple == &btree->root) { - ref = &btree->root; - if (ref->page == NULL) - goto done; - goto descend; - } + if (couple == &btree->root) + goto restart; /* * If restarting from some original position, @@ -561,10 +592,56 @@ __tree_walk_internal(WT_SESSION_IMPL *session, descend: couple = ref; empty_internal = true; - __page_descend( - session, ref->page, &pindex, &slot, prev); + /* + * There's a split race when a cursor is setting + * up at the end of the tree or moving backwards + * through the tree and descending a level. When + * splitting an internal page into its parent, + * we move the WT_REF structures and update the + * parent's page index before updating the split + * page's page index, and it's not an atomic + * update. A thread can read the parent page's + * replacement page index, then read the split + * page's original index, or the parent page's + * original and the split page's replacement. + * + * This isn't a problem for a cursor setting up + * at the start of the tree or moving forwards + * through the tree because we do right-hand + * splits on internal pages and the initial part + * of the split page's namespace won't change as + * part of a split. A thread reading the parent + * page's and split page's indexes will move to + * the same slot no matter what order of indexes + * are read. + * + * Handle a cursor setting up at the end of the + * tree or moving backwards through the tree. + */ + if (!prev) { + WT_INTL_INDEX_GET( + session, ref->page, pindex); + slot = 0; + } else if (initial_descent) { + if (!__ref_initial_descent_prev( + session, ref, &pindex)) + goto restart; + slot = pindex->entries - 1; + } else { + __ref_descend_prev( + session, ref, &pindex); + slot = pindex->entries - 1; + } } else { /* + * At the lowest tree level (considering a leaf + * page), turn off the initial-descent state. + * Descent race tests are different when moving + * through the tree vs. the initial descent. + */ + initial_descent = false; + + /* * Optionally skip leaf pages, the second half. * We didn't have an on-page cell to figure out * if it was a leaf page, we had to acquire the @@ -605,7 +682,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) /* * __wt_tree_walk_count -- * Move to the next/previous page in the tree, tracking how many - * references were visited to get there. + * references were visited to get there. */ int __wt_tree_walk_count(WT_SESSION_IMPL *session, |