summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_walk.c
diff options
context:
space:
mode:
authorRamon Fernandez <ramon@mongodb.com>2016-02-24 11:31:55 -0500
committerRamon Fernandez <ramon@mongodb.com>2016-02-24 11:32:02 -0500
commitf77630a9e971cae1f921292ea31d9d40a4b096b8 (patch)
tree5c14f42716ba59ff352413f6888ad099e7568294 /src/third_party/wiredtiger/src/btree/bt_walk.c
parent68f07ff3cc8b7bcbd37bf298d28156ad77adaa32 (diff)
downloadmongo-f77630a9e971cae1f921292ea31d9d40a4b096b8.tar.gz
Import wiredtiger-wiredtiger-2.7.0-650-g5cdd3e3.tar.gz from wiredtiger branch mongodb-3.2
ref: 07966a4..5cdd3e3 SERVER-22437 Coverity analysis defect 77704: Redundant test SERVER-22438 Coverity analysis defect 77705: Dereference before null check SERVER-22676 WiredTiger fails to open databases created by 3.0.0 or 3.0.1 WT-2130 Improve on-disk page utlilization with random workloads WT-2215 WT_LSN needs to support atomic reads and updates WT-2295 WT_SESSION.create does a full-scan of the main table WT-2352 Allow build and test without requiring lz4 WT-2356 log scan advances to next log file on partially written record WT-2363 Remove built in support for bzip2 WT-2368 row-store can pass garbage keys to collator functions WT-2369 Use C compiler to detect headers instead of C++ compiler WT-2371 parent split cannot access the page after page-index swap WT-2372 WiredTiger windows builder fails with C4005 against the "inline" macro WT-2377 WTPERF doesn't compile in Windows under MSVC WT-2378 Tasks time out on LSM builder WT-2397 Cursor traversal from end of the tree skips records. WT-60 Big endian port
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_walk.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c157
1 files changed, 117 insertions, 40 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index 49a59b89552..55b11d7b2d1 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -89,11 +89,11 @@ __ref_is_leaf(WT_REF *ref)
}
/*
- * __page_ascend --
+ * __ref_ascend --
* Ascend the tree one level.
*/
-static void
-__page_ascend(WT_SESSION_IMPL *session,
+static inline void
+__ref_ascend(WT_SESSION_IMPL *session,
WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
WT_REF *parent_ref, *ref;
@@ -163,23 +163,20 @@ __page_ascend(WT_SESSION_IMPL *session,
}
/*
- * __page_descend --
- * Descend the tree one level.
+ * __ref_descend_prev --
+ * Descend the tree one level, during a previous-cursor walk.
*/
-static void
-__page_descend(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev)
+static inline void
+__ref_descend_prev(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
WT_PAGE_INDEX *pindex;
/*
- * Ref is a child page into which we're descending, and on which we
- * have a hazard pointer.
+ * We're passed a child page into which we're descending, and on which
+ * we have a hazard pointer.
*/
for (;; __wt_yield()) {
- WT_INTL_INDEX_GET(session, page, pindex);
- *slotp = prev ? pindex->entries - 1 : 0;
-
/*
* There's a split race when a cursor moving backwards through
* the tree descends the tree. If we're splitting an internal
@@ -233,21 +230,41 @@ __page_descend(WT_SESSION_IMPL *session,
* being split and part of its namespace moved. We have the
* correct page and we don't have to move, all we have to do is
* wait until the split page's page index is updated.
- *
- * No test is necessary for a next-cursor movement because we
- * do right-hand splits on internal pages and the initial part
- * of the page's namespace won't change as part of a split.
- * Instead of testing the direction boolean, do the test the
- * previous cursor movement requires in all cases, even though
- * it will always succeed for a next-cursor movement.
*/
- if (pindex->index[*slotp]->home == page)
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ if (pindex->index[pindex->entries - 1]->home == ref->page)
break;
}
*pindexp = pindex;
}
/*
+ * __ref_initial_descent_prev --
+ * Descend the tree one level, when setting up the initial cursor position
+ * for a previous-cursor walk.
+ */
+static inline bool
+__ref_initial_descent_prev(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * We're passed a child page into which we're descending, and on which
+ * we have a hazard pointer.
+ *
+ * Acquire a page index for the child page and then confirm we haven't
+ * raced with a parent split.
+ */
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ if (__wt_split_descent_race(session, ref, *pindexp))
+ return (false);
+
+ *pindexp = pindex;
+ return (true);
+}
+
+/*
* __tree_walk_internal --
* Move to the next/previous page in the tree.
*/
@@ -259,11 +276,12 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_PAGE_INDEX *pindex;
WT_REF *couple, *couple_orig, *ref;
- bool empty_internal, prev, skip;
+ bool empty_internal, initial_descent, prev, skip;
uint32_t slot;
btree = S2BT(session);
- empty_internal = false;
+ pindex = NULL;
+ empty_internal = initial_descent = false;
/*
* Tree walks are special: they look inside page structures that splits
@@ -323,22 +341,30 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
couple = couple_orig = ref = *refp;
*refp = NULL;
- /* If no page is active, begin a walk from the start of the tree. */
+ /* If no page is active, begin a walk from the start/end of the tree. */
if (ref == NULL) {
- ref = &btree->root;
+restart: /*
+ * We can reach here with a NULL or root reference; the release
+ * function handles them internally, don't complicate this code
+ * by calling them out.
+ */
+ WT_ERR(__wt_page_release(session, couple, flags));
+
+ couple = couple_orig = ref = &btree->root;
if (ref->page == NULL)
goto done;
+
+ initial_descent = true;
goto descend;
}
/*
- * If the active page was the root, we've reached the walk's end.
- * Release any hazard-pointer we're holding.
+ * If the active page was the root, we've reached the walk's end; we
+ * only get here if we've returned the root to our caller, so we're
+ * holding no hazard pointers.
*/
- if (__wt_ref_is_root(ref)) {
- WT_ERR(__wt_page_release(session, couple, flags));
+ if (__wt_ref_is_root(ref))
goto done;
- }
/* Figure out the current slot in the WT_REF array. */
__ref_index_slot(session, ref, &pindex, &slot);
@@ -352,7 +378,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
while ((prev && slot == 0) ||
(!prev && slot == pindex->entries - 1)) {
/* Ascend to the parent. */
- __page_ascend(session, &ref, &pindex, &slot);
+ __ref_ascend(session, &ref, &pindex, &slot);
/*
* If we got all the way through an internal page and
@@ -521,16 +547,21 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
ret = 0;
/*
+ * If a cursor is setting up at the end of the
+ * tree, we can't use our parent page's index,
+ * because it may have already split; restart
+ * the walk.
+ */
+ if (prev && initial_descent)
+ goto restart;
+
+ /*
* If a new walk that never coupled from the
* root to a new saved position in the tree,
* restart the walk.
*/
- if (couple == &btree->root) {
- ref = &btree->root;
- if (ref->page == NULL)
- goto done;
- goto descend;
- }
+ if (couple == &btree->root)
+ goto restart;
/*
* If restarting from some original position,
@@ -561,10 +592,56 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
descend: couple = ref;
empty_internal = true;
- __page_descend(
- session, ref->page, &pindex, &slot, prev);
+ /*
+ * There's a split race when a cursor is setting
+ * up at the end of the tree or moving backwards
+ * through the tree and descending a level. When
+ * splitting an internal page into its parent,
+ * we move the WT_REF structures and update the
+ * parent's page index before updating the split
+ * page's page index, and it's not an atomic
+ * update. A thread can read the parent page's
+ * replacement page index, then read the split
+ * page's original index, or the parent page's
+ * original and the split page's replacement.
+ *
+ * This isn't a problem for a cursor setting up
+ * at the start of the tree or moving forwards
+ * through the tree because we do right-hand
+ * splits on internal pages and the initial part
+ * of the split page's namespace won't change as
+ * part of a split. A thread reading the parent
+ * page's and split page's indexes will move to
+ * the same slot no matter what order of indexes
+ * are read.
+ *
+ * Handle a cursor setting up at the end of the
+ * tree or moving backwards through the tree.
+ */
+ if (!prev) {
+ WT_INTL_INDEX_GET(
+ session, ref->page, pindex);
+ slot = 0;
+ } else if (initial_descent) {
+ if (!__ref_initial_descent_prev(
+ session, ref, &pindex))
+ goto restart;
+ slot = pindex->entries - 1;
+ } else {
+ __ref_descend_prev(
+ session, ref, &pindex);
+ slot = pindex->entries - 1;
+ }
} else {
/*
+ * At the lowest tree level (considering a leaf
+ * page), turn off the initial-descent state.
+ * Descent race tests are different when moving
+ * through the tree vs. the initial descent.
+ */
+ initial_descent = false;
+
+ /*
* Optionally skip leaf pages, the second half.
* We didn't have an on-page cell to figure out
* if it was a leaf page, we had to acquire the
@@ -605,7 +682,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
/*
* __wt_tree_walk_count --
* Move to the next/previous page in the tree, tracking how many
- * references were visited to get there.
+ * references were visited to get there.
*/
int
__wt_tree_walk_count(WT_SESSION_IMPL *session,