summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith@wiredtiger.com>2016-02-15 12:21:50 -0500
committerKeith Bostic <keith@wiredtiger.com>2016-02-15 12:21:50 -0500
commitf70cd24413cf311305c58a3156849ebf5ecbdf39 (patch)
tree42577ab490d34200576c64f724546ae6e1064d21
parent275da68d2223042f3adadd8ac716a57ef33031f5 (diff)
downloadmongo-f70cd24413cf311305c58a3156849ebf5ecbdf39.tar.gz
WT-2397: Cursor traversal from end of the tree skips records.
Positioning a cursor at the end of the tree can race with page splits. During the initial descent of the tree in the cursor.prev case, do the same tests for races we do for a search past the end of the tree.
-rw-r--r--src/btree/bt_walk.c116
-rw-r--r--src/btree/col_srch.c5
-rw-r--r--src/btree/row_srch.c5
-rw-r--r--src/include/btree.i12
4 files changed, 101 insertions, 37 deletions
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index d7785c689d9..90c806c6ab8 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -163,12 +163,12 @@ __page_ascend(WT_SESSION_IMPL *session,
}
/*
- * __page_descend --
- * Descend the tree one level.
+ * __page_descend_prev --
+ * Descend the tree one level, during a previous-cursor walk.
*/
static void
-__page_descend(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev)
+__page_descend_prev(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
WT_PAGE_INDEX *pindex;
@@ -177,9 +177,6 @@ __page_descend(WT_SESSION_IMPL *session,
* we have a hazard pointer.
*/
for (;; __wt_yield()) {
- WT_INTL_INDEX_GET(session, page, pindex);
- *slotp = prev ? pindex->entries - 1 : 0;
-
/*
* There's a split race when a cursor moving backwards through
* the tree descends the tree. If we're splitting an internal
@@ -233,21 +230,41 @@ __page_descend(WT_SESSION_IMPL *session,
* being split and part of its namespace moved. We have the
* correct page and we don't have to move, all we have to do is
* wait until the split page's page index is updated.
- *
- * No test is necessary for a next-cursor movement because we
- * do right-hand splits on internal pages and the initial part
- * of the page's namespace won't change as part of a split.
- * Instead of testing the direction boolean, do the test the
- * previous cursor movement requires in all cases, even though
- * it will always succeed for a next-cursor movement.
*/
- if (pindex->index[*slotp]->home == page)
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ *slotp = pindex->entries - 1;
+ if (pindex->index[*slotp]->home == ref->page)
break;
}
*pindexp = pindex;
}
/*
+ * __page_initial_descent_prev --
+ * Descend the tree one level, when setting up the initial cursor position
+ * for a previous-cursor walk.
+ */
+static bool
+__page_initial_descent_prev(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
+{
+ WT_PAGE_INDEX *parent_pindex, *pindex;
+
+ /*
+ * We're passed a child page into which we're descending, and on which
+ * we have a hazard pointer.
+ */
+ parent_pindex = *pindexp;
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ *slotp = pindex->entries - 1;
+ if (__wt_split_descent_race(session, ref, parent_pindex))
+ return (false);
+
+ *pindexp = pindex;
+ return (true);
+}
+
+/*
* __tree_walk_internal --
* Move to the next/previous page in the tree.
*/
@@ -259,11 +276,12 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_PAGE_INDEX *pindex;
WT_REF *couple, *couple_orig, *ref;
- bool empty_internal, prev, skip;
+ bool empty_internal, initial_descent, prev, skip;
uint32_t slot;
btree = S2BT(session);
- empty_internal = false;
+ pindex = NULL;
+ empty_internal = initial_descent = false;
/*
* Tree walks are special: they look inside page structures that splits
@@ -325,9 +343,14 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
/* If no page is active, begin a walk from the start of the tree. */
if (ref == NULL) {
- ref = &btree->root;
+restart: if (couple != &btree->root)
+ WT_ERR(__wt_page_release(session, couple, flags));
+
+ couple = couple_orig = ref = &btree->root;
if (ref->page == NULL)
goto done;
+
+ initial_descent = true;
goto descend;
}
@@ -336,7 +359,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
* Release any hazard-pointer we're holding.
*/
if (__wt_ref_is_root(ref)) {
- WT_ERR(__wt_page_release(session, couple, flags));
+ WT_ERR(__wt_page_release(session, ref, flags));
goto done;
}
@@ -525,12 +548,8 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
* root to a new saved position in the tree,
* restart the walk.
*/
- if (couple == &btree->root) {
- ref = &btree->root;
- if (ref->page == NULL)
- goto done;
- goto descend;
- }
+ if (couple == &btree->root)
+ goto restart;
/*
* If restarting from some original position,
@@ -561,10 +580,53 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
descend: couple = ref;
empty_internal = true;
- __page_descend(
- session, ref->page, &pindex, &slot, prev);
+ /*
+ * There's a split race when a cursor is setting
+ * up at the end of the tree or moving backwards
+ * through the tree and descending a level. When
+ * splitting an internal page into its parent,
+ * we move the WT_REF structures and update the
+ * parent's page index before updating the split
+ * page's page index, and it's not an atomic
+ * update. A thread can read the parent page's
+ * replacement page index, then read the split
+ * page's original index.
+ *
+ * This isn't a problem for a cursor setting up
+ * at the start of the tree or moving forwards
+ * through the tree because we do right-hand
+ * splits on internal pages and the initial part
+ * of the split page's namespace won't change as
+ * part of a split. A thread reading the parent
+ * page's and split page's indexes will move to
+ * the same slot no matter what combination of
+ * indexes are read.
+ *
+ * Handle a cursor setting up at the end of the
+ * tree or moving backwards through the tree.
+ */
+ if (!prev) {
+ WT_INTL_INDEX_GET(
+ session, ref->page, pindex);
+ slot = 0;
+ } else if (initial_descent) {
+ if (!__page_initial_descent_prev(
+ session, ref, &pindex, &slot))
+ goto restart;
+ } else
+ __page_descend_prev(
+ session, ref, &pindex, &slot);
} else {
/*
+ * At the lowest tree level (considering a leaf
+ * page), turn off the initial-descent test.
+ * The descent race tests are different when
+ * moving through the tree vs. doing the initial
+ * descent.
+ */
+ initial_descent = false;
+
+ /*
* Optionally skip leaf pages, the second half.
* We didn't have an on-page cell to figure out
* if it was a leaf page, we had to acquire the
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index 862327699c0..23eae75ec2b 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -145,9 +145,8 @@ restart: /*
* If on the last slot (the key is larger than any key
* on the page), check for an internal page split race.
*/
- if (parent_pindex != NULL &&
- __wt_split_intl_race(
- session, current->home, parent_pindex))
+ if (__wt_split_descent_race(
+ session, current, parent_pindex))
goto restart;
goto descend;
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 955568bcd50..9d68c8e0ce7 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -425,9 +425,8 @@ restart: /*
* page), check for an internal page split race.
*/
if (pindex->entries == base) {
-append: if (parent_pindex != NULL &&
- __wt_split_intl_race(
- session, current->home, parent_pindex))
+append: if (__wt_split_descent_race(
+ session, current, parent_pindex))
goto restart;
}
diff --git a/src/include/btree.i b/src/include/btree.i
index 16906aa7cfa..422e67a1ebe 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -1446,15 +1446,19 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
}
/*
- * __wt_split_intl_race --
+ * __wt_split_descent_race --
* Return if we raced with an internal page split when descending the tree.
*/
static inline bool
-__wt_split_intl_race(
- WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE_INDEX *saved_pindex)
+__wt_split_descent_race(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX *saved_pindex)
{
WT_PAGE_INDEX *pindex;
+ /* No test when starting the descent (there's no home to check). */
+ if (__wt_ref_is_root(ref))
+ return (false);
+
/*
* A place to hang this comment...
*
@@ -1509,6 +1513,6 @@ __wt_split_intl_race(
* content the split page retains after the split, and we ignore this
* race.
*/
- WT_INTL_INDEX_GET(session, parent, pindex);
+ WT_INTL_INDEX_GET(session, ref->home, pindex);
return (pindex != saved_pindex);
}