summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@mongodb.com>2017-04-18 02:40:08 -0400
committerAlex Gorrod <alexander.gorrod@mongodb.com>2017-10-03 11:17:54 +1100
commit0831e04799544167ee88c5b4dbad07d922a47d4d (patch)
tree75ded26eec39b246a019ea4cf88f5d1b91f6d0c3
parent15aacb8d1301acb926d964c214caf4ab989107a5 (diff)
downloadmongo-0831e04799544167ee88c5b4dbad07d922a47d4d.tar.gz
WT-3284 tree-walk restart bug (#3389)
The tree-walk restart code (that is, dealing with WT_RESTART being returned from a page-coupling attempt after a page split), depends on the tree-walk function never hazard coupling from a leaf page, that is, any leaf page it acquires must be returned to the caller. The "skip leaf page during a tree walk" functionality violated that requirement. Rather than rework the tree walk code's restart handling, move half of the functionality to skip a count of leaf pages into the tree-walk function's caller, essentially, it's an easy way to jump back into the tree-walk code at the right place. (cherry picked from commit 095efd2952d6c957bf468b8180ed4f495e4d7a08)
-rw-r--r--dist/flags.py1
-rw-r--r--src/btree/bt_random.c3
-rw-r--r--src/btree/bt_walk.c99
-rw-r--r--src/include/extern.h2
-rw-r--r--src/include/flags.h5
5 files changed, 52 insertions, 58 deletions
diff --git a/dist/flags.py b/dist/flags.py
index 64b5d789e72..d80c80a37ce 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -32,7 +32,6 @@ flags = {
'READ_PREV',
'READ_RESTART_OK',
'READ_SKIP_INTL',
- 'READ_SKIP_LEAF',
'READ_TRUNCATE',
'READ_WONT_NEED',
],
diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c
index c5948ec4ab5..b4f05c440ba 100644
--- a/src/btree/bt_random.c
+++ b/src/btree/bt_random.c
@@ -395,8 +395,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
*/
for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
n = skip;
- WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
- WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+ WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip));
if (n == skip) {
if (skip == 0)
break;
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 5d674c4e46b..c22b99c55d0 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -501,29 +501,21 @@ restart: /*
}
/*
- * Optionally skip leaf pages: skip all leaf pages if
- * WT_READ_SKIP_LEAF is set, when the skip-leaf-count
- * variable is non-zero, skip some count of leaf pages.
- * If this page is disk-based, crack the cell to figure
- * out it's a leaf page without reading it.
+ * Optionally skip leaf pages: when the skip-leaf-count
+ * variable is non-zero, skip some count of leaf pages,
+ * then take the next leaf page we can.
*
- * If skipping some number of leaf pages, decrement the
- * count of pages to zero, and then take the next leaf
- * page we can. Be cautious around the page decrement,
- * if for some reason don't take this particular page,
- * we can take the next one, and, there are additional
- * tests/decrements when we're about to return a leaf
- * page.
+ * The reason to do some of this work here (rather than
+ * in our caller), is because we can look at the cell
+ * and know it's a leaf page without reading it into
+ * memory. If this page is disk-based, crack the cell
+ * to figure out it's a leaf page without reading it.
*/
- if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF))
- if (__ref_is_leaf(ref)) {
- if (LF_ISSET(WT_READ_SKIP_LEAF))
- break;
- if (*skipleafcntp > 0) {
- --*skipleafcntp;
- break;
- }
- }
+ if (skipleafcntp != NULL &&
+ *skipleafcntp > 0 && __ref_is_leaf(ref)) {
+ --*skipleafcntp;
+ break;
+ }
ret = __wt_page_swap(session, couple, ref,
WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);
@@ -630,34 +622,18 @@ descend: empty_internal = true;
session, ref, &pindex);
slot = pindex->entries - 1;
}
- } else {
- /*
- * At the lowest tree level (considering a leaf
- * page), turn off the initial-descent state.
- * Descent race tests are different when moving
- * through the tree vs. the initial descent.
- */
- initial_descent = false;
-
- /*
- * Optionally skip leaf pages, the second half.
- * We didn't have an on-page cell to figure out
- * if it was a leaf page, we had to acquire the
- * hazard pointer and look at the page.
- */
- if (skipleafcntp != NULL ||
- LF_ISSET(WT_READ_SKIP_LEAF)) {
- if (LF_ISSET(WT_READ_SKIP_LEAF))
- break;
- if (*skipleafcntp > 0) {
- --*skipleafcntp;
- break;
- }
- }
-
- *refp = ref;
- goto done;
+ continue;
}
+
+ /*
+ * The tree-walk restart code knows we return any leaf
+ * page we acquire (never hazard-pointer coupling on
+ * after acquiring a leaf page), and asserts no restart
+ * happens while holding a leaf page. This page must be
+ * returned to our caller.
+ */
+ *refp = ref;
+ goto done;
}
}
@@ -694,8 +670,29 @@ __wt_tree_walk_count(WT_SESSION_IMPL *session,
* of leaf pages before returning.
*/
int
-__wt_tree_walk_skip(WT_SESSION_IMPL *session,
- WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags)
+__wt_tree_walk_skip(
+ WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp)
{
- return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags));
+ /*
+ * Optionally skip leaf pages, the second half. The tree-walk function
+ * didn't have an on-page cell it could use to figure out if the page
+ * was a leaf page or not, it had to acquire the hazard pointer and look
+ * at the page. The tree-walk code never acquires a hazard pointer on a
+ * leaf page without returning it, and it's not trivial to change that.
+ * So, the tree-walk code returns all leaf pages here and we deal with
+ * decrementing the count.
+ */
+ do {
+ WT_RET(__tree_walk_internal(session, refp, NULL, skipleafcntp,
+ WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+
+ /*
+ * The walk skipped internal pages, any page returned must be a
+ * leaf page.
+ */
+ if (*skipleafcntp > 0)
+ --*skipleafcntp;
+ } while (*skipleafcntp > 0);
+
+ return (0);
}
diff --git a/src/include/extern.h b/src/include/extern.h
index 431a83cef9d..e77de41344c 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -181,7 +181,7 @@ extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, cons
extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_tree_walk_skip( WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/include/flags.h b/src/include/flags.h
index f26a45c68f5..d7c0e0f9472 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -47,9 +47,8 @@
#define WT_READ_PREV 0x00000080
#define WT_READ_RESTART_OK 0x00000100
#define WT_READ_SKIP_INTL 0x00000200
-#define WT_READ_SKIP_LEAF 0x00000400
-#define WT_READ_TRUNCATE 0x00000800
-#define WT_READ_WONT_NEED 0x00001000
+#define WT_READ_TRUNCATE 0x00000400
+#define WT_READ_WONT_NEED 0x00000800
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_INTERNAL 0x00000002
#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004