diff options
author | Keith Bostic <keith.bostic@mongodb.com> | 2017-04-18 02:40:08 -0400 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-10-03 11:17:54 +1100 |
commit | 0831e04799544167ee88c5b4dbad07d922a47d4d (patch) | |
tree | 75ded26eec39b246a019ea4cf88f5d1b91f6d0c3 | |
parent | 15aacb8d1301acb926d964c214caf4ab989107a5 (diff) | |
download | mongo-0831e04799544167ee88c5b4dbad07d922a47d4d.tar.gz |
WT-3284 tree-walk restart bug (#3389)
The tree-walk restart code (that is, dealing with WT_RESTART being returned from a page-coupling attempt after a page split), depends on the tree-walk function never hazard coupling from a leaf page, that is, any leaf page it acquires must be returned to the caller. The "skip leaf page during a tree walk" functionality violated that requirement.
Rather than rework the tree walk code's restart handling, move half of the functionality to skip a count of leaf pages into the tree-walk function's caller, essentially, it's an easy way to jump back into the tree-walk code at the right place.
(cherry picked from commit 095efd2952d6c957bf468b8180ed4f495e4d7a08)
-rw-r--r-- | dist/flags.py | 1 | ||||
-rw-r--r-- | src/btree/bt_random.c | 3 | ||||
-rw-r--r-- | src/btree/bt_walk.c | 99 | ||||
-rw-r--r-- | src/include/extern.h | 2 | ||||
-rw-r--r-- | src/include/flags.h | 5 |
5 files changed, 52 insertions, 58 deletions
diff --git a/dist/flags.py b/dist/flags.py index 64b5d789e72..d80c80a37ce 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -32,7 +32,6 @@ flags = { 'READ_PREV', 'READ_RESTART_OK', 'READ_SKIP_INTL', - 'READ_SKIP_LEAF', 'READ_TRUNCATE', 'READ_WONT_NEED', ], diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c index c5948ec4ab5..b4f05c440ba 100644 --- a/src/btree/bt_random.c +++ b/src/btree/bt_random.c @@ -395,8 +395,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) */ for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { n = skip; - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip)); if (n == skip) { if (skip == 0) break; diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 5d674c4e46b..c22b99c55d0 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -501,29 +501,21 @@ restart: /* } /* - * Optionally skip leaf pages: skip all leaf pages if - * WT_READ_SKIP_LEAF is set, when the skip-leaf-count - * variable is non-zero, skip some count of leaf pages. - * If this page is disk-based, crack the cell to figure - * out it's a leaf page without reading it. + * Optionally skip leaf pages: when the skip-leaf-count + * variable is non-zero, skip some count of leaf pages, + * then take the next leaf page we can. * - * If skipping some number of leaf pages, decrement the - * count of pages to zero, and then take the next leaf - * page we can. Be cautious around the page decrement, - * if for some reason don't take this particular page, - * we can take the next one, and, there are additional - * tests/decrements when we're about to return a leaf - * page. + * The reason to do some of this work here (rather than + * in our caller), is because we can look at the cell + * and know it's a leaf page without reading it into + * memory. If this page is disk-based, crack the cell + * to figure out it's a leaf page without reading it. */ - if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) - if (__ref_is_leaf(ref)) { - if (LF_ISSET(WT_READ_SKIP_LEAF)) - break; - if (*skipleafcntp > 0) { - --*skipleafcntp; - break; - } - } + if (skipleafcntp != NULL && + *skipleafcntp > 0 && __ref_is_leaf(ref)) { + --*skipleafcntp; + break; + } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); @@ -630,34 +622,18 @@ descend: empty_internal = true; session, ref, &pindex); slot = pindex->entries - 1; } - } else { - /* - * At the lowest tree level (considering a leaf - * page), turn off the initial-descent state. - * Descent race tests are different when moving - * through the tree vs. the initial descent. - */ - initial_descent = false; - - /* - * Optionally skip leaf pages, the second half. - * We didn't have an on-page cell to figure out - * if it was a leaf page, we had to acquire the - * hazard pointer and look at the page. - */ - if (skipleafcntp != NULL || - LF_ISSET(WT_READ_SKIP_LEAF)) { - if (LF_ISSET(WT_READ_SKIP_LEAF)) - break; - if (*skipleafcntp > 0) { - --*skipleafcntp; - break; - } - } - - *refp = ref; - goto done; + continue; } + + /* + * The tree-walk restart code knows we return any leaf + * page we acquire (never hazard-pointer coupling on + * after acquiring a leaf page), and asserts no restart + * happens while holding a leaf page. This page must be + * returned to our caller. + */ + *refp = ref; + goto done; } } @@ -694,8 +670,29 @@ __wt_tree_walk_count(WT_SESSION_IMPL *session, * of leaf pages before returning. */ int -__wt_tree_walk_skip(WT_SESSION_IMPL *session, - WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) +__wt_tree_walk_skip( + WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) { - return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags)); + /* + * Optionally skip leaf pages, the second half. The tree-walk function + * didn't have an on-page cell it could use to figure out if the page + * was a leaf page or not, it had to acquire the hazard pointer and look + * at the page. The tree-walk code never acquires a hazard pointer on a + * leaf page without returning it, and it's not trivial to change that. + * So, the tree-walk code returns all leaf pages here and we deal with + * decrementing the count. + */ + do { + WT_RET(__tree_walk_internal(session, refp, NULL, skipleafcntp, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + + /* + * The walk skipped internal pages, any page returned must be a + * leaf page. + */ + if (*skipleafcntp > 0) + --*skipleafcntp; + } while (*skipleafcntp > 0); + + return (0); } diff --git a/src/include/extern.h b/src/include/extern.h index 431a83cef9d..e77de41344c 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -181,7 +181,7 @@ extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, cons extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_tree_walk_skip( WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/include/flags.h b/src/include/flags.h index f26a45c68f5..d7c0e0f9472 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -47,9 +47,8 @@ #define WT_READ_PREV 0x00000080 #define WT_READ_RESTART_OK 0x00000100 #define WT_READ_SKIP_INTL 0x00000200 -#define WT_READ_SKIP_LEAF 0x00000400 -#define WT_READ_TRUNCATE 0x00000800 -#define WT_READ_WONT_NEED 0x00001000 +#define WT_READ_TRUNCATE 0x00000400 +#define WT_READ_WONT_NEED 0x00000800 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 |