summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger/src/btree
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_handle.c | 4
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c | 34
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_walk.c | 157
-rw-r--r-- src/third_party/wiredtiger/src/btree/col_srch.c | 6
-rw-r--r-- src/third_party/wiredtiger/src/btree/row_srch.c | 36
5 files changed, 166 insertions, 71 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 7f0f37d95d6..2db3ca7d984 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -329,7 +329,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
* always inherit from the connection.
*/
WT_RET(__wt_config_gets(session, cfg, "encryption.name", &cval));
- if (WT_IS_METADATA(btree->dhandle) || cval.len == 0)
+ if (WT_IS_METADATA(session, btree->dhandle) || cval.len == 0)
btree->kencryptor = conn->kencryptor;
else if (WT_STRING_MATCH("none", cval.str, cval.len))
btree->kencryptor = NULL;
@@ -420,7 +420,7 @@ __wt_btree_tree_open(
* Failure to open metadata means that the database is unavailable.
* Try to provide a helpful failure message.
*/
- if (ret != 0 && WT_IS_METADATA(session->dhandle)) {
+ if (ret != 0 && WT_IS_METADATA(session, session->dhandle)) {
__wt_errx(session,
"WiredTiger has failed to open its metadata");
__wt_errx(session, "This may be due to the database"
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index ac83a21ac6f..bd38451d5d1 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -875,16 +875,24 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* The split is complete and correct, ignore benign errors. */
complete = WT_ERR_IGNORE;
- WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32
- " (%s%" PRIu32 ")",
- ref->page, ref->page == NULL ?
- "unknown page type" : __wt_page_type_string(ref->page->type),
- ref->page == NULL ? "reverse " : "", parent,
- parent_entries, result_entries,
- ref->page == NULL ? "-" : "+",
- ref->page == NULL ?
- parent_entries - result_entries : result_entries - parent_entries));
+ /*
+ * !!!
+ * Swapping in the new page index released the page for eviction, we can
+ * no longer look inside the page.
+ */
+
+ if (ref->page == NULL)
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32
+ " (-%" PRIu32 ")",
+ ref->page, parent, parent_entries, result_entries,
+ parent_entries - result_entries));
+ else
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: split into parent %p, %" PRIu32 " -> %" PRIu32
+ " (+%" PRIu32 ")",
+ ref->page, parent, parent_entries, result_entries,
+ result_entries - parent_entries));
/*
* The new page index is in place, free the WT_REF we were splitting and
@@ -935,8 +943,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
parent_decr += sizeof(WT_REF);
}
- /* We freed the reference that was split in the loop above. */
- ref = NULL;
+ /*
+ * !!!
+ * The original WT_REF has now been freed, we can no longer look at it.
+ */
/*
* We can't free the previous page index, there may be threads using it.
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index 49a59b89552..55b11d7b2d1 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -89,11 +89,11 @@ __ref_is_leaf(WT_REF *ref)
}
/*
- * __page_ascend --
+ * __ref_ascend --
* Ascend the tree one level.
*/
-static void
-__page_ascend(WT_SESSION_IMPL *session,
+static inline void
+__ref_ascend(WT_SESSION_IMPL *session,
WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
WT_REF *parent_ref, *ref;
@@ -163,23 +163,20 @@ __page_ascend(WT_SESSION_IMPL *session,
}
/*
- * __page_descend --
- * Descend the tree one level.
+ * __ref_descend_prev --
+ * Descend the tree one level, during a previous-cursor walk.
*/
-static void
-__page_descend(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev)
+static inline void
+__ref_descend_prev(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
WT_PAGE_INDEX *pindex;
/*
- * Ref is a child page into which we're descending, and on which we
- * have a hazard pointer.
+ * We're passed a child page into which we're descending, and on which
+ * we have a hazard pointer.
*/
for (;; __wt_yield()) {
- WT_INTL_INDEX_GET(session, page, pindex);
- *slotp = prev ? pindex->entries - 1 : 0;
-
/*
* There's a split race when a cursor moving backwards through
* the tree descends the tree. If we're splitting an internal
@@ -233,21 +230,41 @@ __page_descend(WT_SESSION_IMPL *session,
* being split and part of its namespace moved. We have the
* correct page and we don't have to move, all we have to do is
* wait until the split page's page index is updated.
- *
- * No test is necessary for a next-cursor movement because we
- * do right-hand splits on internal pages and the initial part
- * of the page's namespace won't change as part of a split.
- * Instead of testing the direction boolean, do the test the
- * previous cursor movement requires in all cases, even though
- * it will always succeed for a next-cursor movement.
*/
- if (pindex->index[*slotp]->home == page)
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ if (pindex->index[pindex->entries - 1]->home == ref->page)
break;
}
*pindexp = pindex;
}
/*
+ * __ref_initial_descent_prev --
+ * Descend the tree one level, when setting up the initial cursor position
+ * for a previous-cursor walk.
+ */
+static inline bool
+__ref_initial_descent_prev(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * We're passed a child page into which we're descending, and on which
+ * we have a hazard pointer.
+ *
+ * Acquire a page index for the child page and then confirm we haven't
+ * raced with a parent split.
+ */
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ if (__wt_split_descent_race(session, ref, *pindexp))
+ return (false);
+
+ *pindexp = pindex;
+ return (true);
+}
+
+/*
* __tree_walk_internal --
* Move to the next/previous page in the tree.
*/
@@ -259,11 +276,12 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_PAGE_INDEX *pindex;
WT_REF *couple, *couple_orig, *ref;
- bool empty_internal, prev, skip;
+ bool empty_internal, initial_descent, prev, skip;
uint32_t slot;
btree = S2BT(session);
- empty_internal = false;
+ pindex = NULL;
+ empty_internal = initial_descent = false;
/*
* Tree walks are special: they look inside page structures that splits
@@ -323,22 +341,30 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
couple = couple_orig = ref = *refp;
*refp = NULL;
- /* If no page is active, begin a walk from the start of the tree. */
+ /* If no page is active, begin a walk from the start/end of the tree. */
if (ref == NULL) {
- ref = &btree->root;
+restart: /*
+ * We can reach here with a NULL or root reference; the release
+ * function handles them internally, don't complicate this code
+ * by calling them out.
+ */
+ WT_ERR(__wt_page_release(session, couple, flags));
+
+ couple = couple_orig = ref = &btree->root;
if (ref->page == NULL)
goto done;
+
+ initial_descent = true;
goto descend;
}
/*
- * If the active page was the root, we've reached the walk's end.
- * Release any hazard-pointer we're holding.
+ * If the active page was the root, we've reached the walk's end; we
+ * only get here if we've returned the root to our caller, so we're
+ * holding no hazard pointers.
*/
- if (__wt_ref_is_root(ref)) {
- WT_ERR(__wt_page_release(session, couple, flags));
+ if (__wt_ref_is_root(ref))
goto done;
- }
/* Figure out the current slot in the WT_REF array. */
__ref_index_slot(session, ref, &pindex, &slot);
@@ -352,7 +378,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
while ((prev && slot == 0) ||
(!prev && slot == pindex->entries - 1)) {
/* Ascend to the parent. */
- __page_ascend(session, &ref, &pindex, &slot);
+ __ref_ascend(session, &ref, &pindex, &slot);
/*
* If we got all the way through an internal page and
@@ -521,16 +547,21 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
ret = 0;
/*
+ * If a cursor is setting up at the end of the
+ * tree, we can't use our parent page's index,
+ * because it may have already split; restart
+ * the walk.
+ */
+ if (prev && initial_descent)
+ goto restart;
+
+ /*
* If a new walk that never coupled from the
* root to a new saved position in the tree,
* restart the walk.
*/
- if (couple == &btree->root) {
- ref = &btree->root;
- if (ref->page == NULL)
- goto done;
- goto descend;
- }
+ if (couple == &btree->root)
+ goto restart;
/*
* If restarting from some original position,
@@ -561,10 +592,56 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
descend: couple = ref;
empty_internal = true;
- __page_descend(
- session, ref->page, &pindex, &slot, prev);
+ /*
+ * There's a split race when a cursor is setting
+ * up at the end of the tree or moving backwards
+ * through the tree and descending a level. When
+ * splitting an internal page into its parent,
+ * we move the WT_REF structures and update the
+ * parent's page index before updating the split
+ * page's page index, and it's not an atomic
+ * update. A thread can read the parent page's
+ * replacement page index, then read the split
+ * page's original index, or the parent page's
+ * original and the split page's replacement.
+ *
+ * This isn't a problem for a cursor setting up
+ * at the start of the tree or moving forwards
+ * through the tree because we do right-hand
+ * splits on internal pages and the initial part
+ * of the split page's namespace won't change as
+ * part of a split. A thread reading the parent
+ * page's and split page's indexes will move to
+ * the same slot no matter what order of indexes
+ * are read.
+ *
+ * Handle a cursor setting up at the end of the
+ * tree or moving backwards through the tree.
+ */
+ if (!prev) {
+ WT_INTL_INDEX_GET(
+ session, ref->page, pindex);
+ slot = 0;
+ } else if (initial_descent) {
+ if (!__ref_initial_descent_prev(
+ session, ref, &pindex))
+ goto restart;
+ slot = pindex->entries - 1;
+ } else {
+ __ref_descend_prev(
+ session, ref, &pindex);
+ slot = pindex->entries - 1;
+ }
} else {
/*
+ * At the lowest tree level (considering a leaf
+ * page), turn off the initial-descent state.
+ * Descent race tests are different when moving
+ * through the tree vs. the initial descent.
+ */
+ initial_descent = false;
+
+ /*
* Optionally skip leaf pages, the second half.
* We didn't have an on-page cell to figure out
* if it was a leaf page, we had to acquire the
@@ -605,7 +682,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
/*
* __wt_tree_walk_count --
* Move to the next/previous page in the tree, tracking how many
- * references were visited to get there.
+ * references were visited to get there.
*/
int
__wt_tree_walk_count(WT_SESSION_IMPL *session,
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index cb5a227495f..3aa31044b82 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -137,12 +137,12 @@ restart_page: page = current->page;
* If on the last slot (the key is larger than any key
* on the page), check for an internal page split race.
*/
- if (parent_pindex != NULL &&
- __wt_split_intl_race(
- session, current->home, parent_pindex)) {
+ if (__wt_split_descent_race(
+ session, current, parent_pindex)) {
WT_RET(__wt_page_release(session, current, 0));
goto restart_root;
}
+
goto descend;
}
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index c06274cdb17..28c55a4ccd0 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -287,9 +287,26 @@ restart_page: page = current->page;
WT_INTL_INDEX_GET(session, page, pindex);
- /* Fast-path appends. */
+ /*
+ * Fast-path appends.
+ *
+ * The 0th key on an internal page is a problem for a couple of
+ * reasons. First, we have to force the 0th key to sort less
+ * than any application key, so internal pages don't have to be
+ * updated if the application stores a new, "smallest" key in
+ * the tree. Second, reconciliation is aware of this and will
+ * store a byte of garbage in the 0th key, so the comparison of
+ * an application key and a 0th key is meaningless (but doing
+ * the comparison could still incorrectly modify our tracking
+ * of the leading bytes in each key that we can skip during the
+ * comparison). For these reasons, special-case the 0th key, and
+ * never pass it to a collator.
+ */
if (append_check) {
descent = pindex->index[pindex->entries - 1];
+
+ if (pindex->entries == 1)
+ goto append;
__wt_ref_key(page, descent, &item->data, &item->size);
WT_ERR(__wt_compare(
session, collator, srch_key, item, &cmp));
@@ -307,16 +324,8 @@ restart_page: page = current->page;
* collation order), because doing the tests and error handling
* inside the loop costs about 5%.
*
- * The 0th key on an internal page is a problem for a couple of
- * reasons. First, we have to force the 0th key to sort less
- * than any application key, so internal pages don't have to be
- * updated if the application stores a new, "smallest" key in
- * the tree. Second, reconciliation is aware of this and will
- * store a byte of garbage in the 0th key, so the comparison of
- * an application key and a 0th key is meaningless (but doing
- * the comparison could still incorrectly modify our tracking
- * of the leading bytes in each key that we can skip during the
- * comparison). For these reasons, skip the 0th key.
+ * Reference the comment above about the 0th key: we continue to
+ * special-case it.
*/
base = 1;
limit = pindex->entries - 1;
@@ -409,9 +418,8 @@ restart_page: page = current->page;
* page), check for an internal page split race.
*/
if (pindex->entries == base) {
-append: if (parent_pindex != NULL &&
- __wt_split_intl_race(
- session, current->home, parent_pindex)) {
+append: if (__wt_split_descent_race(
+ session, current, parent_pindex)) {
if ((ret = __wt_page_release(
session, current, 0)) != 0)
return (ret);