summaryrefslogtreecommitdiff
path: root/src/btree/row_srch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/btree/row_srch.c')
-rw-r--r--src/btree/row_srch.c147
1 files changed, 74 insertions, 73 deletions
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 28c55a4ccd0..6169a0a810a 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -9,18 +9,17 @@
#include "wt_internal.h"
/*
- * __wt_search_insert_append --
+ * __search_insert_append --
* Fast append search of a row-store insert list, creating a skiplist stack
* as we go.
*/
static inline int
-__wt_search_insert_append(WT_SESSION_IMPL *session,
- WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool *donep)
+__search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key, bool *donep)
{
WT_BTREE *btree;
WT_COLLATOR *collator;
WT_INSERT *ins;
- WT_INSERT_HEAD *inshead;
WT_ITEM key;
int cmp, i;
@@ -28,8 +27,7 @@ __wt_search_insert_append(WT_SESSION_IMPL *session,
collator = btree->collator;
*donep = 0;
- inshead = cbt->ins_head;
- if ((ins = WT_SKIP_LAST(inshead)) == NULL)
+ if ((ins = WT_SKIP_LAST(ins_head)) == NULL)
return (0);
key.data = WT_INSERT_KEY(ins);
key.size = WT_INSERT_KEY_SIZE(ins);
@@ -48,12 +46,13 @@ __wt_search_insert_append(WT_SESSION_IMPL *session,
*/
for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) {
cbt->ins_stack[i] = (i == 0) ? &ins->next[0] :
- (inshead->tail[i] != NULL) ?
- &inshead->tail[i]->next[i] : &inshead->head[i];
+ (ins_head->tail[i] != NULL) ?
+ &ins_head->tail[i]->next[i] : &ins_head->head[i];
cbt->next_stack[i] = NULL;
}
cbt->compare = -cmp;
cbt->ins = ins;
+ cbt->ins_head = ins_head;
*donep = 1;
}
return (0);
@@ -64,20 +63,18 @@ __wt_search_insert_append(WT_SESSION_IMPL *session,
* Search a row-store insert list, creating a skiplist stack as we go.
*/
int
-__wt_search_insert(
- WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key)
+__wt_search_insert(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key)
{
WT_BTREE *btree;
WT_COLLATOR *collator;
WT_INSERT *ins, **insp, *last_ins;
- WT_INSERT_HEAD *inshead;
WT_ITEM key;
size_t match, skiphigh, skiplow;
int cmp, i;
btree = S2BT(session);
collator = btree->collator;
- inshead = cbt->ins_head;
cmp = 0; /* -Wuninitialized */
/*
@@ -86,7 +83,7 @@ __wt_search_insert(
*/
match = skiphigh = skiplow = 0;
ins = last_ins = NULL;
- for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) {
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) {
if ((ins = *insp) == NULL) {
cbt->next_stack[i] = NULL;
cbt->ins_stack[i--] = insp--;
@@ -128,6 +125,7 @@ __wt_search_insert(
*/
cbt->compare = -cmp;
cbt->ins = (ins != NULL) ? ins : last_ins;
+ cbt->ins_head = ins_head;
return (0);
}
@@ -212,6 +210,7 @@ __wt_row_search(WT_SESSION_IMPL *session,
WT_BTREE *btree;
WT_COLLATOR *collator;
WT_DECL_RET;
+ WT_INSERT_HEAD *ins_head;
WT_ITEM *item;
WT_PAGE *page;
WT_PAGE_INDEX *pindex, *parent_pindex;
@@ -276,12 +275,20 @@ __wt_row_search(WT_SESSION_IMPL *session,
goto leaf_only;
}
+ if (0) {
+restart: /*
+ * Discard the currently held page and restart the search from
+ * the root.
+ */
+ WT_RET(__wt_page_release(session, current, 0));
+ skiphigh = skiplow = 0;
+ }
+
/* Search the internal pages of the tree. */
-restart_root:
current = &btree->root;
for (depth = 2, pindex = NULL;; ++depth) {
parent_pindex = pindex;
-restart_page: page = current->page;
+ page = current->page;
if (page->type != WT_PAGE_ROW_INT)
break;
@@ -419,20 +426,20 @@ restart_page: page = current->page;
*/
if (pindex->entries == base) {
append: if (__wt_split_descent_race(
- session, current, parent_pindex)) {
- if ((ret = __wt_page_release(
- session, current, 0)) != 0)
- return (ret);
-
- skiplow = skiphigh = 0;
- goto restart_root;
- }
+ session, current, parent_pindex))
+ goto restart;
}
descend: /*
* Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search in the current
- * page; otherwise return on error, the swap call ensures we're
+ * while we're retrieving it, restart the search at the root.
+ * We cannot restart in the "current" page; for example, if a
+ * thread is appending to the tree, the page it's waiting for
+ * did an insert-split into the parent, then the parent split
+ * into its parent, the name space we are searching for may have
+ * moved above the current page in the tree.
+ *
+ * On other error, simply return, the swap call ensures we're
* holding nothing on failure.
*/
if ((ret = __wt_page_swap(
@@ -440,10 +447,8 @@ descend: /*
current = descent;
continue;
}
- if (ret == WT_RESTART) {
- skiphigh = skiplow = 0;
- goto restart_page;
- }
+ if (ret == WT_RESTART)
+ goto restart;
return (ret);
}
@@ -456,6 +461,12 @@ leaf_only:
cbt->ref = current;
/*
+ * Clear current now that we have moved the reference into the btree
+ * cursor, so that cleanup never releases twice.
+ */
+ current = NULL;
+
+ /*
* In the case of a right-side tree descent during an insert, do a fast
* check for an append to the page, try to catch cursors appending data
* into the tree.
@@ -479,24 +490,18 @@ leaf_only:
cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ ins_head = WT_ROW_INSERT_SMALLEST(page);
} else {
cbt->slot = WT_ROW_SLOT(page,
page->pg_row_d + (page->pg_row_entries - 1));
- cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+ ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
}
- WT_ERR(
- __wt_search_insert_append(session, cbt, srch_key, &done));
+ WT_ERR(__search_insert_append(
+ session, cbt, ins_head, srch_key, &done));
if (done)
return (0);
-
- /*
- * Don't leave the insert list head set, code external to the
- * search uses it.
- */
- cbt->ins_head = NULL;
}
/*
@@ -589,16 +594,16 @@ leaf_match: cbt->compare = 0;
cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ ins_head = WT_ROW_INSERT_SMALLEST(page);
} else {
cbt->compare = -1;
cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1));
- cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+ ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
}
/* If there's no insert list, we're done. */
- if (WT_SKIP_FIRST(cbt->ins_head) == NULL)
+ if (WT_SKIP_FIRST(ins_head) == NULL)
return (0);
/*
@@ -606,23 +611,16 @@ leaf_match: cbt->compare = 0;
* catch cursors repeatedly inserting at a single point.
*/
if (insert) {
- WT_ERR(
- __wt_search_insert_append(session, cbt, srch_key, &done));
+ WT_ERR(__search_insert_append(
+ session, cbt, ins_head, srch_key, &done));
if (done)
return (0);
}
- WT_ERR(__wt_search_insert(session, cbt, srch_key));
+ WT_ERR(__wt_search_insert(session, cbt, ins_head, srch_key));
return (0);
-err: /*
- * Release the current page if the search started at the root. If the
- * search didn't start at the root we should never have gone looking
- * beyond the start page.
- */
- WT_ASSERT(session, leaf == NULL || leaf == current);
- if (leaf == NULL)
- WT_TRET(__wt_page_release(session, current, 0));
+err: WT_TRET(__wt_page_release(session, current, 0));
return (ret);
}
@@ -660,19 +658,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
/*
* If the tree is new (and not empty), it might have a large insert
* list.
- */
- F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
- return (WT_NOTFOUND);
-
- /*
+ *
* Walk down the list until we find a level with at least 50 entries,
* that's where we'll start rolling random numbers. The value 50 is
* used to ignore levels with only a few entries, that is, levels which
* are potentially badly skewed.
*/
- for (ins_head = cbt->ins_head,
- level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ return (WT_NOTFOUND);
+ for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
start = &ins_head->head[level];
for (entries = 0, stop = start;
*stop != NULL; stop = &(*stop)->next[level])
@@ -767,6 +762,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
ins = ins->next[0];
cbt->ins = ins;
+ cbt->ins_head = ins_head;
cbt->compare = 0;
return (0);
@@ -786,11 +782,19 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
WT_REF *current, *descent;
btree = S2BT(session);
+ current = NULL;
__cursor_pos_clear(cbt);
-restart_root:
- /* Walk the internal pages of the tree. */
+ if (0) {
+restart: /*
+ * Discard the currently held page and restart the search from
+ * the root.
+ */
+ WT_RET(__wt_page_release(session, current, 0));
+ }
+
+ /* Search the internal pages of the tree. */
current = &btree->root;
for (;;) {
page = current->page;
@@ -802,22 +806,19 @@ restart_root:
__wt_random(&session->rnd) % pindex->entries];
/*
- * Swap the parent page for the child page; return on error,
- * the swap function ensures we're holding nothing on failure.
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search at the root.
+ *
+ * On other error, simply return, the swap call ensures we're
+ * holding nothing on failure.
*/
if ((ret = __wt_page_swap(
session, current, descent, WT_READ_RESTART_OK)) == 0) {
current = descent;
continue;
}
- /*
- * Restart is returned if we find a page that's been split; the
- * held page isn't discarded when restart is returned, discard
- * it and restart the search from the top of the tree.
- */
- if (ret == WT_RESTART &&
- (ret = __wt_page_release(session, current, 0)) == 0)
- goto restart_root;
+ if (ret == WT_RESTART)
+ goto restart;
return (ret);
}