summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@mongodb.com>2016-02-19 15:36:42 +1100
committerMichael Cahill <michael.cahill@mongodb.com>2016-02-19 15:36:47 +1100
commit70db6ed51f90f627570de9bf32ab8c5cd23886ca (patch)
tree1f5ae19d38fad8488657ae80ae9014b832ec858e /src/third_party/wiredtiger/src/btree
parentda2441b59b742c077306be6515c999c33cd955a6 (diff)
downloadmongo-70db6ed51f90f627570de9bf32ab8c5cd23886ca.tar.gz
Import wiredtiger-wiredtiger-2.7.0-675-g4f38287.tar.gz from wiredtiger branch mongodb-3.4
ref: cc96d99..4f38287 SERVER-22676 WiredTiger fails to open databases created by 3.0.0 or 3.0.1 WT-2280 Add CRC32 Optimized code for PPC64LE WT-2295 WT_SESSION.create does a full-scan of the main table WT-2346 Don't hold schema lock during checkpoint I/O WT-2361 Column-store starting record number error WT-2367 WT_CURSOR.next out-of-order returns failure WT-2374 Read error on index file WT-2375 Need tests for collators WT-2382 Problem with custom collator for 'u' format with join cursor WT-2387 Fix cursor random unit test on Windows WT-2390 OS X build is broken WT-2393 Unnecessary error handling labels. WT-2396 Jenkins Spinlock GCC task Hung WT-2397 Cursor traversal from end of the tree skips records. WT-2399 Add test case that verifies cursor traversal WT-2411 LSM drop hang
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c3
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c60
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c14
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c153
-rw-r--r--src/third_party/wiredtiger/src/btree/col_modify.c22
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c34
-rw-r--r--src/third_party/wiredtiger/src/btree/row_srch.c135
7 files changed, 276 insertions, 145 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index a083ec4016e..7475c0f1312 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -51,7 +51,8 @@ restart:
if (cbt->btree->type == BTREE_ROW) {
key.data = WT_INSERT_KEY(current);
key.size = WT_INSERT_KEY_SIZE(current);
- WT_RET(__wt_search_insert(session, cbt, &key));
+ WT_RET(__wt_search_insert(
+ session, cbt, cbt->ins_head, &key));
} else
cbt->ins = __col_insert_search(cbt->ins_head,
cbt->ins_stack, cbt->next_stack,
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index bd38451d5d1..3dea03316ce 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -1383,11 +1383,27 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref)
static int
__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
{
+ WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *parent;
WT_REF *ref;
bool parent_hazard;
+ btree = S2BT(session);
+
+ /*
+ * Disallow internal splits during the final pass of a checkpoint. Most
+ * splits are already disallowed during checkpoints, but an important
+ * exception is insert splits. The danger is an insert split creates a
+ * new chunk of the namespace, and then the internal split will move it
+ * to a different part of the tree where it will be written; in other
+ * words, in one part of the tree we'll skip the newly created insert
+ * split chunk, but we'll write it upon finding it in a different part
+ * of the tree.
+ */
+ if (btree->checkpointing != WT_CKPT_OFF)
+ return (__split_internal_unlock(session, page, page_hazard));
+
/*
* Page splits trickle up the tree, that is, as leaf pages grow large
* enough and are evicted, they'll split into their parent. And, as
@@ -1771,8 +1787,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
type, WT_INSERT_RECNO(moved_ins), 0, false, &right));
/*
- * The new page is dirty by definition, column-store splits update the
- * page-modify structure, so create it now.
+ * The new page is dirty by definition, plus column-store splits update
+ * the page-modify structure, so create it now.
*/
WT_ERR(__wt_page_modify_init(session, right));
__wt_page_modify_set(session, right);
@@ -1813,15 +1829,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
- * We modified the page above, which will have set the first dirty
- * transaction to the last transaction current running. However, the
- * updates we installed may be older than that. Set the first dirty
- * transaction to an impossibly old value so this page is never skipped
- * in a checkpoint.
- */
- right->modify->first_dirty_txn = WT_TXN_FIRST;
-
- /*
* Calculate how much memory we're moving: figure out how deep the skip
* list stack is for the element we are moving, and the memory used by
* the item's list of updates.
@@ -1919,6 +1926,24 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
#endif
/*
+ * We perform insert splits concurrently with checkpoints, where the
+ * requirement is a checkpoint must include either the original page
+ * or both new pages. The page we're splitting is dirty, but that's
+ * insufficient: set the first dirty transaction to an impossibly old
+ * value so this page is not skipped by a checkpoint.
+ */
+ page->modify->first_dirty_txn = WT_TXN_FIRST;
+
+ /*
+ * We modified the page above, which will have set the first dirty
+ * transaction to the last transaction current running. However, the
+ * updates we installed may be older than that. Set the first dirty
+ * transaction to an impossibly old value so this page is never skipped
+ * in a checkpoint.
+ */
+ right->modify->first_dirty_txn = WT_TXN_FIRST;
+
+ /*
* Update the page accounting.
*
* XXX
@@ -1928,10 +1953,14 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_cache_page_inmem_incr(session, right, right_incr);
/*
- * Split into the parent. On successful return, the original page is no
- * longer locked, so we cannot safely look at it.
+ * The act of splitting into the parent releases the pages for eviction;
+ * ensure the page contents are consistent.
+ */
+ WT_WRITE_BARRIER();
+
+ /*
+ * Split into the parent.
*/
- page = NULL;
if ((ret = __split_parent(
session, ref, split_ref, 2, parent_incr, false, true)) == 0)
return (0);
@@ -1941,7 +1970,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*
* Reset the split column-store page record.
*/
- page->modify->mod_split_recno = WT_RECNO_OOB;
+ if (type != WT_PAGE_ROW_LEAF)
+ page->modify->mod_split_recno = WT_RECNO_OOB;
/*
* Clear the allocated page's reference to the moved insert list element
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index 5cbd8d1e996..bbfb06c636f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -105,13 +105,13 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
__wt_spin_lock(session, &btree->flush_lock);
/*
- * When internal pages are being reconciled by checkpoint their
- * child pages cannot disappear from underneath them or be split
- * into them, nor can underlying blocks be freed until the block
- * lists for the checkpoint are stable. Set the checkpointing
- * flag to block eviction of dirty pages until the checkpoint's
- * internal page pass is complete, then wait for any existing
- * eviction to complete.
+ * In the final checkpoint pass, child pages cannot be evicted
+ * from underneath internal pages nor can underlying blocks be
+ * freed until the checkpoint's block lists are stable. Also,
+ * we cannot split child pages into parents unless we know the
+ * final pass will write a consistent view of that namespace.
+ * Set the checkpointing flag to block such actions and wait for
+ * any problematic eviction or page splits to complete.
*/
WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index d7785c689d9..55b11d7b2d1 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -89,11 +89,11 @@ __ref_is_leaf(WT_REF *ref)
}
/*
- * __page_ascend --
+ * __ref_ascend --
* Ascend the tree one level.
*/
-static void
-__page_ascend(WT_SESSION_IMPL *session,
+static inline void
+__ref_ascend(WT_SESSION_IMPL *session,
WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
WT_REF *parent_ref, *ref;
@@ -163,12 +163,12 @@ __page_ascend(WT_SESSION_IMPL *session,
}
/*
- * __page_descend --
- * Descend the tree one level.
+ * __ref_descend_prev --
+ * Descend the tree one level, during a previous-cursor walk.
*/
-static void
-__page_descend(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev)
+static inline void
+__ref_descend_prev(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
WT_PAGE_INDEX *pindex;
@@ -177,9 +177,6 @@ __page_descend(WT_SESSION_IMPL *session,
* we have a hazard pointer.
*/
for (;; __wt_yield()) {
- WT_INTL_INDEX_GET(session, page, pindex);
- *slotp = prev ? pindex->entries - 1 : 0;
-
/*
* There's a split race when a cursor moving backwards through
* the tree descends the tree. If we're splitting an internal
@@ -233,21 +230,41 @@ __page_descend(WT_SESSION_IMPL *session,
* being split and part of its namespace moved. We have the
* correct page and we don't have to move, all we have to do is
* wait until the split page's page index is updated.
- *
- * No test is necessary for a next-cursor movement because we
- * do right-hand splits on internal pages and the initial part
- * of the page's namespace won't change as part of a split.
- * Instead of testing the direction boolean, do the test the
- * previous cursor movement requires in all cases, even though
- * it will always succeed for a next-cursor movement.
*/
- if (pindex->index[*slotp]->home == page)
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ if (pindex->index[pindex->entries - 1]->home == ref->page)
break;
}
*pindexp = pindex;
}
/*
+ * __ref_initial_descent_prev --
+ * Descend the tree one level, when setting up the initial cursor position
+ * for a previous-cursor walk.
+ */
+static inline bool
+__ref_initial_descent_prev(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * We're passed a child page into which we're descending, and on which
+ * we have a hazard pointer.
+ *
+ * Acquire a page index for the child page and then confirm we haven't
+ * raced with a parent split.
+ */
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ if (__wt_split_descent_race(session, ref, *pindexp))
+ return (false);
+
+ *pindexp = pindex;
+ return (true);
+}
+
+/*
* __tree_walk_internal --
* Move to the next/previous page in the tree.
*/
@@ -259,11 +276,12 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_PAGE_INDEX *pindex;
WT_REF *couple, *couple_orig, *ref;
- bool empty_internal, prev, skip;
+ bool empty_internal, initial_descent, prev, skip;
uint32_t slot;
btree = S2BT(session);
- empty_internal = false;
+ pindex = NULL;
+ empty_internal = initial_descent = false;
/*
* Tree walks are special: they look inside page structures that splits
@@ -323,22 +341,30 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
couple = couple_orig = ref = *refp;
*refp = NULL;
- /* If no page is active, begin a walk from the start of the tree. */
+ /* If no page is active, begin a walk from the start/end of the tree. */
if (ref == NULL) {
- ref = &btree->root;
+restart: /*
+ * We can reach here with a NULL or root reference; the release
+ * function handles them internally, don't complicate this code
+ * by calling them out.
+ */
+ WT_ERR(__wt_page_release(session, couple, flags));
+
+ couple = couple_orig = ref = &btree->root;
if (ref->page == NULL)
goto done;
+
+ initial_descent = true;
goto descend;
}
/*
- * If the active page was the root, we've reached the walk's end.
- * Release any hazard-pointer we're holding.
+ * If the active page was the root, we've reached the walk's end; we
+ * only get here if we've returned the root to our caller, so we're
+ * holding no hazard pointers.
*/
- if (__wt_ref_is_root(ref)) {
- WT_ERR(__wt_page_release(session, couple, flags));
+ if (__wt_ref_is_root(ref))
goto done;
- }
/* Figure out the current slot in the WT_REF array. */
__ref_index_slot(session, ref, &pindex, &slot);
@@ -352,7 +378,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
while ((prev && slot == 0) ||
(!prev && slot == pindex->entries - 1)) {
/* Ascend to the parent. */
- __page_ascend(session, &ref, &pindex, &slot);
+ __ref_ascend(session, &ref, &pindex, &slot);
/*
* If we got all the way through an internal page and
@@ -521,16 +547,21 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
ret = 0;
/*
+ * If a cursor is setting up at the end of the
+ * tree, we can't use our parent page's index,
+ * because it may have already split; restart
+ * the walk.
+ */
+ if (prev && initial_descent)
+ goto restart;
+
+ /*
* If a new walk that never coupled from the
* root to a new saved position in the tree,
* restart the walk.
*/
- if (couple == &btree->root) {
- ref = &btree->root;
- if (ref->page == NULL)
- goto done;
- goto descend;
- }
+ if (couple == &btree->root)
+ goto restart;
/*
* If restarting from some original position,
@@ -561,10 +592,56 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
descend: couple = ref;
empty_internal = true;
- __page_descend(
- session, ref->page, &pindex, &slot, prev);
+ /*
+ * There's a split race when a cursor is setting
+ * up at the end of the tree or moving backwards
+ * through the tree and descending a level. When
+ * splitting an internal page into its parent,
+ * we move the WT_REF structures and update the
+ * parent's page index before updating the split
+ * page's page index, and it's not an atomic
+ * update. A thread can read the parent page's
+ * replacement page index, then read the split
+ * page's original index, or the parent page's
+ * original and the split page's replacement.
+ *
+ * This isn't a problem for a cursor setting up
+ * at the start of the tree or moving forwards
+ * through the tree because we do right-hand
+ * splits on internal pages and the initial part
+ * of the split page's namespace won't change as
+ * part of a split. A thread reading the parent
+ * page's and split page's indexes will move to
+ * the same slot no matter what order of indexes
+ * are read.
+ *
+ * Handle a cursor setting up at the end of the
+ * tree or moving backwards through the tree.
+ */
+ if (!prev) {
+ WT_INTL_INDEX_GET(
+ session, ref->page, pindex);
+ slot = 0;
+ } else if (initial_descent) {
+ if (!__ref_initial_descent_prev(
+ session, ref, &pindex))
+ goto restart;
+ slot = pindex->entries - 1;
+ } else {
+ __ref_descend_prev(
+ session, ref, &pindex);
+ slot = pindex->entries - 1;
+ }
} else {
/*
+ * At the lowest tree level (considering a leaf
+ * page), turn off the initial-descent state.
+ * Descent race tests are different when moving
+ * through the tree vs. the initial descent.
+ */
+ initial_descent = false;
+
+ /*
* Optionally skip leaf pages, the second half.
* We didn't have an on-page cell to figure out
* if it was a leaf page, we had to acquire the
@@ -605,7 +682,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
/*
* __wt_tree_walk_count --
* Move to the next/previous page in the tree, tracking how many
- * references were visited to get there.
+ * references were visited to get there.
*/
int
__wt_tree_walk_count(WT_SESSION_IMPL *session,
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
index 645d98d9c9b..fd60b12538a 100644
--- a/src/third_party/wiredtiger/src/btree/col_modify.c
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -25,6 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
WT_INSERT_HEAD *ins_head, **ins_headp;
WT_ITEM _value;
WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
WT_UPDATE *old_upd, *upd;
size_t ins_size, upd_size;
u_int i, skipdepth;
@@ -60,6 +61,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* If we don't yet have a modify structure, we'll need one. */
WT_RET(__wt_page_modify_init(session, page));
+ mod = page->modify;
/*
* Delete, insert or update a column-store entry.
@@ -105,17 +107,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Allocate the append/update list reference as necessary. */
if (append) {
WT_PAGE_ALLOC_AND_SWAP(session,
- page, page->modify->mod_append, ins_headp, 1);
- ins_headp = &page->modify->mod_append[0];
+ page, mod->mod_append, ins_headp, 1);
+ ins_headp = &mod->mod_append[0];
} else if (page->type == WT_PAGE_COL_FIX) {
WT_PAGE_ALLOC_AND_SWAP(session,
- page, page->modify->mod_update, ins_headp, 1);
- ins_headp = &page->modify->mod_update[0];
+ page, mod->mod_update, ins_headp, 1);
+ ins_headp = &mod->mod_update[0];
} else {
WT_PAGE_ALLOC_AND_SWAP(session,
- page, page->modify->mod_update, ins_headp,
+ page, mod->mod_update, ins_headp,
page->pg_var_entries);
- ins_headp = &page->modify->mod_update[cbt->slot];
+ ins_headp = &mod->mod_update[cbt->slot];
}
/* Allocate the WT_INSERT_HEAD structure as necessary. */
@@ -135,6 +137,14 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
cbt->ins_head = ins_head;
cbt->ins = ins;
+ /*
+ * Check for insert split and checkpoint races in column-store:
+ * it's easy (as opposed to in row-store) and a difficult bug to
+ * otherwise diagnose.
+ */
+ WT_ASSERT(session, mod->mod_split_recno == WT_RECNO_OOB ||
+ (recno != WT_RECNO_OOB && mod->mod_split_recno > recno));
+
if (upd_arg == NULL) {
WT_ERR(
__wt_update_alloc(session, value, &upd, &upd_size));
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index cb5a227495f..23eae75ec2b 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -77,6 +77,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
int depth;
btree = S2BT(session);
+ current = NULL;
__cursor_pos_clear(cbt);
@@ -116,12 +117,19 @@ __wt_col_search(WT_SESSION_IMPL *session,
goto leaf_only;
}
-restart_root:
+ if (0) {
+restart: /*
+ * Discard the currently held page and restart the search from
+ * the root.
+ */
+ WT_RET(__wt_page_release(session, current, 0));
+ }
+
/* Search the internal pages of the tree. */
current = &btree->root;
for (depth = 2, pindex = NULL;; ++depth) {
parent_pindex = pindex;
-restart_page: page = current->page;
+ page = current->page;
if (page->type != WT_PAGE_COL_INT)
break;
@@ -137,12 +145,10 @@ restart_page: page = current->page;
* If on the last slot (the key is larger than any key
* on the page), check for an internal page split race.
*/
- if (parent_pindex != NULL &&
- __wt_split_intl_race(
- session, current->home, parent_pindex)) {
- WT_RET(__wt_page_release(session, current, 0));
- goto restart_root;
- }
+ if (__wt_split_descent_race(
+ session, current, parent_pindex))
+ goto restart;
+
goto descend;
}
@@ -178,8 +184,14 @@ descend: /*
/*
* Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search in the current
- * page; otherwise return on error, the swap call ensures we're
+ * while we're retrieving it, restart the search at the root.
+ * We cannot restart in the "current" page; for example, if a
+ * thread is appending to the tree, the page it's waiting for
+ * did an insert-split into the parent, then the parent split
+ * into its parent, the name space we are searching for may have
+ * moved above the current page in the tree.
+ *
+ * On other error, simply return, the swap call ensures we're
* holding nothing on failure.
*/
if ((ret = __wt_page_swap(
@@ -188,7 +200,7 @@ descend: /*
continue;
}
if (ret == WT_RESTART)
- goto restart_page;
+ goto restart;
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 71564a7b3c5..9d68c8e0ce7 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -9,18 +9,17 @@
#include "wt_internal.h"
/*
- * __wt_search_insert_append --
+ * __search_insert_append --
* Fast append search of a row-store insert list, creating a skiplist stack
* as we go.
*/
static inline int
-__wt_search_insert_append(WT_SESSION_IMPL *session,
- WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool *donep)
+__search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key, bool *donep)
{
WT_BTREE *btree;
WT_COLLATOR *collator;
WT_INSERT *ins;
- WT_INSERT_HEAD *inshead;
WT_ITEM key;
int cmp, i;
@@ -28,8 +27,7 @@ __wt_search_insert_append(WT_SESSION_IMPL *session,
collator = btree->collator;
*donep = 0;
- inshead = cbt->ins_head;
- if ((ins = WT_SKIP_LAST(inshead)) == NULL)
+ if ((ins = WT_SKIP_LAST(ins_head)) == NULL)
return (0);
key.data = WT_INSERT_KEY(ins);
key.size = WT_INSERT_KEY_SIZE(ins);
@@ -48,12 +46,13 @@ __wt_search_insert_append(WT_SESSION_IMPL *session,
*/
for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) {
cbt->ins_stack[i] = (i == 0) ? &ins->next[0] :
- (inshead->tail[i] != NULL) ?
- &inshead->tail[i]->next[i] : &inshead->head[i];
+ (ins_head->tail[i] != NULL) ?
+ &ins_head->tail[i]->next[i] : &ins_head->head[i];
cbt->next_stack[i] = NULL;
}
cbt->compare = -cmp;
cbt->ins = ins;
+ cbt->ins_head = ins_head;
*donep = 1;
}
return (0);
@@ -64,20 +63,18 @@ __wt_search_insert_append(WT_SESSION_IMPL *session,
* Search a row-store insert list, creating a skiplist stack as we go.
*/
int
-__wt_search_insert(
- WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key)
+__wt_search_insert(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key)
{
WT_BTREE *btree;
WT_COLLATOR *collator;
WT_INSERT *ins, **insp, *last_ins;
- WT_INSERT_HEAD *inshead;
WT_ITEM key;
size_t match, skiphigh, skiplow;
int cmp, i;
btree = S2BT(session);
collator = btree->collator;
- inshead = cbt->ins_head;
cmp = 0; /* -Wuninitialized */
/*
@@ -86,7 +83,7 @@ __wt_search_insert(
*/
match = skiphigh = skiplow = 0;
ins = last_ins = NULL;
- for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) {
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) {
if ((ins = *insp) == NULL) {
cbt->next_stack[i] = NULL;
cbt->ins_stack[i--] = insp--;
@@ -128,6 +125,7 @@ __wt_search_insert(
*/
cbt->compare = -cmp;
cbt->ins = (ins != NULL) ? ins : last_ins;
+ cbt->ins_head = ins_head;
return (0);
}
@@ -212,6 +210,7 @@ __wt_row_search(WT_SESSION_IMPL *session,
WT_BTREE *btree;
WT_COLLATOR *collator;
WT_DECL_RET;
+ WT_INSERT_HEAD *ins_head;
WT_ITEM *item;
WT_PAGE *page;
WT_PAGE_INDEX *pindex, *parent_pindex;
@@ -276,12 +275,20 @@ __wt_row_search(WT_SESSION_IMPL *session,
goto leaf_only;
}
+ if (0) {
+restart: /*
+ * Discard the currently held page and restart the search from
+ * the root.
+ */
+ WT_RET(__wt_page_release(session, current, 0));
+ skiphigh = skiplow = 0;
+ }
+
/* Search the internal pages of the tree. */
-restart_root:
current = &btree->root;
for (depth = 2, pindex = NULL;; ++depth) {
parent_pindex = pindex;
-restart_page: page = current->page;
+ page = current->page;
if (page->type != WT_PAGE_ROW_INT)
break;
@@ -418,22 +425,21 @@ restart_page: page = current->page;
* page), check for an internal page split race.
*/
if (pindex->entries == base) {
-append: if (parent_pindex != NULL &&
- __wt_split_intl_race(
- session, current->home, parent_pindex)) {
- if ((ret = __wt_page_release(
- session, current, 0)) != 0)
- return (ret);
-
- skiplow = skiphigh = 0;
- goto restart_root;
- }
+append: if (__wt_split_descent_race(
+ session, current, parent_pindex))
+ goto restart;
}
descend: /*
* Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search in the current
- * page; otherwise return on error, the swap call ensures we're
+ * while we're retrieving it, restart the search at the root.
+ * We cannot restart in the "current" page; for example, if a
+ * thread is appending to the tree, the page it's waiting for
+ * did an insert-split into the parent, then the parent split
+ * into its parent, the name space we are searching for may have
+ * moved above the current page in the tree.
+ *
+ * On other error, simply return, the swap call ensures we're
* holding nothing on failure.
*/
if ((ret = __wt_page_swap(
@@ -441,10 +447,8 @@ descend: /*
current = descent;
continue;
}
- if (ret == WT_RESTART) {
- skiphigh = skiplow = 0;
- goto restart_page;
- }
+ if (ret == WT_RESTART)
+ goto restart;
return (ret);
}
@@ -480,24 +484,18 @@ leaf_only:
cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ ins_head = WT_ROW_INSERT_SMALLEST(page);
} else {
cbt->slot = WT_ROW_SLOT(page,
page->pg_row_d + (page->pg_row_entries - 1));
- cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+ ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
}
- WT_ERR(
- __wt_search_insert_append(session, cbt, srch_key, &done));
+ WT_ERR(__search_insert_append(
+ session, cbt, ins_head, srch_key, &done));
if (done)
return (0);
-
- /*
- * Don't leave the insert list head set, code external to the
- * search uses it.
- */
- cbt->ins_head = NULL;
}
/*
@@ -590,16 +588,16 @@ leaf_match: cbt->compare = 0;
cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ ins_head = WT_ROW_INSERT_SMALLEST(page);
} else {
cbt->compare = -1;
cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1));
- cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+ ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
}
/* If there's no insert list, we're done. */
- if (WT_SKIP_FIRST(cbt->ins_head) == NULL)
+ if (WT_SKIP_FIRST(ins_head) == NULL)
return (0);
/*
@@ -607,12 +605,12 @@ leaf_match: cbt->compare = 0;
* catch cursors repeatedly inserting at a single point.
*/
if (insert) {
- WT_ERR(
- __wt_search_insert_append(session, cbt, srch_key, &done));
+ WT_ERR(__search_insert_append(
+ session, cbt, ins_head, srch_key, &done));
if (done)
return (0);
}
- WT_ERR(__wt_search_insert(session, cbt, srch_key));
+ WT_ERR(__wt_search_insert(session, cbt, ins_head, srch_key));
return (0);
@@ -661,19 +659,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
/*
* If the tree is new (and not empty), it might have a large insert
* list.
- */
- F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
- if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
- return (WT_NOTFOUND);
-
- /*
+ *
* Walk down the list until we find a level with at least 50 entries,
* that's where we'll start rolling random numbers. The value 50 is
* used to ignore levels with only a few entries, that is, levels which
* are potentially badly skewed.
*/
- for (ins_head = cbt->ins_head,
- level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ return (WT_NOTFOUND);
+ for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
start = &ins_head->head[level];
for (entries = 0, stop = start;
*stop != NULL; stop = &(*stop)->next[level])
@@ -768,6 +763,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
ins = ins->next[0];
cbt->ins = ins;
+ cbt->ins_head = ins_head;
cbt->compare = 0;
return (0);
@@ -787,11 +783,19 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
WT_REF *current, *descent;
btree = S2BT(session);
+ current = NULL;
__cursor_pos_clear(cbt);
-restart_root:
- /* Walk the internal pages of the tree. */
+ if (0) {
+restart: /*
+ * Discard the currently held page and restart the search from
+ * the root.
+ */
+ WT_RET(__wt_page_release(session, current, 0));
+ }
+
+ /* Search the internal pages of the tree. */
current = &btree->root;
for (;;) {
page = current->page;
@@ -803,22 +807,19 @@ restart_root:
__wt_random(&session->rnd) % pindex->entries];
/*
- * Swap the parent page for the child page; return on error,
- * the swap function ensures we're holding nothing on failure.
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search at the root.
+ *
+ * On other error, simply return, the swap call ensures we're
+ * holding nothing on failure.
*/
if ((ret = __wt_page_swap(
session, current, descent, WT_READ_RESTART_OK)) == 0) {
current = descent;
continue;
}
- /*
- * Restart is returned if we find a page that's been split; the
- * held page isn't discarded when restart is returned, discard
- * it and restart the search from the top of the tree.
- */
- if (ret == WT_RESTART &&
- (ret = __wt_page_release(session, current, 0)) == 0)
- goto restart_root;
+ if (ret == WT_RESTART)
+ goto restart;
return (ret);
}