diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-02-19 15:36:42 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-02-19 15:36:47 +1100 |
commit | 70db6ed51f90f627570de9bf32ab8c5cd23886ca (patch) | |
tree | 1f5ae19d38fad8488657ae80ae9014b832ec858e /src/third_party/wiredtiger/src/btree | |
parent | da2441b59b742c077306be6515c999c33cd955a6 (diff) | |
download | mongo-70db6ed51f90f627570de9bf32ab8c5cd23886ca.tar.gz |
Import wiredtiger-wiredtiger-2.7.0-675-g4f38287.tar.gz from wiredtiger branch mongodb-3.4
ref: cc96d99..4f38287
SERVER-22676 WiredTiger fails to open databases created by 3.0.0 or 3.0.1
WT-2280 Add CRC32 Optimized code for PPC64LE
WT-2295 WT_SESSION.create does a full-scan of the main table
WT-2346 Don't hold schema lock during checkpoint I/O
WT-2361 Column-store starting record number error
WT-2367 WT_CURSOR.next out-of-order returns failure
WT-2374 Read error on index file
WT-2375 Need tests for collators
WT-2382 Problem with custom collator for 'u' format with join cursor
WT-2387 Fix cursor random unit test on Windows
WT-2390 OS X build is broken
WT-2393 Unnecessary error handling labels.
WT-2396 Jenkins Spinlock GCC task Hung
WT-2397 Cursor traversal from end of the tree skips records.
WT-2399 Add test case that verifies cursor traversal
WT-2411 LSM drop hang
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_curprev.c | 3 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 60 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_sync.c | 14 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_walk.c | 153 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/col_modify.c | 22 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/col_srch.c | 34 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/row_srch.c | 135 |
7 files changed, 276 insertions, 145 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index a083ec4016e..7475c0f1312 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -51,7 +51,8 @@ restart: if (cbt->btree->type == BTREE_ROW) { key.data = WT_INSERT_KEY(current); key.size = WT_INSERT_KEY_SIZE(current); - WT_RET(__wt_search_insert(session, cbt, &key)); + WT_RET(__wt_search_insert( + session, cbt, cbt->ins_head, &key)); } else cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index bd38451d5d1..3dea03316ce 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1383,11 +1383,27 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) static int __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *parent; WT_REF *ref; bool parent_hazard; + btree = S2BT(session); + + /* + * Disallow internal splits during the final pass of a checkpoint. Most + * splits are already disallowed during checkpoints, but an important + * exception is insert splits. The danger is an insert split creates a + * new chunk of the namespace, and then the internal split will move it + * to a different part of the tree where it will be written; in other + * words, in one part of the tree we'll skip the newly created insert + * split chunk, but we'll write it upon finding it in a different part + * of the tree. + */ + if (btree->checkpointing != WT_CKPT_OFF) + return (__split_internal_unlock(session, page, page_hazard)); + /* * Page splits trickle up the tree, that is, as leaf pages grow large * enough and are evicted, they'll split into their parent. And, as @@ -1771,8 +1787,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); /* - * The new page is dirty by definition, column-store splits update the - * page-modify structure, so create it now. + * The new page is dirty by definition, plus column-store splits update + * the page-modify structure, so create it now. */ WT_ERR(__wt_page_modify_init(session, right)); __wt_page_modify_set(session, right); @@ -1813,15 +1829,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * We modified the page above, which will have set the first dirty - * transaction to the last transaction current running. However, the - * updates we installed may be older than that. Set the first dirty - * transaction to an impossibly old value so this page is never skipped - * in a checkpoint. - */ - right->modify->first_dirty_txn = WT_TXN_FIRST; - - /* * Calculate how much memory we're moving: figure out how deep the skip * list stack is for the element we are moving, and the memory used by * the item's list of updates. @@ -1919,6 +1926,24 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) #endif /* + * We perform insert splits concurrently with checkpoints, where the + * requirement is a checkpoint must include either the original page + * or both new pages. The page we're splitting is dirty, but that's + * insufficient: set the first dirty transaction to an impossibly old + * value so this page is not skipped by a checkpoint. + */ + page->modify->first_dirty_txn = WT_TXN_FIRST; + + /* + * We modified the page above, which will have set the first dirty + * transaction to the last transaction current running. However, the + * updates we installed may be older than that. Set the first dirty + * transaction to an impossibly old value so this page is never skipped + * in a checkpoint. + */ + right->modify->first_dirty_txn = WT_TXN_FIRST; + + /* * Update the page accounting. * * XXX @@ -1928,10 +1953,14 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) __wt_cache_page_inmem_incr(session, right, right_incr); /* - * Split into the parent. On successful return, the original page is no - * longer locked, so we cannot safely look at it. + * The act of splitting into the parent releases the pages for eviction; + * ensure the page contents are consistent. + */ + WT_WRITE_BARRIER(); + + /* + * Split into the parent. */ - page = NULL; if ((ret = __split_parent( session, ref, split_ref, 2, parent_incr, false, true)) == 0) return (0); @@ -1941,7 +1970,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * * Reset the split column-store page record. */ - page->modify->mod_split_recno = WT_RECNO_OOB; + if (type != WT_PAGE_ROW_LEAF) + page->modify->mod_split_recno = WT_RECNO_OOB; /* * Clear the allocated page's reference to the moved insert list element diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 5cbd8d1e996..bbfb06c636f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -105,13 +105,13 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) __wt_spin_lock(session, &btree->flush_lock); /* - * When internal pages are being reconciled by checkpoint their - * child pages cannot disappear from underneath them or be split - * into them, nor can underlying blocks be freed until the block - * lists for the checkpoint are stable. Set the checkpointing - * flag to block eviction of dirty pages until the checkpoint's - * internal page pass is complete, then wait for any existing - * eviction to complete. + * In the final checkpoint pass, child pages cannot be evicted + * from underneath internal pages nor can underlying blocks be + * freed until the checkpoint's block lists are stable. Also, + * we cannot split child pages into parents unless we know the + * final pass will write a consistent view of that namespace. + * Set the checkpointing flag to block such actions and wait for + * any problematic eviction or page splits to complete. */ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index d7785c689d9..55b11d7b2d1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -89,11 +89,11 @@ __ref_is_leaf(WT_REF *ref) } /* - * __page_ascend -- + * __ref_ascend -- * Ascend the tree one level. */ -static void -__page_ascend(WT_SESSION_IMPL *session, +static inline void +__ref_ascend(WT_SESSION_IMPL *session, WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp) { WT_REF *parent_ref, *ref; @@ -163,12 +163,12 @@ __page_ascend(WT_SESSION_IMPL *session, } /* - * __page_descend -- - * Descend the tree one level. + * __ref_descend_prev -- + * Descend the tree one level, during a previous-cursor walk. */ -static void -__page_descend(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev) +static inline void +__ref_descend_prev( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) { WT_PAGE_INDEX *pindex; @@ -177,9 +177,6 @@ __page_descend(WT_SESSION_IMPL *session, * we have a hazard pointer. */ for (;; __wt_yield()) { - WT_INTL_INDEX_GET(session, page, pindex); - *slotp = prev ? pindex->entries - 1 : 0; - /* * There's a split race when a cursor moving backwards through * the tree descends the tree. If we're splitting an internal @@ -233,21 +230,41 @@ __page_descend(WT_SESSION_IMPL *session, * being split and part of its namespace moved. We have the * correct page and we don't have to move, all we have to do is * wait until the split page's page index is updated. - * - * No test is necessary for a next-cursor movement because we - * do right-hand splits on internal pages and the initial part - * of the page's namespace won't change as part of a split. - * Instead of testing the direction boolean, do the test the - * previous cursor movement requires in all cases, even though - * it will always succeed for a next-cursor movement. */ - if (pindex->index[*slotp]->home == page) + WT_INTL_INDEX_GET(session, ref->page, pindex); + if (pindex->index[pindex->entries - 1]->home == ref->page) break; } *pindexp = pindex; } /* + * __ref_initial_descent_prev -- + * Descend the tree one level, when setting up the initial cursor position + * for a previous-cursor walk. + */ +static inline bool +__ref_initial_descent_prev( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) +{ + WT_PAGE_INDEX *pindex; + + /* + * We're passed a child page into which we're descending, and on which + * we have a hazard pointer. + * + * Acquire a page index for the child page and then confirm we haven't + * raced with a parent split. + */ + WT_INTL_INDEX_GET(session, ref->page, pindex); + if (__wt_split_descent_race(session, ref, *pindexp)) + return (false); + + *pindexp = pindex; + return (true); +} + +/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ @@ -259,11 +276,12 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; - bool empty_internal, prev, skip; + bool empty_internal, initial_descent, prev, skip; uint32_t slot; btree = S2BT(session); - empty_internal = false; + pindex = NULL; + empty_internal = initial_descent = false; /* * Tree walks are special: they look inside page structures that splits @@ -323,22 +341,30 @@ __tree_walk_internal(WT_SESSION_IMPL *session, couple = couple_orig = ref = *refp; *refp = NULL; - /* If no page is active, begin a walk from the start of the tree. */ + /* If no page is active, begin a walk from the start/end of the tree. */ if (ref == NULL) { - ref = &btree->root; +restart: /* + * We can reach here with a NULL or root reference; the release + * function handles them internally, don't complicate this code + * by calling them out. + */ + WT_ERR(__wt_page_release(session, couple, flags)); + + couple = couple_orig = ref = &btree->root; if (ref->page == NULL) goto done; + + initial_descent = true; goto descend; } /* - * If the active page was the root, we've reached the walk's end. - * Release any hazard-pointer we're holding. + * If the active page was the root, we've reached the walk's end; we + * only get here if we've returned the root to our caller, so we're + * holding no hazard pointers. */ - if (__wt_ref_is_root(ref)) { - WT_ERR(__wt_page_release(session, couple, flags)); + if (__wt_ref_is_root(ref)) goto done; - } /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); @@ -352,7 +378,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ - __page_ascend(session, &ref, &pindex, &slot); + __ref_ascend(session, &ref, &pindex, &slot); /* * If we got all the way through an internal page and @@ -521,16 +547,21 @@ __tree_walk_internal(WT_SESSION_IMPL *session, ret = 0; /* + * If a cursor is setting up at the end of the + * tree, we can't use our parent page's index, + * because it may have already split; restart + * the walk. + */ + if (prev && initial_descent) + goto restart; + + /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ - if (couple == &btree->root) { - ref = &btree->root; - if (ref->page == NULL) - goto done; - goto descend; - } + if (couple == &btree->root) + goto restart; /* * If restarting from some original position, @@ -561,10 +592,56 @@ __tree_walk_internal(WT_SESSION_IMPL *session, descend: couple = ref; empty_internal = true; - __page_descend( - session, ref->page, &pindex, &slot, prev); + /* + * There's a split race when a cursor is setting + * up at the end of the tree or moving backwards + * through the tree and descending a level. When + * splitting an internal page into its parent, + * we move the WT_REF structures and update the + * parent's page index before updating the split + * page's page index, and it's not an atomic + * update. A thread can read the parent page's + * replacement page index, then read the split + * page's original index, or the parent page's + * original and the split page's replacement. + * + * This isn't a problem for a cursor setting up + * at the start of the tree or moving forwards + * through the tree because we do right-hand + * splits on internal pages and the initial part + * of the split page's namespace won't change as + * part of a split. A thread reading the parent + * page's and split page's indexes will move to + * the same slot no matter what order of indexes + * are read. + * + * Handle a cursor setting up at the end of the + * tree or moving backwards through the tree. + */ + if (!prev) { + WT_INTL_INDEX_GET( + session, ref->page, pindex); + slot = 0; + } else if (initial_descent) { + if (!__ref_initial_descent_prev( + session, ref, &pindex)) + goto restart; + slot = pindex->entries - 1; + } else { + __ref_descend_prev( + session, ref, &pindex); + slot = pindex->entries - 1; + } } else { /* + * At the lowest tree level (considering a leaf + * page), turn off the initial-descent state. + * Descent race tests are different when moving + * through the tree vs. the initial descent. + */ + initial_descent = false; + + /* * Optionally skip leaf pages, the second half. * We didn't have an on-page cell to figure out * if it was a leaf page, we had to acquire the @@ -605,7 +682,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) /* * __wt_tree_walk_count -- * Move to the next/previous page in the tree, tracking how many - * references were visited to get there. + * references were visited to get there. */ int __wt_tree_walk_count(WT_SESSION_IMPL *session, diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index 645d98d9c9b..fd60b12538a 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -25,6 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; + WT_PAGE_MODIFY *mod; WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; @@ -60,6 +61,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); + mod = page->modify; /* * Delete, insert or update a column-store entry. @@ -105,17 +107,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Allocate the append/update list reference as necessary. */ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_append, ins_headp, 1); - ins_headp = &page->modify->mod_append[0]; + page, mod->mod_append, ins_headp, 1); + ins_headp = &mod->mod_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_update, ins_headp, 1); - ins_headp = &page->modify->mod_update[0]; + page, mod->mod_update, ins_headp, 1); + ins_headp = &mod->mod_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_update, ins_headp, + page, mod->mod_update, ins_headp, page->pg_var_entries); - ins_headp = &page->modify->mod_update[cbt->slot]; + ins_headp = &mod->mod_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. */ @@ -135,6 +137,14 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins_head = ins_head; cbt->ins = ins; + /* + * Check for insert split and checkpoint races in column-store: + * it's easy (as opposed to in row-store) and a difficult bug to + * otherwise diagnose. + */ + WT_ASSERT(session, mod->mod_split_recno == WT_RECNO_OOB || + (recno != WT_RECNO_OOB && mod->mod_split_recno > recno)); + if (upd_arg == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index cb5a227495f..23eae75ec2b 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -77,6 +77,7 @@ __wt_col_search(WT_SESSION_IMPL *session, int depth; btree = S2BT(session); + current = NULL; __cursor_pos_clear(cbt); @@ -116,12 +117,19 @@ __wt_col_search(WT_SESSION_IMPL *session, goto leaf_only; } -restart_root: + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + } + /* Search the internal pages of the tree. */ current = &btree->root; for (depth = 2, pindex = NULL;; ++depth) { parent_pindex = pindex; -restart_page: page = current->page; + page = current->page; if (page->type != WT_PAGE_COL_INT) break; @@ -137,12 +145,10 @@ restart_page: page = current->page; * If on the last slot (the key is larger than any key * on the page), check for an internal page split race. */ - if (parent_pindex != NULL && - __wt_split_intl_race( - session, current->home, parent_pindex)) { - WT_RET(__wt_page_release(session, current, 0)); - goto restart_root; - } + if (__wt_split_descent_race( + session, current, parent_pindex)) + goto restart; + goto descend; } @@ -178,8 +184,14 @@ descend: /* /* * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search in the current - * page; otherwise return on error, the swap call ensures we're + * while we're retrieving it, restart the search at the root. + * We cannot restart in the "current" page; for example, if a + * thread is appending to the tree, the page it's waiting for + * did an insert-split into the parent, then the parent split + * into its parent, the name space we are searching for may have + * moved above the current page in the tree. + * + * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ if ((ret = __wt_page_swap( @@ -188,7 +200,7 @@ descend: /* continue; } if (ret == WT_RESTART) - goto restart_page; + goto restart; return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 71564a7b3c5..9d68c8e0ce7 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -9,18 +9,17 @@ #include "wt_internal.h" /* - * __wt_search_insert_append -- + * __search_insert_append -- * Fast append search of a row-store insert list, creating a skiplist stack * as we go. */ static inline int -__wt_search_insert_append(WT_SESSION_IMPL *session, - WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool *donep) +__search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, + WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key, bool *donep) { WT_BTREE *btree; WT_COLLATOR *collator; WT_INSERT *ins; - WT_INSERT_HEAD *inshead; WT_ITEM key; int cmp, i; @@ -28,8 +27,7 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, collator = btree->collator; *donep = 0; - inshead = cbt->ins_head; - if ((ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (0); key.data = WT_INSERT_KEY(ins); key.size = WT_INSERT_KEY_SIZE(ins); @@ -48,12 +46,13 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, */ for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) { cbt->ins_stack[i] = (i == 0) ? &ins->next[0] : - (inshead->tail[i] != NULL) ? - &inshead->tail[i]->next[i] : &inshead->head[i]; + (ins_head->tail[i] != NULL) ? + &ins_head->tail[i]->next[i] : &ins_head->head[i]; cbt->next_stack[i] = NULL; } cbt->compare = -cmp; cbt->ins = ins; + cbt->ins_head = ins_head; *donep = 1; } return (0); @@ -64,20 +63,18 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, * Search a row-store insert list, creating a skiplist stack as we go. */ int -__wt_search_insert( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key) +__wt_search_insert(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) { WT_BTREE *btree; WT_COLLATOR *collator; WT_INSERT *ins, **insp, *last_ins; - WT_INSERT_HEAD *inshead; WT_ITEM key; size_t match, skiphigh, skiplow; int cmp, i; btree = S2BT(session); collator = btree->collator; - inshead = cbt->ins_head; cmp = 0; /* -Wuninitialized */ /* @@ -86,7 +83,7 @@ __wt_search_insert( */ match = skiphigh = skiplow = 0; ins = last_ins = NULL; - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) { if ((ins = *insp) == NULL) { cbt->next_stack[i] = NULL; cbt->ins_stack[i--] = insp--; @@ -128,6 +125,7 @@ __wt_search_insert( */ cbt->compare = -cmp; cbt->ins = (ins != NULL) ? ins : last_ins; + cbt->ins_head = ins_head; return (0); } @@ -212,6 +210,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_COLLATOR *collator; WT_DECL_RET; + WT_INSERT_HEAD *ins_head; WT_ITEM *item; WT_PAGE *page; WT_PAGE_INDEX *pindex, *parent_pindex; @@ -276,12 +275,20 @@ __wt_row_search(WT_SESSION_IMPL *session, goto leaf_only; } + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + skiphigh = skiplow = 0; + } + /* Search the internal pages of the tree. */ -restart_root: current = &btree->root; for (depth = 2, pindex = NULL;; ++depth) { parent_pindex = pindex; -restart_page: page = current->page; + page = current->page; if (page->type != WT_PAGE_ROW_INT) break; @@ -418,22 +425,21 @@ restart_page: page = current->page; * page), check for an internal page split race. */ if (pindex->entries == base) { -append: if (parent_pindex != NULL && - __wt_split_intl_race( - session, current->home, parent_pindex)) { - if ((ret = __wt_page_release( - session, current, 0)) != 0) - return (ret); - - skiplow = skiphigh = 0; - goto restart_root; - } +append: if (__wt_split_descent_race( + session, current, parent_pindex)) + goto restart; } descend: /* * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search in the current - * page; otherwise return on error, the swap call ensures we're + * while we're retrieving it, restart the search at the root. + * We cannot restart in the "current" page; for example, if a + * thread is appending to the tree, the page it's waiting for + * did an insert-split into the parent, then the parent split + * into its parent, the name space we are searching for may have + * moved above the current page in the tree. + * + * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ if ((ret = __wt_page_swap( @@ -441,10 +447,8 @@ descend: /* current = descent; continue; } - if (ret == WT_RESTART) { - skiphigh = skiplow = 0; - goto restart_page; - } + if (ret == WT_RESTART) + goto restart; return (ret); } @@ -480,24 +484,18 @@ leaf_only: cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (page->pg_row_entries - 1)); - cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } - WT_ERR( - __wt_search_insert_append(session, cbt, srch_key, &done)); + WT_ERR(__search_insert_append( + session, cbt, ins_head, srch_key, &done)); if (done) return (0); - - /* - * Don't leave the insert list head set, code external to the - * search uses it. - */ - cbt->ins_head = NULL; } /* @@ -590,16 +588,16 @@ leaf_match: cbt->compare = 0; cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->compare = -1; cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1)); - cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } /* If there's no insert list, we're done. */ - if (WT_SKIP_FIRST(cbt->ins_head) == NULL) + if (WT_SKIP_FIRST(ins_head) == NULL) return (0); /* @@ -607,12 +605,12 @@ leaf_match: cbt->compare = 0; * catch cursors repeatedly inserting at a single point. */ if (insert) { - WT_ERR( - __wt_search_insert_append(session, cbt, srch_key, &done)); + WT_ERR(__search_insert_append( + session, cbt, ins_head, srch_key, &done)); if (done) return (0); } - WT_ERR(__wt_search_insert(session, cbt, srch_key)); + WT_ERR(__wt_search_insert(session, cbt, ins_head, srch_key)); return (0); @@ -661,19 +659,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * If the tree is new (and not empty), it might have a large insert * list. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); - - /* + * * Walk down the list until we find a level with at least 50 entries, * that's where we'll start rolling random numbers. The value 50 is * used to ignore levels with only a few entries, that is, levels which * are potentially badly skewed. */ - for (ins_head = cbt->ins_head, - level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { start = &ins_head->head[level]; for (entries = 0, stop = start; *stop != NULL; stop = &(*stop)->next[level]) @@ -768,6 +763,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) ins = ins->next[0]; cbt->ins = ins; + cbt->ins_head = ins_head; cbt->compare = 0; return (0); @@ -787,11 +783,19 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_REF *current, *descent; btree = S2BT(session); + current = NULL; __cursor_pos_clear(cbt); -restart_root: - /* Walk the internal pages of the tree. */ + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + } + + /* Search the internal pages of the tree. */ current = &btree->root; for (;;) { page = current->page; @@ -803,22 +807,19 @@ restart_root: __wt_random(&session->rnd) % pindex->entries]; /* - * Swap the parent page for the child page; return on error, - * the swap function ensures we're holding nothing on failure. + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search at the root. + * + * On other error, simply return, the swap call ensures we're + * holding nothing on failure. */ if ((ret = __wt_page_swap( session, current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } - /* - * Restart is returned if we find a page that's been split; the - * held page isn't discarded when restart is returned, discard - * it and restart the search from the top of the tree. - */ - if (ret == WT_RESTART && - (ret = __wt_page_release(session, current, 0)) == 0) - goto restart_root; + if (ret == WT_RESTART) + goto restart; return (ret); } |