diff options
author | Keith Bostic <keith.bostic@mongodb.com> | 2015-03-26 20:13:18 -0400 |
---|---|---|
committer | Keith Bostic <keith.bostic@mongodb.com> | 2015-03-26 20:13:18 -0400 |
commit | b6d2f5360519f1356d7184363236e7dee188469c (patch) | |
tree | 4eca77a4683451252ee1ae6ca6e63a671a4ddeed | |
parent | 3e37e1fca16f135a56068996bd37e28165cef0dc (diff) | |
parent | 8d8253a335eded69190a394a0aaea2d942168097 (diff) | |
download | mongo-b6d2f5360519f1356d7184363236e7dee188469c.tar.gz |
Merge pull request #1828 from wiredtiger/split-generation-with-safe
Split generation with safe
-rw-r--r-- | src/btree/bt_debug.c | 7 | ||||
-rw-r--r-- | src/btree/bt_discard.c | 2 | ||||
-rw-r--r-- | src/btree/bt_handle.c | 4 | ||||
-rw-r--r-- | src/btree/bt_page.c | 6 | ||||
-rw-r--r-- | src/btree/bt_slvg.c | 4 | ||||
-rw-r--r-- | src/btree/bt_split.c | 63 | ||||
-rw-r--r-- | src/btree/bt_walk.c | 2 | ||||
-rw-r--r-- | src/btree/col_srch.c | 3 | ||||
-rw-r--r-- | src/btree/row_srch.c | 9 | ||||
-rw-r--r-- | src/include/btmem.h | 23 | ||||
-rw-r--r-- | src/include/btree.i | 14 | ||||
-rw-r--r-- | src/lsm/lsm_cursor.c | 5 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 10 |
13 files changed, 88 insertions, 64 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index fa7cff35e5f..6da5d9ecd16 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -546,8 +546,7 @@ __debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags) session = ds->session; /* Dump the page metadata. */ - WT_WITH_PAGE_INDEX(session, - ret = __debug_page_metadata(ds, page)); + WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, page)); WT_RET(ret); /* Dump the page. */ @@ -600,7 +599,7 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) switch (page->type) { case WT_PAGE_COL_INT: __dmsg(ds, " recno %" PRIu64, page->pg_intl_recno); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; break; case WT_PAGE_COL_FIX: @@ -612,7 +611,7 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) entries = page->pg_var_entries; break; case WT_PAGE_ROW_INT: - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; break; case WT_PAGE_ROW_LEAF: diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 05a54ad643e..f43e936eeda 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -209,7 +209,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) static void __free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page) { - __wt_free_ref_index(session, page, WT_INTL_INDEX_COPY(page), 0); + __wt_free_ref_index(session, page, WT_INTL_INDEX_GET_SAFE(page), 0); } /* diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 1e0f95d3131..1930d6650a8 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -422,7 +422,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) __wt_page_alloc(session, WT_PAGE_COL_INT, 1, 1, 1, &root)); root->pg_intl_parent_ref = &btree->root; - pindex = WT_INTL_INDEX_COPY(root); + pindex = WT_INTL_INDEX_GET_SAFE(root); ref = pindex->index[0]; ref->home = root; ref->page = NULL; @@ -435,7 +435,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, 1, 1, &root)); root->pg_intl_parent_ref = &btree->root; - pindex = WT_INTL_INDEX_COPY(root); + pindex = WT_INTL_INDEX_GET_SAFE(root); ref = pindex->index[0]; ref->home = root; ref->page = NULL; diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 0b93cc981d7..dd7a29347df 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -269,7 +269,7 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, size += sizeof(WT_REF); } if (0) { -err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) { +err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { for (i = 0; i < pindex->entries; ++i) __wt_free(session, pindex->index[i]); __wt_free(session, pindex); @@ -456,7 +456,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) * Walk the page, building references: the page contains value items. * The value items are on-page items (WT_CELL_VALUE). */ - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp++; @@ -591,7 +591,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) * location cookie pairs. Keys are on-page/overflow items and location * cookies are WT_CELL_ADDR_XXX items. */ - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp; diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index ba1802116d0..896ab23f1c2 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1175,7 +1175,7 @@ __slvg_col_build_internal( __wt_page_alloc(session, WT_PAGE_COL_INT, 1, leaf_cnt, 1, &page)); WT_ERR(__slvg_modify_init(session, page)); - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { if ((trk = ss->pages[i]) == NULL) continue; @@ -1820,7 +1820,7 @@ __slvg_row_build_internal( __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, leaf_cnt, 1, &page)); WT_ERR(__slvg_modify_init(session, page)); - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { if ((trk = ss->pages[i]) == NULL) continue; diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 9fc567f02c1..99817eb4d04 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -178,7 +178,13 @@ __split_should_deepen( btree = S2BT(session); page = ref->page; - pindex = WT_INTL_INDEX_COPY(page); + + /* + * Our caller is holding the parent page locked to single-thread splits, + * which means we can safely look at the page's index without setting a + * split generation. + */ + pindex = WT_INTL_INDEX_GET_SAFE(page); /* * Deepen the tree if the page's memory footprint is larger than the @@ -338,7 +344,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) switch (page->type) { case WT_PAGE_COL_INT: recno = 0; - WT_INTL_FOREACH_BEGIN_SAFE(session, page, ref) { + WT_INTL_FOREACH_BEGIN(session, page, ref) { WT_ASSERT(session, ref->key.recno > recno); recno = ref->key.recno; } WT_INTL_FOREACH_END; @@ -350,7 +356,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) WT_CLEAR(_last); first = 1; - WT_INTL_FOREACH_BEGIN_SAFE(session, page, ref) { + WT_INTL_FOREACH_BEGIN(session, page, ref) { __wt_ref_key(page, ref, &next->data, &next->size); if (last->size == 0) { if (first) @@ -393,7 +399,12 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) parent_incr = parent_decr = 0; panic = 0; - pindex = WT_INTL_INDEX_COPY(parent); + /* + * Our caller is holding the parent page locked to single-thread splits, + * which means we can safely look at the page's index without setting a + * split generation. + */ + pindex = WT_INTL_INDEX_GET_SAFE(parent); WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); @@ -491,7 +502,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) * to change. */ child_incr = 0; - child_pindex = WT_INTL_INDEX_COPY(child); + child_pindex = WT_INTL_INDEX_GET_SAFE(child); for (child_refp = child_pindex->index, j = 0; j < slots; ++j) { WT_ERR(__split_ref_deepen_move(session, parent, *parent_refp, &parent_decr, &child_incr)); @@ -505,11 +516,11 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1); /* - * Update the parent's index; this is the update which splits the page, - * making the change visible to threads descending the tree. From now - * on, we're committed to the split. If any subsequent work fails, we - * have to panic because we potentially have threads of control using - * the new page index we just swapped in. + * Confirm the parent page's index hasn't moved, then update it, which + * makes the split visible to threads descending the tree. From this + * point on, we're committed to the split. If subsequent work fails, + * we have to panic because we may have threads of control using the + * new page index we swap in. * * A note on error handling: until this point, there's no problem with * unwinding on error. We allocated a new page index, a new set of @@ -518,13 +529,14 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) * footprint. From now on we've modified the parent page, attention * needs to be paid. */ - WT_ASSERT(session, WT_INTL_INDEX_COPY(parent) == pindex); + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); panic = 1; #ifdef HAVE_DIAGNOSTIC - __split_verify_intl_key_order(session, parent); + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); #endif /* @@ -555,9 +567,16 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) if (!WT_PAGE_IS_INTERNAL(child)) continue; #ifdef HAVE_DIAGNOSTIC - __split_verify_intl_key_order(session, child); + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, child)); #endif - WT_INTL_FOREACH_BEGIN_SAFE(session, child, child_ref) { + /* + * We have the parent locked, but there's nothing to prevent + * this child from splitting beneath us; ensure that reading + * the child's page index structure is safe. + */ + WT_ENTER_PAGE_INDEX(session); + WT_INTL_FOREACH_BEGIN(session, child, child_ref) { /* * The page's parent reference may not be wrong, as we * opened up access from the top of the tree already, @@ -570,6 +589,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) child_ref->ref_hint = 0; } } WT_INTL_FOREACH_END; + WT_LEAVE_PAGE_INDEX(session); } /* @@ -848,7 +868,11 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, hazard = 1; } - pindex = WT_INTL_INDEX_COPY(parent); + /* + * We've locked the parent above, which means it cannot split (which is + * the only reason to worry about split generation values). + */ + pindex = WT_INTL_INDEX_GET_SAFE(parent); parent_entries = pindex->entries; /* @@ -904,16 +928,17 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, } /* - * Update the parent page's index: this update makes the split visible - * to threads descending the tree. + * Confirm the parent page's index hasn't moved then update it, which + * makes the split visible to threads descending the tree. */ - WT_ASSERT(session, WT_INTL_INDEX_COPY(parent) == pindex); + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); alloc_index = NULL; #ifdef HAVE_DIAGNOSTIC - __split_verify_intl_key_order(session, parent); + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); #endif /* diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 98411ce548d..fd25efec97e 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -272,7 +272,7 @@ descend: couple = ref; page = ref->page; if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); slot = prev ? pindex->entries - 1 : 0; } else { *refp = ref; diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index db1b565b439..4ebdde8674c 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -49,8 +49,7 @@ restart: page = current->page; WT_ASSERT(session, current->key.recno == page->pg_intl_recno); - WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); base = pindex->entries; descent = pindex->index[base - 1]; diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 9967c5ecb0c..d6179d08b55 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -195,8 +195,7 @@ restart: page = current->page; if (page->type != WT_PAGE_ROW_INT) break; - WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); /* * Fast-path internal pages with one child, a common case for @@ -488,8 +487,7 @@ restart: if (page->type != WT_PAGE_ROW_INT) break; - WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); descent = pindex->index[ __wt_random(session->rnd) % pindex->entries]; @@ -523,8 +521,7 @@ restart: */ cbt->ref = current; cbt->compare = 0; - WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(btree->root.page); + WT_INTL_INDEX_GET(session, btree->root.page, pindex); cbt->slot = pindex->entries < 2 ? __wt_random(session->rnd) % page->pg_row_entries : 0; diff --git a/src/include/btmem.h b/src/include/btmem.h index cda672bc7b4..e9b6b5a1d6e 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -412,8 +412,17 @@ struct __wt_page { /* * Macros to copy/set the index because the name is obscured to ensure * the field isn't read multiple times. + * + * There are two versions of WT_INTL_INDEX_GET because the session split + * generation is usually set, but it's not always required: for example, + * if a page is locked for splitting, or being created or destroyed. */ -#define WT_INTL_INDEX_COPY(page) ((page)->u.intl.__index) +#define WT_INTL_INDEX_GET_SAFE(page) \ + ((page)->u.intl.__index) +#define WT_INTL_INDEX_GET(session, page, pindex) do { \ + WT_ASSERT(session, session->split_gen != 0); \ + (pindex) = WT_INTL_INDEX_GET_SAFE(page); \ +} while (0) #define WT_INTL_INDEX_SET(page, v) do { \ WT_WRITE_BARRIER(); \ ((page)->u.intl.__index) = (v); \ @@ -421,21 +430,15 @@ struct __wt_page { /* * Macro to walk the list of references in an internal page. - * Two flavors: by default, check that we have a split_gen, but - * provide a "SAFE" version for code that can safely read the - * page index without a split_gen. */ -#define WT_INTL_FOREACH_BEGIN_SAFE(session, page, ref) do { \ +#define WT_INTL_FOREACH_BEGIN(session, page, ref) do { \ WT_PAGE_INDEX *__pindex; \ WT_REF **__refp; \ uint32_t __entries; \ - for (__pindex = WT_INTL_INDEX_COPY(page), \ - __refp = __pindex->index, \ + WT_INTL_INDEX_GET(session, page, __pindex); \ + for (__refp = __pindex->index, \ __entries = __pindex->entries; __entries > 0; --__entries) {\ (ref) = *__refp++; -#define WT_INTL_FOREACH_BEGIN(session, page, ref) \ - WT_ASSERT(session, session->split_gen != 0); \ - WT_INTL_FOREACH_BEGIN_SAFE(session, page, ref) #define WT_INTL_FOREACH_END \ } \ } while (0) diff --git a/src/include/btree.i b/src/include/btree.i index 7d9a3095a0c..dfb9cbfe37d 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -279,13 +279,11 @@ __wt_page_refp(WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex; uint32_t i; - WT_ASSERT(session, session->split_gen != 0); - /* * Copy the parent page's index value: the page can split at any time, * but the index's value is always valid, even if it's not up-to-date. */ -retry: pindex = WT_INTL_INDEX_COPY(ref->home); +retry: WT_INTL_INDEX_GET(session, ref->home, pindex); /* * Use the page's reference hint: it should be correct unless the page @@ -1229,13 +1227,13 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session) } /* - * __wt_btree_size_overflow -- - * Check if the size of an in-memory tree with a single leaf page is over + * __wt_btree_lsm_size -- + * Return if the size of an in-memory tree with a single leaf page is over * a specified maximum. If called on anything other than a simple tree with a - * single leaf page, returns true so the calling code will switch to a new tree. + * single leaf page, returns true so our LSM caller will switch to a new tree. */ static inline int -__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize) +__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) { WT_BTREE *btree; WT_PAGE *child, *root; @@ -1254,7 +1252,7 @@ __wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize) return (1); /* Check for a tree with a single leaf page. */ - pindex = WT_INTL_INDEX_COPY(root); + WT_INTL_INDEX_GET(session, root, pindex); if (pindex->entries != 1) /* > 1 child page, switch */ return (1); diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index a18269baa28..a9825b45d7a 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -97,10 +97,11 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) hard_limit = F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH) ? 1 : 0; if (have_primary) { + WT_ENTER_PAGE_INDEX(session); WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree, - ovfl = __wt_btree_size_overflow( - session, hard_limit ? + ovfl = __wt_btree_lsm_size(session, hard_limit ? 2 * lsm_tree->chunk_size : lsm_tree->chunk_size)); + WT_LEAVE_PAGE_INDEX(session); /* If there was no overflow, we're done. */ if (!ovfl) diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 33d79e6d4ce..5782c701c26 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -508,8 +508,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) WT_ILLEGAL_VALUE(session); } - WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(next); + WT_INTL_INDEX_GET(session, next, pindex); for (i = 0; i < mod->mod_multi_entries; ++i) { WT_ERR(__wt_multi_to_ref(session, next, &mod->mod_multi[i], &pindex->index[i], NULL)); @@ -2931,8 +2930,11 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET_MSG(session, EINVAL, "bulk-load is only possible for newly created trees"); - /* Get a reference to the empty leaf page. */ - pindex = WT_INTL_INDEX_COPY(btree->root.page); + /* + * Get a reference to the empty leaf page; we have exclusive access so + * we can take a copy of the page, confident the parent won't split. + */ + pindex = WT_INTL_INDEX_GET_SAFE(btree->root.page); cbulk->ref = pindex->index[0]; cbulk->leaf = cbulk->ref->page; |