summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Keith Bostic <keith.bostic@mongodb.com> 2015-03-26 20:13:18 -0400
committer Keith Bostic <keith.bostic@mongodb.com> 2015-03-26 20:13:18 -0400
commit b6d2f5360519f1356d7184363236e7dee188469c (patch)
tree 4eca77a4683451252ee1ae6ca6e63a671a4ddeed
parent 3e37e1fca16f135a56068996bd37e28165cef0dc (diff)
parent 8d8253a335eded69190a394a0aaea2d942168097 (diff)
download mongo-b6d2f5360519f1356d7184363236e7dee188469c.tar.gz
Merge pull request #1828 from wiredtiger/split-generation-with-safe
Split generation with safe
-rw-r--r-- src/btree/bt_debug.c 7
-rw-r--r-- src/btree/bt_discard.c 2
-rw-r--r-- src/btree/bt_handle.c 4
-rw-r--r-- src/btree/bt_page.c 6
-rw-r--r-- src/btree/bt_slvg.c 4
-rw-r--r-- src/btree/bt_split.c 63
-rw-r--r-- src/btree/bt_walk.c 2
-rw-r--r-- src/btree/col_srch.c 3
-rw-r--r-- src/btree/row_srch.c 9
-rw-r--r-- src/include/btmem.h 23
-rw-r--r-- src/include/btree.i 14
-rw-r--r-- src/lsm/lsm_cursor.c 5
-rw-r--r-- src/reconcile/rec_write.c 10
13 files changed, 88 insertions, 64 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index fa7cff35e5f..6da5d9ecd16 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -546,8 +546,7 @@ __debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
session = ds->session;
/* Dump the page metadata. */
- WT_WITH_PAGE_INDEX(session,
- ret = __debug_page_metadata(ds, page));
+ WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, page));
WT_RET(ret);
/* Dump the page. */
@@ -600,7 +599,7 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
switch (page->type) {
case WT_PAGE_COL_INT:
__dmsg(ds, " recno %" PRIu64, page->pg_intl_recno);
- pindex = WT_INTL_INDEX_COPY(page);
+ WT_INTL_INDEX_GET(session, page, pindex);
entries = pindex->entries;
break;
case WT_PAGE_COL_FIX:
@@ -612,7 +611,7 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
entries = page->pg_var_entries;
break;
case WT_PAGE_ROW_INT:
- pindex = WT_INTL_INDEX_COPY(page);
+ WT_INTL_INDEX_GET(session, page, pindex);
entries = pindex->entries;
break;
case WT_PAGE_ROW_LEAF:
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 05a54ad643e..f43e936eeda 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -209,7 +209,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
static void
__free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- __wt_free_ref_index(session, page, WT_INTL_INDEX_COPY(page), 0);
+ __wt_free_ref_index(session, page, WT_INTL_INDEX_GET_SAFE(page), 0);
}
/*
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 1e0f95d3131..1930d6650a8 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -422,7 +422,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
__wt_page_alloc(session, WT_PAGE_COL_INT, 1, 1, 1, &root));
root->pg_intl_parent_ref = &btree->root;
- pindex = WT_INTL_INDEX_COPY(root);
+ pindex = WT_INTL_INDEX_GET_SAFE(root);
ref = pindex->index[0];
ref->home = root;
ref->page = NULL;
@@ -435,7 +435,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
__wt_page_alloc(session, WT_PAGE_ROW_INT, 0, 1, 1, &root));
root->pg_intl_parent_ref = &btree->root;
- pindex = WT_INTL_INDEX_COPY(root);
+ pindex = WT_INTL_INDEX_GET_SAFE(root);
ref = pindex->index[0];
ref->home = root;
ref->page = NULL;
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 0b93cc981d7..dd7a29347df 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -269,7 +269,7 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
size += sizeof(WT_REF);
}
if (0) {
-err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) {
+err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) {
for (i = 0; i < pindex->entries; ++i)
__wt_free(session, pindex->index[i]);
__wt_free(session, pindex);
@@ -456,7 +456,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
* Walk the page, building references: the page contains value items.
* The value items are on-page items (WT_CELL_VALUE).
*/
- pindex = WT_INTL_INDEX_COPY(page);
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp++;
@@ -591,7 +591,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
* location cookie pairs. Keys are on-page/overflow items and location
* cookies are WT_CELL_ADDR_XXX items.
*/
- pindex = WT_INTL_INDEX_COPY(page);
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp;
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index ba1802116d0..896ab23f1c2 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1175,7 +1175,7 @@ __slvg_col_build_internal(
__wt_page_alloc(session, WT_PAGE_COL_INT, 1, leaf_cnt, 1, &page));
WT_ERR(__slvg_modify_init(session, page));
- pindex = WT_INTL_INDEX_COPY(page);
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
if ((trk = ss->pages[i]) == NULL)
continue;
@@ -1820,7 +1820,7 @@ __slvg_row_build_internal(
__wt_page_alloc(session, WT_PAGE_ROW_INT, 0, leaf_cnt, 1, &page));
WT_ERR(__slvg_modify_init(session, page));
- pindex = WT_INTL_INDEX_COPY(page);
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
if ((trk = ss->pages[i]) == NULL)
continue;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 9fc567f02c1..99817eb4d04 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -178,7 +178,13 @@ __split_should_deepen(
btree = S2BT(session);
page = ref->page;
- pindex = WT_INTL_INDEX_COPY(page);
+
+ /*
+ * Our caller is holding the parent page locked to single-thread splits,
+ * which means we can safely look at the page's index without setting a
+ * split generation.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
/*
* Deepen the tree if the page's memory footprint is larger than the
@@ -338,7 +344,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
switch (page->type) {
case WT_PAGE_COL_INT:
recno = 0;
- WT_INTL_FOREACH_BEGIN_SAFE(session, page, ref) {
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
WT_ASSERT(session, ref->key.recno > recno);
recno = ref->key.recno;
} WT_INTL_FOREACH_END;
@@ -350,7 +356,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_CLEAR(_last);
first = 1;
- WT_INTL_FOREACH_BEGIN_SAFE(session, page, ref) {
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
__wt_ref_key(page, ref, &next->data, &next->size);
if (last->size == 0) {
if (first)
@@ -393,7 +399,12 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
parent_incr = parent_decr = 0;
panic = 0;
- pindex = WT_INTL_INDEX_COPY(parent);
+ /*
+ * Our caller is holding the parent page locked to single-thread splits,
+ * which means we can safely look at the page's index without setting a
+ * split generation.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(parent);
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
@@ -491,7 +502,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
* to change.
*/
child_incr = 0;
- child_pindex = WT_INTL_INDEX_COPY(child);
+ child_pindex = WT_INTL_INDEX_GET_SAFE(child);
for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
WT_ERR(__split_ref_deepen_move(session,
parent, *parent_refp, &parent_decr, &child_incr));
@@ -505,11 +516,11 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1);
/*
- * Update the parent's index; this is the update which splits the page,
- * making the change visible to threads descending the tree. From now
- * on, we're committed to the split. If any subsequent work fails, we
- * have to panic because we potentially have threads of control using
- * the new page index we just swapped in.
+ * Confirm the parent page's index hasn't moved, then update it, which
+ * makes the split visible to threads descending the tree. From this
+ * point on, we're committed to the split. If subsequent work fails,
+ * we have to panic because we may have threads of control using the
+ * new page index we swap in.
*
* A note on error handling: until this point, there's no problem with
* unwinding on error. We allocated a new page index, a new set of
@@ -518,13 +529,14 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
* footprint. From now on we've modified the parent page, attention
* needs to be paid.
*/
- WT_ASSERT(session, WT_INTL_INDEX_COPY(parent) == pindex);
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
panic = 1;
#ifdef HAVE_DIAGNOSTIC
- __split_verify_intl_key_order(session, parent);
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
#endif
/*
@@ -555,9 +567,16 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
if (!WT_PAGE_IS_INTERNAL(child))
continue;
#ifdef HAVE_DIAGNOSTIC
- __split_verify_intl_key_order(session, child);
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, child));
#endif
- WT_INTL_FOREACH_BEGIN_SAFE(session, child, child_ref) {
+ /*
+ * We have the parent locked, but there's nothing to prevent
+ * this child from splitting beneath us; ensure that reading
+ * the child's page index structure is safe.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
/*
* The page's parent reference may not be wrong, as we
* opened up access from the top of the tree already,
@@ -570,6 +589,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
child_ref->ref_hint = 0;
}
} WT_INTL_FOREACH_END;
+ WT_LEAVE_PAGE_INDEX(session);
}
/*
@@ -848,7 +868,11 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
hazard = 1;
}
- pindex = WT_INTL_INDEX_COPY(parent);
+ /*
+ * We've locked the parent above, which means it cannot split (which is
+ * the only reason to worry about split generation values).
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(parent);
parent_entries = pindex->entries;
/*
@@ -904,16 +928,17 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
}
/*
- * Update the parent page's index: this update makes the split visible
- * to threads descending the tree.
+ * Confirm the parent page's index hasn't moved then update it, which
+ * makes the split visible to threads descending the tree.
*/
- WT_ASSERT(session, WT_INTL_INDEX_COPY(parent) == pindex);
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
alloc_index = NULL;
#ifdef HAVE_DIAGNOSTIC
- __split_verify_intl_key_order(session, parent);
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
#endif
/*
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 98411ce548d..fd25efec97e 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -272,7 +272,7 @@ descend: couple = ref;
page = ref->page;
if (page->type == WT_PAGE_ROW_INT ||
page->type == WT_PAGE_COL_INT) {
- pindex = WT_INTL_INDEX_COPY(page);
+ WT_INTL_INDEX_GET(session, page, pindex);
slot = prev ? pindex->entries - 1 : 0;
} else {
*refp = ref;
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index db1b565b439..4ebdde8674c 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -49,8 +49,7 @@ restart: page = current->page;
WT_ASSERT(session, current->key.recno == page->pg_intl_recno);
- WT_ASSERT(session, session->split_gen != 0);
- pindex = WT_INTL_INDEX_COPY(page);
+ WT_INTL_INDEX_GET(session, page, pindex);
base = pindex->entries;
descent = pindex->index[base - 1];
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 9967c5ecb0c..d6179d08b55 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -195,8 +195,7 @@ restart: page = current->page;
if (page->type != WT_PAGE_ROW_INT)
break;
- WT_ASSERT(session, session->split_gen != 0);
- pindex = WT_INTL_INDEX_COPY(page);
+ WT_INTL_INDEX_GET(session, page, pindex);
/*
* Fast-path internal pages with one child, a common case for
@@ -488,8 +487,7 @@ restart:
if (page->type != WT_PAGE_ROW_INT)
break;
- WT_ASSERT(session, session->split_gen != 0);
- pindex = WT_INTL_INDEX_COPY(page);
+ WT_INTL_INDEX_GET(session, page, pindex);
descent = pindex->index[
__wt_random(session->rnd) % pindex->entries];
@@ -523,8 +521,7 @@ restart:
*/
cbt->ref = current;
cbt->compare = 0;
- WT_ASSERT(session, session->split_gen != 0);
- pindex = WT_INTL_INDEX_COPY(btree->root.page);
+ WT_INTL_INDEX_GET(session, btree->root.page, pindex);
cbt->slot = pindex->entries < 2 ?
__wt_random(session->rnd) % page->pg_row_entries : 0;
diff --git a/src/include/btmem.h b/src/include/btmem.h
index cda672bc7b4..e9b6b5a1d6e 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -412,8 +412,17 @@ struct __wt_page {
/*
* Macros to copy/set the index because the name is obscured to ensure
* the field isn't read multiple times.
+ *
+ * There are two versions of WT_INTL_INDEX_GET because the session split
+ * generation is usually set, but it's not always required: for example,
+ * if a page is locked for splitting, or being created or destroyed.
*/
-#define WT_INTL_INDEX_COPY(page) ((page)->u.intl.__index)
+#define WT_INTL_INDEX_GET_SAFE(page) \
+ ((page)->u.intl.__index)
+#define WT_INTL_INDEX_GET(session, page, pindex) do { \
+ WT_ASSERT(session, session->split_gen != 0); \
+ (pindex) = WT_INTL_INDEX_GET_SAFE(page); \
+} while (0)
#define WT_INTL_INDEX_SET(page, v) do { \
WT_WRITE_BARRIER(); \
((page)->u.intl.__index) = (v); \
@@ -421,21 +430,15 @@ struct __wt_page {
/*
* Macro to walk the list of references in an internal page.
- * Two flavors: by default, check that we have a split_gen, but
- * provide a "SAFE" version for code that can safely read the
- * page index without a split_gen.
*/
-#define WT_INTL_FOREACH_BEGIN_SAFE(session, page, ref) do { \
+#define WT_INTL_FOREACH_BEGIN(session, page, ref) do { \
WT_PAGE_INDEX *__pindex; \
WT_REF **__refp; \
uint32_t __entries; \
- for (__pindex = WT_INTL_INDEX_COPY(page), \
- __refp = __pindex->index, \
+ WT_INTL_INDEX_GET(session, page, __pindex); \
+ for (__refp = __pindex->index, \
__entries = __pindex->entries; __entries > 0; --__entries) {\
(ref) = *__refp++;
-#define WT_INTL_FOREACH_BEGIN(session, page, ref) \
- WT_ASSERT(session, session->split_gen != 0); \
- WT_INTL_FOREACH_BEGIN_SAFE(session, page, ref)
#define WT_INTL_FOREACH_END \
} \
} while (0)
diff --git a/src/include/btree.i b/src/include/btree.i
index 7d9a3095a0c..dfb9cbfe37d 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -279,13 +279,11 @@ __wt_page_refp(WT_SESSION_IMPL *session,
WT_PAGE_INDEX *pindex;
uint32_t i;
- WT_ASSERT(session, session->split_gen != 0);
-
/*
* Copy the parent page's index value: the page can split at any time,
* but the index's value is always valid, even if it's not up-to-date.
*/
-retry: pindex = WT_INTL_INDEX_COPY(ref->home);
+retry: WT_INTL_INDEX_GET(session, ref->home, pindex);
/*
* Use the page's reference hint: it should be correct unless the page
@@ -1229,13 +1227,13 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session)
}
/*
- * __wt_btree_size_overflow --
- * Check if the size of an in-memory tree with a single leaf page is over
+ * __wt_btree_lsm_size --
+ * Return if the size of an in-memory tree with a single leaf page is over
* a specified maximum. If called on anything other than a simple tree with a
- * single leaf page, returns true so the calling code will switch to a new tree.
+ * single leaf page, returns true so our LSM caller will switch to a new tree.
*/
static inline int
-__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize)
+__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
{
WT_BTREE *btree;
WT_PAGE *child, *root;
@@ -1254,7 +1252,7 @@ __wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize)
return (1);
/* Check for a tree with a single leaf page. */
- pindex = WT_INTL_INDEX_COPY(root);
+ WT_INTL_INDEX_GET(session, root, pindex);
if (pindex->entries != 1) /* > 1 child page, switch */
return (1);
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index a18269baa28..a9825b45d7a 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -97,10 +97,11 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
hard_limit = F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH) ? 1 : 0;
if (have_primary) {
+ WT_ENTER_PAGE_INDEX(session);
WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
- ovfl = __wt_btree_size_overflow(
- session, hard_limit ?
+ ovfl = __wt_btree_lsm_size(session, hard_limit ?
2 * lsm_tree->chunk_size : lsm_tree->chunk_size));
+ WT_LEAVE_PAGE_INDEX(session);
/* If there was no overflow, we're done. */
if (!ovfl)
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 33d79e6d4ce..5782c701c26 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -508,8 +508,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
WT_ILLEGAL_VALUE(session);
}
- WT_ASSERT(session, session->split_gen != 0);
- pindex = WT_INTL_INDEX_COPY(next);
+ WT_INTL_INDEX_GET(session, next, pindex);
for (i = 0; i < mod->mod_multi_entries; ++i) {
WT_ERR(__wt_multi_to_ref(session,
next, &mod->mod_multi[i], &pindex->index[i], NULL));
@@ -2931,8 +2930,11 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET_MSG(session, EINVAL,
"bulk-load is only possible for newly created trees");
- /* Get a reference to the empty leaf page. */
- pindex = WT_INTL_INDEX_COPY(btree->root.page);
+ /*
+ * Get a reference to the empty leaf page; we have exclusive access so
+ * we can take a copy of the page, confident the parent won't split.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(btree->root.page);
cbulk->ref = pindex->index[0];
cbulk->leaf = cbulk->ref->page;