summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Keith Bostic <keith.bostic@mongodb.com> 2017-02-09 09:15:15 -0500
committer: GitHub <noreply@github.com> 2017-02-09 09:15:15 -0500
commit: 0b9e4534b2e01a7bf3dec00c91d6f38dfbcc0dd0 (patch)
tree: f7c0bd56aca2b0facdedad02eb6b945c022ef70d
parent: 15b7658a380e374e627b86e7629c8fad3ef349dc (diff)
download: mongo-0b9e4534b2e01a7bf3dec00c91d6f38dfbcc0dd0.tar.gz
WT-3088 bug: WiredTiger can evict the tree's current eviction walk point (#3280)
WT-3088 bug: WiredTiger can evict the tree's current eviction walk point
-rw-r--r-- src/btree/bt_debug.c 2
-rw-r--r-- src/btree/bt_split.c 74
-rw-r--r-- src/include/btmem.h 8
-rw-r--r-- src/include/btree.i 4
-rw-r--r-- src/include/extern.h 1
5 files changed, 47 insertions, 42 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index a89eca230fd..d664da2ebd3 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -699,8 +699,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_RET(ds->f(ds, ", evict-lru"));
if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS))
WT_RET(ds->f(ds, ", overflow-keys"));
- if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
- WT_RET(ds->f(ds, ", split-block"));
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
WT_RET(ds->f(ds, ", split-insert"));
if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE))
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 7cfcd08f931..8122d242666 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -54,6 +54,16 @@ __split_oldest_gen(WT_SESSION_IMPL *session)
}
/*
+ * __wt_split_obsolete --
+ * Check if it is safe to free / evict based on split generation.
+ */
+bool
+__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen)
+{
+ return (split_gen < __split_oldest_gen(session));
+}
+
+/*
* __split_stash_add --
* Add a new entry into the session's split stash list.
*/
@@ -394,8 +404,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
* Prepare a set of WT_REFs for a move.
*/
static void
-__split_ref_step1(
- WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
+__split_ref_step1(WT_SESSION_IMPL *session,
+ WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
{
WT_PAGE *child;
WT_REF *child_ref, *ref;
@@ -418,30 +428,25 @@ __split_ref_step1(
child = ref->page;
/*
- * Block eviction and splits in newly created pages.
+ * Block eviction in newly created pages.
*
* Once the split is live, newly created internal pages might be
* evicted and their WT_REF structures freed. If that happened
* before all threads exit the index of the page that previously
* "owned" the WT_REF, a thread might see a freed WT_REF. To
- * ensure that doesn't happen, the newly created page's modify
- * structure has a field with a transaction ID that's checked
- * before any internal page is evicted. Unfortunately, we don't
- * know the correct value until we update the original page's
- * index (we need a transaction ID from after that update), but
- * the act of updating the original page's index is what allows
- * the eviction to happen.
+ * ensure that doesn't happen, the newly created page contains
+ * the current split generation and can't be evicted until
+ * all readers have left the old generation.
*
- * Split blocking was because historic versions of the split
- * code didn't update the WT_REF.home field until after the
- * split was live, so the WT_REF.home fields being updated could
- * split again before the update, there's a race between splits
- * as to which would update them first. The current code updates
- * the WT_REF.home fields before going live (in this function),
- * this shouldn't be an issue, but for now splits remain turned
- * off.
+ * Historically, we also blocked splits in newly created pages
+ * because we didn't update the WT_REF.home field until after
+ * the split was live, so the WT_REF.home fields being updated
+ * could split again before the update, and there was a race
+ * between splits as to which would update them first. The
+ * current code updates the WT_REF.home fields before going live
+ * (in this function), so this isn't an issue.
*/
- F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+ child->pg_intl_split_gen = split_gen;
/*
* We use a page flag to prevent the child from splitting from
@@ -473,7 +478,6 @@ __split_ref_step2(
WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_DECL_RET;
- WT_PAGE *child;
WT_REF *ref;
uint32_t i;
@@ -503,14 +507,9 @@ __split_ref_step2(
continue;
WT_ERR(ret);
- child = ref->page;
-
- /* The child can now be evicted or split. */
- F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
-
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, child));
+ __split_verify_intl_key_order(session, ref->page));
#endif
WT_ERR(__wt_hazard_clear(session, ref));
@@ -653,8 +652,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the root page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ root->pg_intl_split_gen = split_gen;
+
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, false);
+ __split_ref_step1(session, alloc_index, split_gen, false);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -686,7 +689,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
* fails, we don't roll back that change, because threads may already
* be using the new index.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
root_decr += size;
@@ -838,6 +840,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the parent page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ parent->pg_intl_split_gen = split_gen;
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -908,7 +914,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*
* Acquire a new split generation.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
next_ref = pindex->index[deleted_refs[i]];
WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
@@ -1160,8 +1165,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Get a generation for this split, mark the page. */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ page->pg_intl_split_gen = split_gen;
+
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, true);
+ __split_ref_step1(session, alloc_index, split_gen, true);
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
@@ -1207,7 +1216,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
* back that change, because threads may already be using the new parent
* page.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
page_decr += size;
@@ -1284,10 +1292,6 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
for (;;) {
parent = ref->home;
- /* Skip pages that aren't ready to split. */
- if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK))
- return (EBUSY);
-
if (trylock)
WT_RET(__wt_try_writelock(session, &parent->page_lock));
else
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 43c1a309d52..39ca223aebf 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -483,6 +483,7 @@ struct __wt_page {
*/
struct {
WT_REF *parent_ref; /* Parent reference */
+ uint64_t split_gen; /* Generation of last split */
struct __wt_page_index {
uint32_t entries;
@@ -492,6 +493,8 @@ struct __wt_page {
} intl;
#undef pg_intl_parent_ref
#define pg_intl_parent_ref u.intl.parent_ref
+#undef pg_intl_split_gen
+#define pg_intl_split_gen u.intl.split_gen
/*
* Macros to copy/set the index because the name is obscured to ensure
@@ -593,9 +596,8 @@ struct __wt_page {
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
-#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */
-#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
-#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */
+#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
+#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
uint8_t unused[2]; /* Unused padding */
diff --git a/src/include/btree.i b/src/include/btree.i
index 378d93dd2ee..315efa86fa6 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -1348,8 +1348,8 @@ __wt_page_can_evict(
* discards its WT_REF array, and a thread traversing the original
* parent page index might see a freed WT_REF.
*/
- if (WT_PAGE_IS_INTERNAL(page) &&
- F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
+ if (WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete(
+ session, page->pg_intl_split_gen))
return (false);
/*
diff --git a/src/include/extern.h b/src/include/extern.h
index 863d2a02861..836a7cb1ae6 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -160,6 +160,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern bool __wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));