summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Gorrod <alexander.gorrod@mongodb.com>2017-02-13 10:49:24 +1100
committerMichael Cahill <michael.cahill@mongodb.com>2017-02-13 10:49:24 +1100
commit7f5d0f9981214c723f2ed90cf4533887ed406176 (patch)
tree80f5c03f83eab92180c5a6464394e1172dea49fe
parent722b9d1b3da5dbfc4703f41855ae219df3fc6f57 (diff)
downloadmongo-7f5d0f9981214c723f2ed90cf4533887ed406176.tar.gz
WT-3170 Change when eviction walk point is saved, cleanup splits. (#3284)
* Change how eviction walk point is saved during walk. * After 0b9e453, we no longer need to do any non-DIAGNOSTIC work after completing the split (previously, we had changes to make the newly created split pages evictable, but now they are initially given a generation number which will prevent their eviction until it's OK). Rename __split_ref_step2() to be __split_verify_intl(), and change it to verify all of the internal pages involved in the split. Previously, we only verified the pages we had to read and update anyway. Now we don't have to update any pages and we're only reading pages in DIAGNOSTIC mode, verify all of them. Don't release the hazard pointer explicitly, use the more standard __wt_page_release() call (it should make no difference, it's just a bit more consistent). Rename __split_ref_step1() to be __split_ref_prepare(), there's no longer a step #2. * We don't need to publish WT_BTREE.evict_ref, or use a barrier: in one we're guaranteed that only the writing thread will check the assertion in the discard code (that we're not discarding the eviction's reference), and in the other case we're doing hazard-pointer coupling, which implies there is a barrier in the code path before the page can possibly be discarded by any thread. * Review barriers use in splits. (#3288). In all cases, use the pattern "Update the page index, which includes a barrier to make the split live, switch to benign error mode, then verify the pages involved in the split are correct."
-rw-r--r--src/btree/bt_split.c164
-rw-r--r--src/btree/bt_walk.c4
-rw-r--r--src/evict/evict_lru.c49
-rw-r--r--src/include/session.h2
-rw-r--r--src/session/session_api.c5
5 files changed, 103 insertions, 121 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 8122d242666..fcb14be7c76 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -197,7 +197,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
#ifdef HAVE_DIAGNOSTIC
/*
* __split_verify_intl_key_order --
- * Verify the key order on an internal page after a split, diagnostic only.
+ * Verify the key order on an internal page after a split.
*/
static void
__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -249,6 +249,42 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
break;
}
}
+
+/*
+ * __split_verify_intl --
+ * Verify a set of internal pages involved in a split.
+ */
+static int
+__split_verify_intl(WT_SESSION_IMPL *session,
+ WT_PAGE *page1, WT_PAGE *page2, WT_PAGE *pindex_page, bool skip_first)
+{
+ WT_DECL_RET;
+ WT_REF *ref;
+
+ /* The split is complete and live, verify all of the pages involved. */
+ if (page1 != NULL)
+ __split_verify_intl_key_order(session, page1);
+ if (page2 != NULL)
+ __split_verify_intl_key_order(session, page2);
+
+ /* Skip the first slot on non-root internal pages, it's not set. */
+ WT_INTL_FOREACH_BEGIN(session, pindex_page, ref) {
+ if (skip_first) {
+ skip_first = false;
+ continue;
+ }
+ WT_ERR(__wt_page_in(session, ref, WT_READ_NO_EVICT));
+
+ __split_verify_intl_key_order(session, ref->page);
+
+ WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT));
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+
+err: /* Something really bad just happened. */
+ WT_PANIC_RET(session, ret, "fatal error during page split");
+}
#endif
/*
@@ -400,11 +436,11 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
- * __split_ref_step1 --
+ * __split_ref_prepare --
* Prepare a set of WT_REFs for a move.
*/
static void
-__split_ref_step1(WT_SESSION_IMPL *session,
+__split_ref_prepare(WT_SESSION_IMPL *session,
WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
{
WT_PAGE *child;
@@ -470,58 +506,6 @@ __split_ref_step1(WT_SESSION_IMPL *session,
}
/*
- * __split_ref_step2 --
- * Allow the newly created children to be evicted or split.
- */
-static int
-__split_ref_step2(
- WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
-{
- WT_DECL_RET;
- WT_REF *ref;
- uint32_t i;
-
- /*
- * The split has gone live, enable eviction and splits on the newly
- * created internal pages.
- */
- WT_WRITE_BARRIER();
-
- for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
- ref = pindex->index[i];
-
- /*
- * We don't hold hazard pointers on created pages, they cannot
- * be evicted because the page-modify transaction value set as
- * they were created prevents eviction. (See above, we reset
- * that value as part of fixing up the page.) But, an eviction
- * thread might be attempting to evict the page (the WT_REF may
- * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF
- * may be WT_REF_READING), or it may be in some other state.
- * Acquire a hazard pointer for any in-memory pages so we know
- * the state of the page. Ignore pages not in-memory (deleted,
- * on-disk, being read), there's no in-memory structure to fix.
- */
- if ((ret = __wt_page_in(session,
- ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
- continue;
- WT_ERR(ret);
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, ref->page));
-#endif
-
- WT_ERR(__wt_hazard_clear(session, ref));
- }
-
- return (0);
-
-err: /* Something really bad just happened. */
- WT_PANIC_RET(session, ret, "fatal error resolving a split");
-}
-
-/*
* __split_root --
* Split the root page in-memory, deepening the tree.
*/
@@ -657,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
root->pg_intl_split_gen = split_gen;
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, split_gen, false);
+ __split_ref_prepare(session, alloc_index, split_gen, false);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -665,19 +649,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
WT_INTL_INDEX_SET(root, alloc_index);
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, root));
-#endif
- /* Finalize the WT_REFs we moved. */
- WT_ERR(__split_ref_step2(session, alloc_index, false));
+ alloc_index = NULL;
/* The split is complete and correct, ignore benign errors. */
complete = WT_ERR_IGNORE;
- /* We've installed the allocated page-index, ensure error handling. */
- alloc_index = NULL;
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ ret = __split_verify_intl(session, root, NULL, root, false));
+ WT_ERR(ret);
+#endif
/*
* We can't free the previous root's index, there may be threads using
@@ -852,11 +833,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_INTL_INDEX_SET(parent, alloc_index);
alloc_index = NULL;
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
-#endif
-
/*
* If discarding the page's original WT_REF field, reset it to split.
* Threads cursoring through the tree were blocked because that WT_REF
@@ -875,18 +851,27 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
__wt_free(session, ref->page_del);
}
+ /*
+ * Set the discarded WT_REF state to split, ensuring we don't
+ * race with any discard of the WT_REF deleted fields.
+ */
WT_PUBLISH(ref->state, WT_REF_SPLIT);
- }
- /*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
- */
- WT_FULL_BARRIER();
+ /*
+ * Push out the change: not required for correctness, but stops
+ * threads spinning on incorrect page references.
+ */
+ WT_FULL_BARRIER();
+ }
/* The split is complete and correct, ignore benign errors. */
complete = WT_ERR_IGNORE;
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
+#endif
+
/*
* !!!
* Swapping in the new page index released the page for eviction, we can
@@ -1170,34 +1155,27 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
page->pg_intl_split_gen = split_gen;
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, split_gen, true);
+ __split_ref_prepare(session, alloc_index, split_gen, true);
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
alloc_index->entries, parent_incr, false, false));
- /* Confirm the page's index hasn't moved, then update it. */
+ /*
+ * Confirm the page's index hasn't moved, then update it, which makes
+ * the split visible to threads descending the tree.
+ */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
WT_INTL_INDEX_SET(page, replace_index);
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, page));
-#endif
-
- /* Finalize the WT_REFs we moved. */
- WT_ERR(__split_ref_step2(session, alloc_index, true));
-
/* The split is complete and correct, ignore benign errors. */
complete = WT_ERR_IGNORE;
- /*
- * Push out the changes: not required for correctness, but no reason
- * to wait.
- */
- WT_FULL_BARRIER();
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ ret = __split_verify_intl(session, parent, page, page, true));
+ WT_ERR(ret);
+#endif
/*
* We don't care about the page-index we allocated, all we needed was
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 049700952ee..ddaa2e5f70b 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -340,9 +340,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
* Take a copy of any held page and clear the return value. Remember
* the hazard pointer we're currently holding.
*
- * We may be passed a pointer to btree->evict_page that we are clearing
- * here. We check when discarding pages that we're not discarding that
- * page, so this clear must be done before the page is released.
+ * Clear the returned value, it makes future error handling easier.
*/
couple = couple_orig = ref = *refp;
*refp = NULL;
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index db39a5acdee..efe056aee02 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -756,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* Clear a single walk point.
*/
static int
-__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
+__evict_clear_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -773,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
if ((ref = btree->evict_ref) == NULL)
return (0);
- if (count_stat)
- WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
+ WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
/*
- * Clear evict_ref first, in case releasing it forces eviction (we
- * assert we never try to evict the current eviction walk point).
+ * Clear evict_ref before releasing it in case that forces eviction (we
+ * assert that we never try to evict the current eviction walk point).
*/
btree->evict_ref = NULL;
+
WT_WITH_DHANDLE(cache->walk_session, session->dhandle,
(ret = __wt_page_release(cache->walk_session,
ref, WT_READ_NO_EVICT)));
@@ -803,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (WT_PREFIX_MATCH(dhandle->name, "file:"))
WT_WITH_DHANDLE(session, dhandle,
- WT_TRET(__evict_clear_walk(session, true)));
+ WT_TRET(__evict_clear_walk(session)));
return (ret);
}
@@ -848,7 +848,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
/* Clear any existing LRU eviction walk for the file. */
WT_WITH_PASS_LOCK(session,
- ret = __evict_clear_walk(session, true));
+ ret = __evict_clear_walk(session));
(void)__wt_atomic_subv32(&cache->pass_intr, 1);
WT_ERR(ret);
@@ -1662,8 +1662,15 @@ __evict_walk_file(WT_SESSION_IMPL *session,
FLD_SET(walk_flags, WT_READ_PREV);
/*
- * Get some more eviction candidate pages.
- *
+ * Get some more eviction candidate pages, starting at the last saved
+ * point. Clear the saved point immediately, we assert when discarding
+ * pages we're not discarding an eviction point, so this clear must be
+ * complete before the page is released.
+ */
+ ref = btree->evict_ref;
+ btree->evict_ref = NULL;
+
+ /*
* !!! Take care terminating this loop.
*
* Don't make an extra call to __wt_tree_walk after we hit the end of a
@@ -1676,7 +1683,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
for (evict = start, pages_queued = pages_seen = refs_walked = 0;
evict < end && (ret == 0 || ret == WT_NOTFOUND);
ret = __wt_tree_walk_count(
- session, &btree->evict_ref, &refs_walked, walk_flags)) {
+ session, &ref, &refs_walked, walk_flags)) {
/*
* Check whether we're finding a good ratio of candidates vs
* pages seen. Some workloads create "deserts" in trees where
@@ -1690,7 +1697,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
if (give_up)
break;
- if ((ref = btree->evict_ref) == NULL) {
+ if (ref == NULL) {
if (++restarts == 2)
break;
WT_STAT_CONN_INCR(
@@ -1812,6 +1819,8 @@ fast: /* If the page can't be evicted, give up. */
btree->evict_walk_period /= 2;
/*
+ * Give up the walk occasionally.
+ *
* If we happen to end up on the root page or a page requiring urgent
* eviction, clear it. We have to track hazard pointers, and the root
* page complicates that calculation.
@@ -1823,16 +1832,20 @@ fast: /* If the page can't be evicted, give up. */
* If we land on a page requiring forced eviction, move on to the next
* page: we want this page evicted as quickly as possible.
*/
- if ((ref = btree->evict_ref) != NULL) {
- /* Give up the walk occasionally. */
+ if (ref != NULL) {
if (__wt_ref_is_root(ref) || evict == start || give_up ||
ref->page->read_gen == WT_READGEN_OLDEST ||
- ref->page->memory_footprint >= btree->splitmempage)
- WT_RET(__evict_clear_walk(session, restarts == 0));
- else if (ref->page->read_gen == WT_READGEN_OLDEST)
+ ref->page->memory_footprint >= btree->splitmempage) {
+ if (restarts == 0)
+ WT_STAT_CONN_INCR(
+ session, cache_eviction_walks_abandoned);
+ WT_RET(__wt_page_release(cache->walk_session,
+ ref, WT_READ_NO_EVICT));
+ ref = NULL;
+ } else if (ref->page->read_gen == WT_READGEN_OLDEST)
WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
- session, &btree->evict_ref,
- &refs_walked, walk_flags));
+ session, &ref, &refs_walked, walk_flags));
+ btree->evict_ref = ref;
}
WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
diff --git a/src/include/session.h b/src/include/session.h
index 7dd523aea26..085f871a34f 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -52,8 +52,6 @@ struct __wt_session_impl {
const char *lastop; /* Last operation */
uint32_t id; /* UID, offset in session array */
- WT_CONDVAR *cond; /* Condition variable */
-
WT_EVENT_HANDLER *event_handler;/* Application's event handlers */
WT_DATA_HANDLE *dhandle; /* Current data handle */
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 71626e098cb..3a5d06f1b61 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -234,9 +234,6 @@ __session_close(WT_SESSION *wt_session, const char *config)
/* Release common session resources. */
WT_TRET(__wt_session_release_resources(session));
- /* Destroy the thread's mutex. */
- WT_TRET(__wt_cond_destroy(session, &session->cond));
-
/* The API lock protects opening and closing of sessions. */
__wt_spin_lock(session, &conn->api_lock);
@@ -1837,8 +1834,6 @@ __open_session(WT_CONNECTION_IMPL *conn,
session_ret->name = NULL;
session_ret->id = i;
- WT_ERR(__wt_cond_alloc(session, "session", &session_ret->cond));
-
if (WT_SESSION_FIRST_USE(session_ret))
__wt_random_init(&session_ret->rnd);