summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/btree/bt_split.c164
-rw-r--r--src/btree/bt_walk.c4
-rw-r--r--src/evict/evict_lru.c49
-rw-r--r--src/include/session.h2
-rw-r--r--src/session/session_api.c5
5 files changed, 103 insertions, 121 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 8122d242666..fcb14be7c76 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -197,7 +197,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
#ifdef HAVE_DIAGNOSTIC
/*
* __split_verify_intl_key_order --
- * Verify the key order on an internal page after a split, diagnostic only.
+ * Verify the key order on an internal page after a split.
*/
static void
__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -249,6 +249,42 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
break;
}
}
+
+/*
+ * __split_verify_intl --
+ * Verify a set of internal pages involved in a split.
+ */
+static int
+__split_verify_intl(WT_SESSION_IMPL *session,
+ WT_PAGE *page1, WT_PAGE *page2, WT_PAGE *pindex_page, bool skip_first)
+{
+ WT_DECL_RET;
+ WT_REF *ref;
+
+ /* The split is complete and live, verify all of the pages involved. */
+ if (page1 != NULL)
+ __split_verify_intl_key_order(session, page1);
+ if (page2 != NULL)
+ __split_verify_intl_key_order(session, page2);
+
+ /* Skip the first slot on non-root internal pages, it's not set. */
+ WT_INTL_FOREACH_BEGIN(session, pindex_page, ref) {
+ if (skip_first) {
+ skip_first = false;
+ continue;
+ }
+ WT_ERR(__wt_page_in(session, ref, WT_READ_NO_EVICT));
+
+ __split_verify_intl_key_order(session, ref->page);
+
+ WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT));
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+
+err: /* Something really bad just happened. */
+ WT_PANIC_RET(session, ret, "fatal error during page split");
+}
#endif
/*
@@ -400,11 +436,11 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
- * __split_ref_step1 --
+ * __split_ref_prepare --
* Prepare a set of WT_REFs for a move.
*/
static void
-__split_ref_step1(WT_SESSION_IMPL *session,
+__split_ref_prepare(WT_SESSION_IMPL *session,
WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
{
WT_PAGE *child;
@@ -470,58 +506,6 @@ __split_ref_step1(WT_SESSION_IMPL *session,
}
/*
- * __split_ref_step2 --
- * Allow the newly created children to be evicted or split.
- */
-static int
-__split_ref_step2(
- WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
-{
- WT_DECL_RET;
- WT_REF *ref;
- uint32_t i;
-
- /*
- * The split has gone live, enable eviction and splits on the newly
- * created internal pages.
- */
- WT_WRITE_BARRIER();
-
- for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
- ref = pindex->index[i];
-
- /*
- * We don't hold hazard pointers on created pages, they cannot
- * be evicted because the page-modify transaction value set as
- * they were created prevents eviction. (See above, we reset
- * that value as part of fixing up the page.) But, an eviction
- * thread might be attempting to evict the page (the WT_REF may
- * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF
- * may be WT_REF_READING), or it may be in some other state.
- * Acquire a hazard pointer for any in-memory pages so we know
- * the state of the page. Ignore pages not in-memory (deleted,
- * on-disk, being read), there's no in-memory structure to fix.
- */
- if ((ret = __wt_page_in(session,
- ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
- continue;
- WT_ERR(ret);
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, ref->page));
-#endif
-
- WT_ERR(__wt_hazard_clear(session, ref));
- }
-
- return (0);
-
-err: /* Something really bad just happened. */
- WT_PANIC_RET(session, ret, "fatal error resolving a split");
-}
-
-/*
* __split_root --
* Split the root page in-memory, deepening the tree.
*/
@@ -657,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
root->pg_intl_split_gen = split_gen;
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, split_gen, false);
+ __split_ref_prepare(session, alloc_index, split_gen, false);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -665,19 +649,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
WT_INTL_INDEX_SET(root, alloc_index);
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, root));
-#endif
- /* Finalize the WT_REFs we moved. */
- WT_ERR(__split_ref_step2(session, alloc_index, false));
+ alloc_index = NULL;
/* The split is complete and correct, ignore benign errors. */
complete = WT_ERR_IGNORE;
- /* We've installed the allocated page-index, ensure error handling. */
- alloc_index = NULL;
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ ret = __split_verify_intl(session, root, NULL, root, false));
+ WT_ERR(ret);
+#endif
/*
* We can't free the previous root's index, there may be threads using
@@ -852,11 +833,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_INTL_INDEX_SET(parent, alloc_index);
alloc_index = NULL;
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
-#endif
-
/*
* If discarding the page's original WT_REF field, reset it to split.
* Threads cursoring through the tree were blocked because that WT_REF
@@ -875,18 +851,27 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
__wt_free(session, ref->page_del);
}
+ /*
+ * Set the discarded WT_REF state to split, ensuring we don't
+ * race with any discard of the WT_REF deleted fields.
+ */
WT_PUBLISH(ref->state, WT_REF_SPLIT);
- }
- /*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
- */
- WT_FULL_BARRIER();
+ /*
+ * Push out the change: not required for correctness, but stops
+ * threads spinning on incorrect page references.
+ */
+ WT_FULL_BARRIER();
+ }
/* The split is complete and correct, ignore benign errors. */
complete = WT_ERR_IGNORE;
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
+#endif
+
/*
* !!!
* Swapping in the new page index released the page for eviction, we can
@@ -1170,34 +1155,27 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
page->pg_intl_split_gen = split_gen;
/* Prepare the WT_REFs for the move. */
- __split_ref_step1(session, alloc_index, split_gen, true);
+ __split_ref_prepare(session, alloc_index, split_gen, true);
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
alloc_index->entries, parent_incr, false, false));
- /* Confirm the page's index hasn't moved, then update it. */
+ /*
+ * Confirm the page's index hasn't moved, then update it, which makes
+ * the split visible to threads descending the tree.
+ */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
WT_INTL_INDEX_SET(page, replace_index);
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, page));
-#endif
-
- /* Finalize the WT_REFs we moved. */
- WT_ERR(__split_ref_step2(session, alloc_index, true));
-
/* The split is complete and correct, ignore benign errors. */
complete = WT_ERR_IGNORE;
- /*
- * Push out the changes: not required for correctness, but no reason
- * to wait.
- */
- WT_FULL_BARRIER();
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ ret = __split_verify_intl(session, parent, page, page, true));
+ WT_ERR(ret);
+#endif
/*
* We don't care about the page-index we allocated, all we needed was
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 049700952ee..ddaa2e5f70b 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -340,9 +340,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
* Take a copy of any held page and clear the return value. Remember
* the hazard pointer we're currently holding.
*
- * We may be passed a pointer to btree->evict_page that we are clearing
- * here. We check when discarding pages that we're not discarding that
- * page, so this clear must be done before the page is released.
+ * Clear the returned value, it makes future error handling easier.
*/
couple = couple_orig = ref = *refp;
*refp = NULL;
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index db39a5acdee..efe056aee02 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -756,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* Clear a single walk point.
*/
static int
-__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
+__evict_clear_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -773,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
if ((ref = btree->evict_ref) == NULL)
return (0);
- if (count_stat)
- WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
+ WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
/*
- * Clear evict_ref first, in case releasing it forces eviction (we
- * assert we never try to evict the current eviction walk point).
+ * Clear evict_ref before releasing it in case that forces eviction (we
+ * assert that we never try to evict the current eviction walk point).
*/
btree->evict_ref = NULL;
+
WT_WITH_DHANDLE(cache->walk_session, session->dhandle,
(ret = __wt_page_release(cache->walk_session,
ref, WT_READ_NO_EVICT)));
@@ -803,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (WT_PREFIX_MATCH(dhandle->name, "file:"))
WT_WITH_DHANDLE(session, dhandle,
- WT_TRET(__evict_clear_walk(session, true)));
+ WT_TRET(__evict_clear_walk(session)));
return (ret);
}
@@ -848,7 +848,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
/* Clear any existing LRU eviction walk for the file. */
WT_WITH_PASS_LOCK(session,
- ret = __evict_clear_walk(session, true));
+ ret = __evict_clear_walk(session));
(void)__wt_atomic_subv32(&cache->pass_intr, 1);
WT_ERR(ret);
@@ -1662,8 +1662,15 @@ __evict_walk_file(WT_SESSION_IMPL *session,
FLD_SET(walk_flags, WT_READ_PREV);
/*
- * Get some more eviction candidate pages.
- *
+ * Get some more eviction candidate pages, starting at the last saved
+ * point. Clear the saved point immediately, we assert when discarding
+ * pages we're not discarding an eviction point, so this clear must be
+ * complete before the page is released.
+ */
+ ref = btree->evict_ref;
+ btree->evict_ref = NULL;
+
+ /*
* !!! Take care terminating this loop.
*
* Don't make an extra call to __wt_tree_walk after we hit the end of a
@@ -1676,7 +1683,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
for (evict = start, pages_queued = pages_seen = refs_walked = 0;
evict < end && (ret == 0 || ret == WT_NOTFOUND);
ret = __wt_tree_walk_count(
- session, &btree->evict_ref, &refs_walked, walk_flags)) {
+ session, &ref, &refs_walked, walk_flags)) {
/*
* Check whether we're finding a good ratio of candidates vs
* pages seen. Some workloads create "deserts" in trees where
@@ -1690,7 +1697,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
if (give_up)
break;
- if ((ref = btree->evict_ref) == NULL) {
+ if (ref == NULL) {
if (++restarts == 2)
break;
WT_STAT_CONN_INCR(
@@ -1812,6 +1819,8 @@ fast: /* If the page can't be evicted, give up. */
btree->evict_walk_period /= 2;
/*
+ * Give up the walk occasionally.
+ *
* If we happen to end up on the root page or a page requiring urgent
* eviction, clear it. We have to track hazard pointers, and the root
* page complicates that calculation.
@@ -1823,16 +1832,20 @@ fast: /* If the page can't be evicted, give up. */
* If we land on a page requiring forced eviction, move on to the next
* page: we want this page evicted as quickly as possible.
*/
- if ((ref = btree->evict_ref) != NULL) {
- /* Give up the walk occasionally. */
+ if (ref != NULL) {
if (__wt_ref_is_root(ref) || evict == start || give_up ||
ref->page->read_gen == WT_READGEN_OLDEST ||
- ref->page->memory_footprint >= btree->splitmempage)
- WT_RET(__evict_clear_walk(session, restarts == 0));
- else if (ref->page->read_gen == WT_READGEN_OLDEST)
+ ref->page->memory_footprint >= btree->splitmempage) {
+ if (restarts == 0)
+ WT_STAT_CONN_INCR(
+ session, cache_eviction_walks_abandoned);
+ WT_RET(__wt_page_release(cache->walk_session,
+ ref, WT_READ_NO_EVICT));
+ ref = NULL;
+ } else if (ref->page->read_gen == WT_READGEN_OLDEST)
WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
- session, &btree->evict_ref,
- &refs_walked, walk_flags));
+ session, &ref, &refs_walked, walk_flags));
+ btree->evict_ref = ref;
}
WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
diff --git a/src/include/session.h b/src/include/session.h
index 7dd523aea26..085f871a34f 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -52,8 +52,6 @@ struct __wt_session_impl {
const char *lastop; /* Last operation */
uint32_t id; /* UID, offset in session array */
- WT_CONDVAR *cond; /* Condition variable */
-
WT_EVENT_HANDLER *event_handler;/* Application's event handlers */
WT_DATA_HANDLE *dhandle; /* Current data handle */
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 71626e098cb..3a5d06f1b61 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -234,9 +234,6 @@ __session_close(WT_SESSION *wt_session, const char *config)
/* Release common session resources. */
WT_TRET(__wt_session_release_resources(session));
- /* Destroy the thread's mutex. */
- WT_TRET(__wt_cond_destroy(session, &session->cond));
-
/* The API lock protects opening and closing of sessions. */
__wt_spin_lock(session, &conn->api_lock);
@@ -1837,8 +1834,6 @@ __open_session(WT_CONNECTION_IMPL *conn,
session_ret->name = NULL;
session_ret->id = i;
- WT_ERR(__wt_cond_alloc(session, "session", &session_ret->cond));
-
if (WT_SESSION_FIRST_USE(session_ret))
__wt_random_init(&session_ret->rnd);