summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/evict
diff options
context:
space:
mode:
authorAlex Gorrod <alexander.gorrod@mongodb.com>2017-02-17 11:22:16 +1100
committerAlex Gorrod <alexander.gorrod@mongodb.com>2017-02-17 11:22:16 +1100
commit9c8c662a9213b16ae206f495c875594f5f0454f0 (patch)
tree7c9e527eec0fc5d99119ed5bf080660e96d20405 /src/third_party/wiredtiger/src/evict
parent22ec4be075233d425f21349854b5ceac6baa5289 (diff)
downloadmongo-9c8c662a9213b16ae206f495c875594f5f0454f0.tar.gz
Import wiredtiger: e1bcc30da91eedd0b17cebb725cc7e607ffa2340 from branch mongodb-3.6
ref: 48a3cbc17f..e1bcc30da9 for: 3.5.4 WT-2790 Fix a test case false positive in test_sweep01 WT-2909 Create automatable test verifying checkpoint integrity after errors WT-3088 bug: Don't evict a page with refs visible to readers after a split WT-3097 Race on reconfigure or shutdown can lead to waiting for statistics log server WT-3111 util_create() doesn't free memory assigned to "uri" WT-3113 Add a verbose mode to dump the cache when eviction is stuck WT-3115 Change the dhandle lock to a read/write lock WT-3120 Fix ordering problem in connection_close for filesystem loaded in an extension WT-3135 search_near() for index with custom collator WT-3137 Hang in __log_slot_join/__log_slot_switch_internal WT-3139 Enhance wtperf to support periodic table scans WT-3144 bug fix: random cursor returns not-found when descending to an empty page WT-3148 Improve eviction efficiency with many small trees WT-3149 Change eviction to start new walks from a random place in the tree WT-3150 Reduce impact of checkpoints on eviction server WT-3152 Convert table lock from a spinlock to a read write lock WT-3156 Assertion in log_write fires after write failure WT-3157 checkpoint/transaction integrity issue when writes fail WT-3159 Incorrect key for index containing multiple variable sized entries WT-3161 checkpoint hang after write failure injection WT-3164 Ensure all relevant btree fields are reset on checkpoint error WT-3170 Clear the eviction walk point while populating from a tree WT-3173 Add runtime detection for s390x CRC32 hardware support WT-3174 Coverity/lint cleanup WT-3175 New hang in internal page split WT-3179 Test bug: clang sanitizer failure in fail_fs WT-3180 Fault injection tests should only run as "long" tests and should not create core files WT-3184 Problem duplicating index cursor with custom collator WT-3186 Fix error path and panic detection in logging loops
Diffstat (limited to 'src/third_party/wiredtiger/src/evict')
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c617
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_stat.c2
2 files changed, 307 insertions, 312 deletions
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 9b969de9a9e..42fe4d4608e 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -24,40 +24,40 @@ static int __evict_walk_file(
(S2C(s)->evict_threads.current_threads > 1)
/*
- * __evict_lock_dhandle --
- * Try to get the dhandle lock, with yield and sleep back off.
+ * __evict_lock_handle_list --
+ * Try to get the handle list lock, with yield and sleep back off.
* Keep timing statistics overall.
*/
static int
-__evict_lock_dhandle(WT_SESSION_IMPL *session)
+__evict_lock_handle_list(WT_SESSION_IMPL *session)
{
struct timespec enter, leave;
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_SPINLOCK *dh_lock;
- int64_t **stats;
+ WT_RWLOCK *dh_lock;
u_int spins;
bool dh_stats;
conn = S2C(session);
cache = conn->cache;
dh_lock = &conn->dhandle_lock;
- stats = (int64_t **)conn->stats;
- dh_stats = WT_STAT_ENABLED(session) && dh_lock->stat_count_off != -1;
/*
- * Maintain lock acquisition timing statistics as if this were a
- * regular lock acquisition.
+ * Setup tracking of handle lock acquisition wait time if statistics
+ * are enabled.
*/
+ dh_stats = WT_STAT_ENABLED(session);
+
if (dh_stats)
__wt_epoch(session, &enter);
+
/*
* Use a custom lock acquisition back off loop so the eviction server
* notices any interrupt quickly.
*/
for (spins = 0;
- (ret = __wt_spin_trylock_track(session, dh_lock)) == EBUSY &&
+ (ret = __wt_try_readlock(session, dh_lock)) == EBUSY &&
cache->pass_intr == 0; spins++) {
if (spins < WT_THOUSAND)
__wt_yield();
@@ -70,8 +70,9 @@ __evict_lock_dhandle(WT_SESSION_IMPL *session)
WT_RET(ret);
if (dh_stats) {
__wt_epoch(session, &leave);
- stats[session->stat_bucket][dh_lock->stat_int_usecs_off] +=
- (int64_t)WT_TIMEDIFF_US(leave, enter);
+ WT_STAT_CONN_INCRV(
+ session, lock_handle_list_wait_eviction,
+ (int64_t)WT_TIMEDIFF_US(leave, enter));
}
return (0);
}
@@ -197,8 +198,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
}
__wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
}
- WT_ASSERT(session,
- !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
__wt_spin_unlock(session, &cache->evict_queue_lock);
}
@@ -267,7 +267,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session)
}
#endif
- __wt_cond_auto_signal(session, cache->evict_cond);
+ __wt_cond_signal(session, cache->evict_cond);
}
/*
@@ -280,12 +280,12 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- bool did_work;
+ bool did_work, was_intr;
conn = S2C(session);
cache = conn->cache;
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
/*
* Ensure the cache stuck timer is initialized when starting eviction.
*/
@@ -308,12 +308,28 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
ret = __evict_server(session, &did_work);
F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS);
F_CLR(session, WT_SESSION_LOCKED_PASS);
+ was_intr = cache->pass_intr != 0;
__wt_spin_unlock(session, &cache->evict_pass_lock);
WT_ERR(ret);
+
+ /*
+ * If the eviction server was interrupted, wait until
+ * requests have been processed: the system may
+ * otherwise be busy so don't go to sleep.
+ */
+ if (was_intr) {
+ while (cache->pass_intr != 0 &&
+ F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
+ F_ISSET(thread, WT_THREAD_RUN))
+ __wt_yield();
+ continue;
+ }
+
__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping");
+
/* Don't rely on signals: check periodically. */
__wt_cond_auto_wait(
- session, cache->evict_cond, did_work);
+ session, cache->evict_cond, did_work, NULL);
__wt_verbose(session, WT_VERB_EVICTSERVER, "waking");
} else
WT_ERR(__evict_lru_pages(session, false));
@@ -353,12 +369,12 @@ err: WT_PANIC_MSG(session, ret, "cache eviction thread error");
static int
__evict_server(WT_SESSION_IMPL *session, bool *did_work)
{
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
+ struct timespec now;
+#endif
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
-#ifdef HAVE_DIAGNOSTIC
- struct timespec now;
-#endif
uint64_t orig_pages_evicted;
conn = S2C(session);
@@ -370,7 +386,8 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
/* Evict pages from the cache as needed. */
WT_RET(__evict_pass(session));
- if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) ||
+ cache->pass_intr != 0)
return (0);
/*
@@ -378,28 +395,31 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
* otherwise we can block applications evicting large pages.
*/
if (!__wt_cache_stuck(session)) {
-
/*
- * If we gave up acquiring the lock, that indicates a
- * session is waiting for us to clear walks. Do that
- * as part of a normal pass (without the handle list
+ * Try to get the handle list lock: if we give up, that
+ * indicates a session is waiting for us to clear walks. Do
+ * that as part of a normal pass (without the handle list
* lock) to avoid deadlock.
*/
- if ((ret = __evict_lock_dhandle(session)) == EBUSY)
+ if ((ret = __evict_lock_handle_list(session)) == EBUSY)
return (0);
WT_RET(ret);
ret = __evict_clear_all_walks(session);
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
WT_RET(ret);
cache->pages_evicted = 0;
} else if (cache->pages_evicted != cache->pages_evict) {
cache->pages_evicted = cache->pages_evict;
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
__wt_epoch(session, &cache->stuck_ts);
} else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) {
/*
- * After being stuck for 5 minutes, give up.
+ * If we're stuck for 5 minutes in diagnostic mode, or the
+ * verbose evict_stuck flag is configured, log the cache
+ * and transaction state.
+ *
+ * If we're stuck for 5 minutes in diagnostic mode, give up.
*
* We don't do this check for in-memory workloads because
* application threads are not blocked by the cache being full.
@@ -408,11 +428,22 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
*/
__wt_epoch(session, &now);
if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) {
- ret = ETIMEDOUT;
- __wt_err(session, ret,
+#if defined(HAVE_DIAGNOSTIC)
+ __wt_err(session, ETIMEDOUT,
"Cache stuck for too long, giving up");
- WT_TRET(__wt_dump_stuck_info(session, NULL));
+ ret = ETIMEDOUT;
+ WT_TRET(__wt_verbose_dump_txn(session));
+ WT_TRET(__wt_verbose_dump_cache(session));
return (ret);
+#elif defined(HAVE_VERBOSE)
+ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
+ WT_RET(__wt_verbose_dump_txn(session));
+ WT_RET(__wt_verbose_dump_cache(session));
+
+ /* Reset the timer. */
+ __wt_epoch(session, &cache->stuck_ts);
+ }
+#endif
}
#endif
}
@@ -697,8 +728,8 @@ __evict_pass(WT_SESSION_IMPL *session)
*/
WT_STAT_CONN_INCR(session,
cache_eviction_server_slept);
- __wt_cond_wait(
- session, cache->evict_cond, WT_THOUSAND);
+ __wt_cond_wait(session,
+ cache->evict_cond, WT_THOUSAND, NULL);
continue;
}
@@ -725,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* Clear a single walk point.
*/
static int
-__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
+__evict_clear_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -742,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat)
if ((ref = btree->evict_ref) == NULL)
return (0);
- if (count_stat)
- WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
+ WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
/*
- * Clear evict_ref first, in case releasing it forces eviction (we
- * assert we never try to evict the current eviction walk point).
+ * Clear evict_ref before releasing it in case that forces eviction (we
+ * assert that we never try to evict the current eviction walk point).
*/
btree->evict_ref = NULL;
+
WT_WITH_DHANDLE(cache->walk_session, session->dhandle,
(ret = __wt_page_release(cache->walk_session,
ref, WT_READ_NO_EVICT)));
@@ -772,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (WT_PREFIX_MATCH(dhandle->name, "file:"))
WT_WITH_DHANDLE(session, dhandle,
- WT_TRET(__evict_clear_walk(session, true)));
+ WT_TRET(__evict_clear_walk(session)));
return (ret);
}
@@ -817,7 +848,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
/* Clear any existing LRU eviction walk for the file. */
WT_WITH_PASS_LOCK(session,
- ret = __evict_clear_walk(session, true));
+ ret = __evict_clear_walk(session));
(void)__wt_atomic_subv32(&cache->pass_intr, 1);
WT_ERR(ret);
@@ -1087,7 +1118,8 @@ __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server)
/* If a worker thread found the queue empty, pause. */
if (ret == WT_NOTFOUND && !is_server &&
F_ISSET(S2C(session), WT_CONN_EVICTION_RUN))
- __wt_cond_wait(session, conn->evict_threads.wait_cond, 10000);
+ __wt_cond_wait(
+ session, conn->evict_threads.wait_cond, 10000, NULL);
return (ret == WT_NOTFOUND ? 0 : ret);
}
@@ -1304,7 +1336,7 @@ retry: while (slot < max_entries) {
* reference count to keep it alive while we sweep.
*/
if (!dhandle_locked) {
- WT_ERR(__evict_lock_dhandle(session));
+ WT_ERR(__evict_lock_handle_list(session));
dhandle_locked = true;
}
@@ -1383,7 +1415,7 @@ retry: while (slot < max_entries) {
(void)__wt_atomic_addi32(&dhandle->session_inuse, 1);
incr = true;
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
dhandle_locked = false;
/*
@@ -1430,7 +1462,7 @@ retry: while (slot < max_entries) {
}
err: if (dhandle_locked) {
- __wt_spin_unlock(session, &conn->dhandle_lock);
+ __wt_readunlock(session, &conn->dhandle_lock);
dhandle_locked = false;
}
@@ -1526,6 +1558,19 @@ __evict_walk_file(WT_SESSION_IMPL *session,
start = queue->evict_queue + *slotp;
remaining_slots = max_entries - *slotp;
total_slots = max_entries - queue->evict_entries;
+ btree_inuse = cache_inuse = 0;
+ target_pages_clean = target_pages_dirty = 0;
+
+ /*
+ * The number of times we should fill the queue by the end of
+ * considering all trees.
+ */
+#define QUEUE_FILLS_PER_PASS 10
+
+ /*
+ * The minimum number of pages we should consider per tree.
+ */
+#define MIN_PAGES_PER_TREE 10
/*
* The target number of pages for this tree is proportional to the
@@ -1534,13 +1579,12 @@ __evict_walk_file(WT_SESSION_IMPL *session,
* cache (and only have to walk it once).
*/
if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
- btree_inuse = __wt_btree_bytes_inuse(session);
+ btree_inuse = __wt_btree_bytes_evictable(session);
cache_inuse = __wt_cache_bytes_inuse(cache);
bytes_per_slot = 1 + cache_inuse / total_slots;
target_pages_clean = (uint32_t)(
(btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
- } else
- target_pages_clean = 0;
+ }
if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) {
btree_inuse = __wt_btree_dirty_leaf_inuse(session);
@@ -1548,35 +1592,58 @@ __evict_walk_file(WT_SESSION_IMPL *session,
bytes_per_slot = 1 + cache_inuse / total_slots;
target_pages_dirty = (uint32_t)(
(btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
- } else
- target_pages_dirty = 0;
+ }
- target_pages = WT_MAX(target_pages_clean, target_pages_dirty);
+ /*
+ * Weight the number of target pages by the number of times we want to
+ * fill the cache per pass through all the trees. Note that we don't
+ * build this into the calculation above because we don't want to favor
+ * small trees, so round to a whole number of slots (zero for small
+ * trees) before multiplying.
+ */
+ target_pages = WT_MAX(target_pages_clean, target_pages_dirty) *
+ QUEUE_FILLS_PER_PASS;
+ /*
+ * Randomly walk trees with a small fraction of the cache in case there
+ * are so many trees that none of them use enough of the cache to be
+ * allocated slots.
+ *
+ * The chance of walking a tree is equal to the chance that a random
+ * byte in cache belongs to the tree, weighted by how many times we
+ * want to fill queues during a pass through all the trees in cache.
+ */
if (target_pages == 0) {
- /*
- * Randomly walk trees with a tiny fraction of the cache in
- * case there are so many trees that none of them use enough of
- * the cache to be allocated slots. Walk small trees 1% of the
- * time.
- */
- if (__wt_random(&session->rnd) > UINT32_MAX / 100)
+ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
+ btree_inuse = __wt_btree_bytes_evictable(session);
+ cache_inuse = __wt_cache_bytes_inuse(cache);
+ } else {
+ btree_inuse = __wt_btree_dirty_leaf_inuse(session);
+ cache_inuse = __wt_cache_dirty_leaf_inuse(cache);
+ }
+ if (btree_inuse == 0 || cache_inuse == 0)
+ return (0);
+ if (__wt_random64(&session->rnd) % cache_inuse >
+ btree_inuse * QUEUE_FILLS_PER_PASS)
return (0);
- target_pages = 10;
}
+ /*
+ * There is some cost associated with walking a tree. If we're going
+ * to visit this tree, always look for a minimum number of pages.
+ */
+ if (target_pages < MIN_PAGES_PER_TREE)
+ target_pages = MIN_PAGES_PER_TREE;
+
+ /*
+ * If the tree is dead or we're near the end of the queue, fill the
+ * remaining slots.
+ */
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
target_pages > remaining_slots)
target_pages = remaining_slots;
end = start + target_pages;
- walk_flags =
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
-
- /* Randomize the walk direction. */
- if (btree->evict_walk_reverse)
- FLD_SET(walk_flags, WT_READ_PREV);
-
/*
* Examine at least a reasonable number of pages before deciding
* whether to give up. When we are only looking for dirty pages,
@@ -1588,8 +1655,41 @@ __evict_walk_file(WT_SESSION_IMPL *session,
min_pages *= 10;
/*
- * Get some more eviction candidate pages.
- *
+ * Choose a random point in the tree if looking for candidates in a
+ * tree with no starting point set. This is mostly aimed at ensuring
+ * eviction fairly visits all pages in trees with a lot of in-cache
+ * content.
+ */
+ if (btree->evict_ref == NULL) {
+ /* Ensure internal pages indexes remain valid for our walk */
+ WT_WITH_PAGE_INDEX(session, ret =
+ __wt_random_descent(session, &btree->evict_ref, true));
+ WT_RET_NOTFOUND_OK(ret);
+
+ /*
+ * Reverse the direction of the walk each time we start at a
+ * random point so both ends of the tree are equally likely to
+ * be visited.
+ */
+ btree->evict_walk_reverse = !btree->evict_walk_reverse;
+ }
+
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
+ if (btree->evict_walk_reverse)
+ FLD_SET(walk_flags, WT_READ_PREV);
+
+ /*
+ * Get some more eviction candidate pages, starting at the last saved
+ * point. Clear the saved point immediately, we assert when discarding
+ * pages we're not discarding an eviction point, so this clear must be
+ * complete before the page is released.
+ */
+ ref = btree->evict_ref;
+ btree->evict_ref = NULL;
+
+ /*
* !!! Take care terminating this loop.
*
* Don't make an extra call to __wt_tree_walk after we hit the end of a
@@ -1602,7 +1702,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
for (evict = start, pages_queued = pages_seen = refs_walked = 0;
evict < end && (ret == 0 || ret == WT_NOTFOUND);
ret = __wt_tree_walk_count(
- session, &btree->evict_ref, &refs_walked, walk_flags)) {
+ session, &ref, &refs_walked, walk_flags)) {
/*
* Check whether we're finding a good ratio of candidates vs
* pages seen. Some workloads create "deserts" in trees where
@@ -1616,7 +1716,7 @@ __evict_walk_file(WT_SESSION_IMPL *session,
if (give_up)
break;
- if ((ref = btree->evict_ref) == NULL) {
+ if (ref == NULL) {
if (++restarts == 2)
break;
WT_STAT_CONN_INCR(
@@ -1706,7 +1806,7 @@ fast: /* If the page can't be evicted, give up. */
++pages_queued;
if (WT_PAGE_IS_INTERNAL(page))
- ++internal_pages;
+ ++internal_pages;
__wt_verbose(session, WT_VERB_EVICTSERVER,
"select: %p, size %" WT_SIZET_FMT,
@@ -1719,12 +1819,10 @@ fast: /* If the page can't be evicted, give up. */
session, cache_eviction_pages_queued, (u_int)(evict - start));
/*
- * If we didn't find any candidates in the file, reverse the direction
- * of the walk and skip it next time.
+ * If we couldn't find the number of pages we were looking for, skip
+ * the tree next time.
*/
- if (give_up)
- btree->evict_walk_reverse = !btree->evict_walk_reverse;
- if (pages_queued == 0 && !urgent_queued)
+ if (pages_queued < target_pages / 2 && !urgent_queued)
btree->evict_walk_period = WT_MIN(
WT_MAX(1, 2 * btree->evict_walk_period), 100);
else if (pages_queued == target_pages)
@@ -1733,6 +1831,8 @@ fast: /* If the page can't be evicted, give up. */
btree->evict_walk_period /= 2;
/*
+ * Give up the walk occasionally.
+ *
* If we happen to end up on the root page or a page requiring urgent
* eviction, clear it. We have to track hazard pointers, and the root
* page complicates that calculation.
@@ -1744,16 +1844,20 @@ fast: /* If the page can't be evicted, give up. */
* If we land on a page requiring forced eviction, move on to the next
* page: we want this page evicted as quickly as possible.
*/
- if ((ref = btree->evict_ref) != NULL) {
- /* Give up the walk occasionally. */
+ if (ref != NULL) {
if (__wt_ref_is_root(ref) || evict == start || give_up ||
ref->page->read_gen == WT_READGEN_OLDEST ||
- ref->page->memory_footprint >= btree->splitmempage)
- WT_RET(__evict_clear_walk(session, restarts == 0));
- else if (ref->page->read_gen == WT_READGEN_OLDEST)
+ ref->page->memory_footprint >= btree->splitmempage) {
+ if (restarts == 0)
+ WT_STAT_CONN_INCR(
+ session, cache_eviction_walks_abandoned);
+ WT_RET(__wt_page_release(cache->walk_session,
+ ref, WT_READ_NO_EVICT));
+ ref = NULL;
+ } else if (ref->page->read_gen == WT_READGEN_OLDEST)
WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
- session, &btree->evict_ref,
- &refs_walked, walk_flags));
+ session, &ref, &refs_walked, walk_flags));
+ btree->evict_ref = ref;
}
WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
@@ -2087,8 +2191,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
break;
case WT_NOTFOUND:
/* Allow the queue to re-populate before retrying. */
- __wt_cond_wait(
- session, conn->evict_threads.wait_cond, 10000);
+ __wt_cond_wait(session,
+ conn->evict_threads.wait_cond, 10000, NULL);
cache->app_waits++;
break;
default:
@@ -2184,226 +2288,140 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session)
S2BT(session)->evict_priority = 0;
}
-#ifdef HAVE_DIAGNOSTIC
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
/*
- * __dump_txn_state --
- * Output debugging information about the global transaction state.
+ * __verbose_dump_cache_single --
+ * Output diagnostic information about a single file in the cache.
*/
static int
-__dump_txn_state(WT_SESSION_IMPL *session, FILE *fp)
+__verbose_dump_cache_single(WT_SESSION_IMPL *session,
+ uint64_t *total_bytesp, uint64_t *total_dirty_bytesp)
{
- WT_CONNECTION_IMPL *conn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN *txn;
- WT_TXN_STATE *s;
- const char *iso_tag;
- uint64_t id;
- uint32_t i, session_cnt;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
-
- /* Note: odd string concatenation avoids spelling errors. */
- if (fprintf(fp, "==========\n" "transaction state dump\n") < 0)
- return (EIO);
-
- if (fprintf(fp,
- "current ID: %" PRIu64 "\n"
- "last running ID: %" PRIu64 "\n"
- "oldest ID: %" PRIu64 "\n"
- "oldest named snapshot ID: %" PRIu64 "\n",
- txn_global->current, txn_global->last_running,
- txn_global->oldest_id, txn_global->nsnap_oldest_id) < 0)
- return (EIO);
-
- if (fprintf(fp,
- "checkpoint running? %s\n"
- "checkpoint generation: %" PRIu64 "\n"
- "checkpoint pinned ID: %" PRIu64 "\n"
- "checkpoint txn ID: %" PRIu64 "\n"
- "session count: %" PRIu32 "\n",
- txn_global->checkpoint_running ? "yes" : "no",
- txn_global->checkpoint_gen,
- txn_global->checkpoint_pinned,
- txn_global->checkpoint_txnid,
- session_cnt) < 0)
- return (EIO);
-
- if (fprintf(fp, "Dumping transaction state of active sessions\n") < 0)
- return (EIO);
-
- /*
- * Walk each session transaction state and dump information. Accessing
- * the content of session handles is not thread safe, so some
- * information may change while traversing if other threads are active
- * at the same time, which is OK since this is diagnostic code.
- */
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /* Skip sessions with no active transaction */
- if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
- continue;
+ WT_DATA_HANDLE *dhandle;
+ WT_PAGE *page;
+ WT_REF *next_walk;
+ size_t size;
+ uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
+ uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
+ uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
+ uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
- txn = &conn->sessions[i].txn;
- iso_tag = "INVALID";
- switch (txn->isolation) {
- case WT_ISO_READ_COMMITTED:
- iso_tag = "WT_ISO_READ_COMMITTED";
- break;
- case WT_ISO_READ_UNCOMMITTED:
- iso_tag = "WT_ISO_READ_UNCOMMITTED";
- break;
- case WT_ISO_SNAPSHOT:
- iso_tag = "WT_ISO_SNAPSHOT";
- break;
+ intl_bytes = intl_bytes_max = intl_dirty_bytes = 0;
+ intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0;
+ leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0;
+ leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0;
+
+ next_walk = NULL;
+ while (__wt_tree_walk(session, &next_walk,
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
+ next_walk != NULL) {
+ page = next_walk->page;
+ size = page->memory_footprint;
+
+ if (WT_PAGE_IS_INTERNAL(page)) {
+ ++intl_pages;
+ intl_bytes += size;
+ intl_bytes_max = WT_MAX(intl_bytes_max, size);
+ if (__wt_page_is_modified(page)) {
+ ++intl_dirty_pages;
+ intl_dirty_bytes += size;
+ intl_dirty_bytes_max =
+ WT_MAX(intl_dirty_bytes_max, size);
+ }
+ } else {
+ ++leaf_pages;
+ leaf_bytes += size;
+ leaf_bytes_max = WT_MAX(leaf_bytes_max, size);
+ if (__wt_page_is_modified(page)) {
+ ++leaf_dirty_pages;
+ leaf_dirty_bytes += size;
+ leaf_dirty_bytes_max =
+ WT_MAX(leaf_dirty_bytes_max, size);
+ }
}
-
- if (fprintf(fp,
- "ID: %6" PRIu64
- ", mod count: %u"
- ", pinned ID: %" PRIu64
- ", snap min: %" PRIu64
- ", snap max: %" PRIu64
- ", metadata pinned ID: %" PRIu64
- ", flags: 0x%08" PRIx32
- ", name: %s"
- ", isolation: %s" "\n",
- id,
- txn->mod_count,
- s->pinned_id,
- txn->snap_min,
- txn->snap_max,
- s->metadata_pinned,
- txn->flags,
- conn->sessions[i].name == NULL ?
- "EMPTY" : conn->sessions[i].name,
- iso_tag) < 0)
- return (EIO);
}
+ dhandle = session->dhandle;
+ if (dhandle->checkpoint == NULL)
+ WT_RET(__wt_msg(session, "%s(<live>):", dhandle->name));
+ else
+ WT_RET(__wt_msg(session, "%s(checkpoint=%s):",
+ dhandle->name, dhandle->checkpoint));
+ if (intl_pages != 0)
+ WT_RET(__wt_msg(session,
+ "internal: "
+ "%" PRIu64 " pages, "
+ "%" PRIu64 "MB, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
+ "%" PRIu64 "MB max page, "
+ "%" PRIu64 "MB max dirty page",
+ intl_pages,
+ intl_bytes / WT_MEGABYTE,
+ intl_pages - intl_dirty_pages,
+ intl_dirty_pages,
+ (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE,
+ intl_dirty_bytes / WT_MEGABYTE,
+ intl_bytes_max / WT_MEGABYTE,
+ intl_dirty_bytes_max / WT_MEGABYTE));
+ if (leaf_pages != 0)
+ WT_RET(__wt_msg(session,
+ "leaf: "
+ "%" PRIu64 " pages, "
+ "%" PRIu64 "MB, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
+ "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
+ "%" PRIu64 "MB max page, "
+ "%" PRIu64 "MB max dirty page",
+ leaf_pages,
+ leaf_bytes / WT_MEGABYTE,
+ leaf_pages - leaf_dirty_pages,
+ leaf_dirty_pages,
+ (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE,
+ leaf_dirty_bytes / WT_MEGABYTE,
+ leaf_bytes_max / WT_MEGABYTE,
+ leaf_dirty_bytes_max / WT_MEGABYTE));
+
+ *total_bytesp += intl_bytes + leaf_bytes;
+ *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes;
+
return (0);
}
/*
- * __dump_cache --
- * Output debugging information about the size of the files in cache.
+ * __wt_verbose_dump_cache --
+ * Output diagnostic information about the cache.
*/
-static int
-__dump_cache(WT_SESSION_IMPL *session, FILE *fp)
+int
+__wt_verbose_dump_cache(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
- WT_DATA_HANDLE *dhandle, *saved_dhandle;
- WT_PAGE *page;
- WT_REF *next_walk;
- uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
- uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
- uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
- uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
uint64_t total_bytes, total_dirty_bytes;
- size_t size;
conn = S2C(session);
total_bytes = total_dirty_bytes = 0;
- /* Note: odd string concatenation avoids spelling errors. */
- if (fprintf(fp, "==========\n" "cache dump\n") < 0)
- return (EIO);
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+ WT_RET(__wt_msg(session, "cache dump"));
- saved_dhandle = session->dhandle;
- TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_READ_LOCK(session,
+ WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
+ if (dhandle == NULL)
+ break;
if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- intl_bytes = intl_bytes_max = intl_dirty_bytes = 0;
- intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0;
- leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0;
- leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0;
-
- next_walk = NULL;
- session->dhandle = dhandle;
- while (__wt_tree_walk(session, &next_walk,
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
- next_walk != NULL) {
- page = next_walk->page;
- size = page->memory_footprint;
-
- if (WT_PAGE_IS_INTERNAL(page)) {
- ++intl_pages;
- intl_bytes += size;
- intl_bytes_max = WT_MAX(intl_bytes_max, size);
- if (__wt_page_is_modified(page)) {
- ++intl_dirty_pages;
- intl_dirty_bytes += size;
- intl_dirty_bytes_max =
- WT_MAX(intl_dirty_bytes_max, size);
- }
- } else {
- ++leaf_pages;
- leaf_bytes += size;
- leaf_bytes_max = WT_MAX(leaf_bytes_max, size);
- if (__wt_page_is_modified(page)) {
- ++leaf_dirty_pages;
- leaf_dirty_bytes += size;
- leaf_dirty_bytes_max =
- WT_MAX(leaf_dirty_bytes_max, size);
- }
- }
- }
- session->dhandle = NULL;
-
- if (dhandle->checkpoint == NULL) {
- if (fprintf(fp,
- "%s(<live>): \n", dhandle->name) < 0)
- return (EIO);
- } else {
- if (fprintf(fp, "%s(checkpoint=%s): \n",
- dhandle->name, dhandle->checkpoint) < 0)
- return (EIO);
- }
- if (intl_pages != 0) {
- if (fprintf(fp,
- "\t" "internal: "
- "%" PRIu64 " pages, "
- "%" PRIu64 "MB, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
- "%" PRIu64 "MB max page, "
- "%" PRIu64 "MB max dirty page\n",
- intl_pages,
- intl_bytes >> 20,
- intl_pages - intl_dirty_pages,
- intl_dirty_pages,
- (intl_bytes - intl_dirty_bytes) >> 20,
- intl_dirty_bytes >> 20,
- intl_bytes_max >> 20,
- intl_dirty_bytes_max >> 20) < 0)
- return (EIO);
- }
- if (leaf_pages != 0) {
- if (fprintf(fp,
- "\t" "leaf: "
- "%" PRIu64 " pages, "
- "%" PRIu64 "MB, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
- "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
- "%" PRIu64 "MB max page, "
- "%" PRIu64 "MB max dirty page\n",
- leaf_pages,
- leaf_bytes >> 20,
- leaf_pages - leaf_dirty_pages,
- leaf_dirty_pages,
- (leaf_bytes - leaf_dirty_bytes) >> 20,
- leaf_dirty_bytes >> 20,
- leaf_bytes_max >> 20,
- leaf_dirty_bytes_max >> 20) < 0)
- return (EIO);
- }
-
- total_bytes += intl_bytes + leaf_bytes;
- total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes;
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __verbose_dump_cache_single(
+ session, &total_bytes, &total_dirty_bytes));
+ if (ret != 0)
+ break;
}
- session->dhandle = saved_dhandle;
+ WT_RET(ret);
/*
* Apply the overhead percentage so our total bytes are comparable with
@@ -2411,39 +2429,16 @@ __dump_cache(WT_SESSION_IMPL *session, FILE *fp)
*/
total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes);
- if (fprintf(fp,
+ WT_RET(__wt_msg(session,
"cache dump: "
- "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n"
- "total dirty bytes: %" PRIu64 "MB\n",
- total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20,
- total_dirty_bytes >> 20) < 0)
- return (EIO);
- if (fprintf(fp, "==========\n") < 0)
- return (EIO);
+ "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB",
+ total_bytes / WT_MEGABYTE,
+ __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE));
+ WT_RET(__wt_msg(session,
+ "total dirty bytes: %" PRIu64 "MB",
+ total_dirty_bytes / WT_MEGABYTE));
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
return (0);
}
-
-/*
- * __wt_dump_stuck_info --
- * Dump debugging information to a file (default stderr) about the state
- * of WiredTiger when we have determined that the cache is stuck full.
- */
-int
-__wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile)
-{
- FILE *fp;
- WT_DECL_RET;
-
- if (ofile == NULL)
- fp = stderr;
- else if ((fp = fopen(ofile, "w")) == NULL)
- return (EIO);
-
- WT_ERR(__dump_txn_state(session, fp));
- WT_ERR(__dump_cache(session, fp));
-err: if (ofile != NULL && fclose(fp) != 0)
- return (EIO);
- return (ret);
-}
#endif
diff --git a/src/third_party/wiredtiger/src/evict/evict_stat.c b/src/third_party/wiredtiger/src/evict/evict_stat.c
index 2dd3b1e83a0..7c2d5722a63 100644
--- a/src/third_party/wiredtiger/src/evict/evict_stat.c
+++ b/src/third_party/wiredtiger/src/evict/evict_stat.c
@@ -134,5 +134,5 @@ __wt_curstat_cache_walk(WT_SESSION_IMPL *session)
WT_STAT_DATA_SET(session,
cache_state_root_size, btree->root.page->memory_footprint);
- WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session));
+ __evict_stat_walk(session);
}