summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@mongodb.com>2017-10-20 01:29:02 -0400
committerMichael Cahill <michael.cahill@mongodb.com>2017-10-20 16:29:02 +1100
commit76b3b945e51d501a3346a37fd48d0561a81e07d8 (patch)
tree569fc49f58b9dcaa39a8fd52768cae494992b53c
parentfa84ecc8bfdb487c8795e4ea2c0382ce671de519 (diff)
downloadmongo-76b3b945e51d501a3346a37fd48d0561a81e07d8.tar.gz
WT-3616 format failed to report a stuck cache (#3745)
Add a separate counter of eviction progress: when we do eviction but don't count it as progress, the page doesn't stay in memory. Update various places that track whether eviction is making progress to use the new counter. In particular, cleanup / rename the eviction thread tuning code and move its state from WT_CONNECTION_IMPL to WT_CACHE.
-rw-r--r--src/async/async_api.c3
-rw-r--r--src/async/async_worker.c6
-rw-r--r--src/block/block_ckpt.c4
-rw-r--r--src/block/block_open.c3
-rw-r--r--src/btree/bt_compact.c4
-rw-r--r--src/btree/bt_page.c3
-rw-r--r--src/btree/bt_read.c15
-rw-r--r--src/btree/bt_vrfy.c3
-rw-r--r--src/btree/row_srch.c3
-rw-r--r--src/config/config.c6
-rw-r--r--src/config/config_api.c1
-rw-r--r--src/conn/conn_cache.c4
-rw-r--r--src/conn/conn_cache_pool.c1
-rw-r--r--src/conn/conn_sweep.c4
-rw-r--r--src/cursor/cur_join.c1
-rw-r--r--src/evict/evict_lru.c168
-rw-r--r--src/include/btmem.h5
-rw-r--r--src/include/btree.i22
-rw-r--r--src/include/cache.h18
-rw-r--r--src/include/cache.i2
-rw-r--r--src/include/connection.h9
-rw-r--r--src/include/serial.i18
-rw-r--r--src/include/txn.i3
-rw-r--r--src/log/log.c12
-rw-r--r--src/log/log_slot.c6
-rw-r--r--src/lsm/lsm_manager.c4
-rw-r--r--src/lsm/lsm_work_unit.c4
-rw-r--r--src/lsm/lsm_worker.c1
-rw-r--r--src/os_posix/os_dir.c3
29 files changed, 189 insertions, 147 deletions
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 7f81ad0a8af..0152238456d 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -111,10 +111,11 @@ __async_new_op_alloc(WT_SESSION_IMPL *session, const char *uri,
WT_CONNECTION_IMPL *conn;
uint32_t i, save_i, view;
+ *opp = NULL;
+
conn = S2C(session);
async = conn->async;
WT_STAT_CONN_INCR(session, async_op_alloc);
- *opp = NULL;
retry:
op = NULL;
diff --git a/src/async/async_worker.c b/src/async/async_worker.c
index 2626a21435f..48961da9e42 100644
--- a/src/async/async_worker.c
+++ b/src/async/async_worker.c
@@ -22,8 +22,9 @@ __async_op_dequeue(WT_CONNECTION_IMPL *conn, WT_SESSION_IMPL *session,
uint64_t sleep_usec;
uint32_t tries;
- async = conn->async;
*op = NULL;
+
+ async = conn->async;
/*
* Wait for work to do. Work is available when async->head moves.
* Then grab the slot containing the work. If we lose, try again.
@@ -125,8 +126,9 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
WT_DECL_RET;
WT_SESSION *wt_session;
- wt_session = (WT_SESSION *)session;
*cursorp = NULL;
+
+ wt_session = (WT_SESSION *)session;
/*
* Compact doesn't need a cursor.
*/
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index d744e10539f..64d0283a8a3 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -53,8 +53,6 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
WT_DECL_RET;
uint8_t *endp;
- ci = NULL;
-
/*
* Sometimes we don't find a root page (we weren't given a checkpoint,
* or the checkpoint was empty). In that case we return an empty root
@@ -62,6 +60,8 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
*/
*root_addr_sizep = 0;
+ ci = NULL;
+
#ifdef HAVE_VERBOSE
if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
if (addr != NULL) {
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 6a9b2e65ac5..5f2ab947acb 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -150,10 +150,11 @@ __wt_block_open(WT_SESSION_IMPL *session,
uint64_t bucket, hash;
uint32_t flags;
+ *blockp = block = NULL;
+
__wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename);
conn = S2C(session);
- *blockp = block = NULL;
hash = __wt_hash_city64(filename, strlen(filename));
bucket = hash % WT_HASH_ARRAY_SIZE;
__wt_spin_lock(session, &conn->block_lock);
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 75bb46aaf89..1a39b479ae8 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -220,6 +220,8 @@ __wt_compact_page_skip(
u_int type;
WT_UNUSED(context);
+ *skipp = false; /* Default to reading */
+
/*
* Skip deleted pages, rewriting them doesn't seem useful; in a better
* world we'd write the parent to delete the page.
@@ -229,8 +231,6 @@ __wt_compact_page_skip(
return (0);
}
- *skipp = false; /* Default to reading */
-
/*
* If the page is in-memory, we want to look at it (it may have been
* modified and written, and the current location is the interesting
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 68b242bf91e..e2d4fa01fa7 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -311,12 +311,13 @@ __inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np)
const WT_PAGE_HEADER *dsk;
uint32_t i;
+ *np = 0;
+
btree = S2BT(session);
dsk = page->dsk;
unpack = &_unpack;
/* Walk the page, counting entries for the repeats array. */
- *np = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
__wt_cell_unpack(cell, unpack);
if (__wt_cell_rle(unpack) > 1)
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 0246c1eca66..838c6845b08 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -296,7 +296,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
* Read a page from the file.
*/
static int
-__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
+__page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
struct timespec start, stop;
WT_BTREE *btree;
@@ -304,7 +304,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ITEM tmp;
WT_PAGE *page;
size_t addr_size;
- uint32_t new_state, previous_state;
+ uint32_t page_flags, new_state, previous_state;
const uint8_t *addr;
bool timer;
@@ -372,9 +372,12 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
* the allocated copy of the disk image on return, the in-memory object
* steals it.
*/
- WT_ERR(__wt_page_inmem(session, ref, tmp.data,
- WT_DATA_IN_ITEM(&tmp) ?
- WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+ page_flags =
+ WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED;
+ if (LF_ISSET(WT_READ_NO_EVICT) ||
+ F_ISSET(session, WT_SESSION_NO_EVICTION))
+ FLD_SET(page_flags, WT_PAGE_READ_NO_EVICT);
+ WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &page));
tmp.mem = NULL;
skip_read:
@@ -499,7 +502,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
if (!LF_ISSET(WT_READ_NO_EVICT))
WT_RET(__wt_cache_eviction_check(
session, 1, NULL));
- WT_RET(__page_read(session, ref));
+ WT_RET(__page_read(session, ref, flags));
/*
* We just read a page, don't evict it before we have a
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index 952b90e3444..c3b5d926a8f 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -696,9 +696,10 @@ __verify_overflow_cell(
const WT_PAGE_HEADER *dsk;
uint32_t cell_num, i;
+ *found = false;
+
btree = S2BT(session);
unpack = &_unpack;
- *found = false;
/*
* If a tree is empty (just created), it won't have a disk image;
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 16081e841dc..68299a396ba 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -23,9 +23,10 @@ __search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
WT_ITEM key;
int cmp, i;
+ *donep = 0;
+
btree = S2BT(session);
collator = btree->collator;
- *donep = 0;
if ((ins = WT_SKIP_LAST(ins_head)) == NULL)
return (0);
diff --git a/src/config/config.c b/src/config/config.c
index dd46aa55ad1..d48c39de6b5 100644
--- a/src/config/config.c
+++ b/src/config/config.c
@@ -346,12 +346,12 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
"", 0, 1, WT_CONFIG_ITEM_BOOL
};
+ /* Keys with no value default to true. */
+ *value = true_value;
+
out = key;
utf8_remain = 0;
-
key->len = 0;
- /* Keys with no value default to true. */
- *value = true_value;
if (conf->go == NULL)
conf->go = gostruct;
diff --git a/src/config/config_api.c b/src/config/config_api.c
index 74da016afbc..3b37732f49b 100644
--- a/src/config/config_api.c
+++ b/src/config/config_api.c
@@ -73,6 +73,7 @@ wiredtiger_config_parser_open(WT_SESSION *wt_session,
WT_SESSION_IMPL *session;
*config_parserp = NULL;
+
session = (WT_SESSION_IMPL *)wt_session;
WT_RET(__wt_calloc_one(session, &config_parser));
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index da5b6bfd55f..c83fb544982 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -295,11 +295,11 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
return (0);
/* The cache should be empty at this point. Complain if not. */
- if (cache->pages_inmem != cache->pages_evict)
+ if (cache->pages_inmem != cache->pages_evicted)
__wt_errx(session,
"cache server: exiting with %" PRIu64 " pages in "
"memory and %" PRIu64 " pages evicted",
- cache->pages_inmem, cache->pages_evict);
+ cache->pages_inmem, cache->pages_evicted);
if (cache->bytes_image != 0)
__wt_errx(session,
"cache server: exiting with %" PRIu64 " image bytes in "
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index 83e122aaabe..4475b27a7b8 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -575,6 +575,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
bool busy, decrease_ok, grow, pool_full;
*adjustedp = false;
+
cp = __wt_process.cache_pool;
grow = false;
pool_full = cp->currently_used >= cp->size;
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index a164e34fe33..9b64c7a0f77 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -159,10 +159,10 @@ __sweep_discard_trees(WT_SESSION_IMPL *session, u_int *dead_handlesp)
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- conn = S2C(session);
-
*dead_handlesp = 0;
+ conn = S2C(session);
+
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (WT_DHANDLE_CAN_DISCARD(dhandle))
++*dead_handlesp;
diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c
index 24f68e3863a..e1fbb63178f 100644
--- a/src/cursor/cur_join.c
+++ b/src/cursor/cur_join.c
@@ -54,6 +54,7 @@ __curjoin_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
WT_CURSOR_JOIN_ITER *iter;
*iterp = NULL;
+
WT_RET(__wt_calloc_one(session, iterp));
iter = *iterp;
iter->cjoin = cjoin;
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 02208e0f84a..0205dbb08e3 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -377,13 +377,12 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- uint64_t orig_pages_evicted;
+
+ /* Assume there has been no progress. */
+ *did_work = false;
conn = S2C(session);
cache = conn->cache;
- WT_ASSERT(session, did_work != NULL);
- *did_work = false;
- orig_pages_evicted = cache->pages_evicted;
/* Evict pages from the cache as needed. */
WT_RET(__evict_pass(session));
@@ -411,46 +410,58 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
__wt_readunlock(session, &conn->dhandle_lock);
WT_RET(ret);
- cache->pages_evicted = 0;
- } else if (cache->pages_evicted != cache->pages_evict) {
- cache->pages_evicted = cache->pages_evict;
+ /* Make sure we'll notice next time we're stuck. */
+ cache->last_eviction_progress = 0;
+ return (0);
+ }
+
+ /* Track if work was done. */
+ *did_work = cache->eviction_progress != cache->last_eviction_progress;
+ cache->last_eviction_progress = cache->eviction_progress;
+
+ /* Check if we have made progress; if not, eviction is stuck. */
+ if (*did_work) {
#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
__wt_epoch(session, &cache->stuck_time);
- } else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) {
- /*
- * If we're stuck for 5 minutes in diagnostic mode, or the
- * verbose evict_stuck flag is configured, log the cache
- * and transaction state.
- *
- * If we're stuck for 5 minutes in diagnostic mode, give up.
- *
- * We don't do this check for in-memory workloads because
- * application threads are not blocked by the cache being full.
- * If the cache becomes full of clean pages, we can be
- * servicing reads while the cache appears stuck to eviction.
- */
- __wt_epoch(session, &now);
- if (WT_TIMEDIFF_SEC(now, cache->stuck_time) > 300) {
+#endif
+ return (0);
+ }
+
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
+ /*
+ * If we're stuck for 5 minutes in diagnostic mode, or the verbose
+ * evict_stuck flag is configured, log the cache and transaction state.
+ *
+ * If we're stuck for 5 minutes in diagnostic mode, give up.
+ *
+ * We don't do this check for in-memory workloads because application
+ * threads are not blocked by the cache being full. If the cache becomes
+ * full of clean pages, we can be servicing reads while the cache
+ * appears stuck to eviction.
+ */
+ if (F_ISSET(conn, WT_CONN_IN_MEMORY))
+ return (0);
+
+ __wt_epoch(session, &now);
+ if (WT_TIMEDIFF_SEC(now, cache->stuck_time) > 300) {
#if defined(HAVE_DIAGNOSTIC)
- __wt_err(session, ETIMEDOUT,
- "Cache stuck for too long, giving up");
- ret = ETIMEDOUT;
- WT_TRET(__wt_verbose_dump_txn(session));
- WT_TRET(__wt_verbose_dump_cache(session));
- return (ret);
+ __wt_err(session, ETIMEDOUT,
+ "Cache stuck for too long, giving up");
+ ret = ETIMEDOUT;
+ WT_TRET(__wt_verbose_dump_txn(session));
+ WT_TRET(__wt_verbose_dump_cache(session));
+ return (ret);
#elif defined(HAVE_VERBOSE)
- if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
- WT_RET(__wt_verbose_dump_txn(session));
- WT_RET(__wt_verbose_dump_cache(session));
+ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
+ WT_RET(__wt_verbose_dump_txn(session));
+ WT_RET(__wt_verbose_dump_cache(session));
- /* Reset the timer. */
- __wt_epoch(session, &cache->stuck_time);
- }
-#endif
+ /* Reset the timer. */
+ __wt_epoch(session, &cache->stuck_time);
}
#endif
}
- *did_work = cache->pages_evicted != orig_pages_evicted;
+#endif
return (0);
}
@@ -622,7 +633,7 @@ __evict_pass(WT_SESSION_IMPL *session)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_TXN_GLOBAL *txn_global;
- uint64_t oldest_id, pages_evicted, prev_oldest_id;
+ uint64_t eviction_progress, oldest_id, prev_oldest_id;
u_int loop;
conn = S2C(session);
@@ -630,7 +641,7 @@ __evict_pass(WT_SESSION_IMPL *session)
txn_global = &conn->txn_global;
/* Track whether pages are being evicted and progress is made. */
- pages_evicted = cache->pages_evict;
+ eviction_progress = cache->eviction_progress;
prev_oldest_id = txn_global->oldest_id;
WT_CLEAR(prev);
@@ -705,7 +716,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* treat the cache as stuck and start rolling back
* transactions and writing updates to the lookaside table.
*/
- if (pages_evicted == cache->pages_evict) {
+ if (eviction_progress == cache->eviction_progress) {
if (WT_TIMEDIFF_MS(now, prev) >= 20 &&
F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD |
WT_CACHE_EVICT_DIRTY_HARD)) {
@@ -757,7 +768,7 @@ __evict_pass(WT_SESSION_IMPL *session)
cache->evict_aggressive_score);
}
loop = 0;
- pages_evicted = cache->pages_evict;
+ eviction_progress = cache->eviction_progress;
}
}
return (0);
@@ -959,7 +970,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
uint64_t delta_msec, delta_pages;
- uint64_t pgs_evicted_cur, pgs_evicted_persec_cur, time_diff;
+ uint64_t eviction_progress, eviction_progress_rate, time_diff;
int32_t cur_threads, i, target_threads, thread_surplus;
conn = S2C(session);
@@ -972,16 +983,16 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
if (conn->evict_threads_max == conn->evict_threads_min)
return;
- pgs_evicted_cur = 0;
+ eviction_progress_rate = 0;
__wt_epoch(session, &current_time);
- time_diff = WT_TIMEDIFF_MS(current_time, conn->evict_tune_last_time);
+ time_diff = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time);
/*
* If we have reached the stable state and have not run long enough to
* surpass the forced re-tuning threshold, return.
*/
- if (conn->evict_tune_stable) {
+ if (cache->evict_tune_stable) {
if (time_diff < EVICT_FORCE_RETUNE)
return;
@@ -989,11 +1000,11 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
* Stable state was reached a long time ago. Let's re-tune.
* Reset all the state.
*/
- conn->evict_tune_stable = false;
- conn->evict_tune_last_action_time.tv_sec = 0;
- conn->evict_tune_pgs_last = 0;
- conn->evict_tune_num_points = 0;
- conn->evict_tune_pg_sec_max = 0;
+ cache->evict_tune_stable = false;
+ cache->evict_tune_last_action_time.tv_sec = 0;
+ cache->evict_tune_progress_last = 0;
+ cache->evict_tune_num_points = 0;
+ cache->evict_tune_progress_rate_max = 0;
/* Reduce the number of eviction workers by one */
thread_surplus =
@@ -1017,10 +1028,10 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
return;
/*
- * Measure the number of evicted pages so far. Eviction rate correlates
- * to performance, so this is our metric of success.
+ * Measure the eviction progress so far. Eviction rate correlates to
+ * performance, so this is our metric of success.
*/
- pgs_evicted_cur = cache->pages_evict;
+ eviction_progress = cache->eviction_progress;
/*
* If we have recorded the number of pages evicted at the end of
@@ -1029,21 +1040,21 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
* measurement interval.
* Otherwise, we just record the number of evicted pages and return.
*/
- if (conn->evict_tune_pgs_last == 0)
+ if (cache->evict_tune_progress_last == 0)
goto done;
- delta_msec = WT_TIMEDIFF_MS(current_time, conn->evict_tune_last_time);
- delta_pages = pgs_evicted_cur - conn->evict_tune_pgs_last;
- pgs_evicted_persec_cur = (delta_pages * WT_THOUSAND) / delta_msec;
- conn->evict_tune_num_points++;
+ delta_msec = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time);
+ delta_pages = eviction_progress - cache->evict_tune_progress_last;
+ eviction_progress_rate = (delta_pages * WT_THOUSAND) / delta_msec;
+ cache->evict_tune_num_points++;
/*
* Keep track of the maximum eviction throughput seen and the number
* of workers corresponding to that throughput.
*/
- if (pgs_evicted_persec_cur > conn->evict_tune_pg_sec_max) {
- conn->evict_tune_pg_sec_max = pgs_evicted_persec_cur;
- conn->evict_tune_workers_best =
+ if (eviction_progress_rate > cache->evict_tune_progress_rate_max) {
+ cache->evict_tune_progress_rate_max = eviction_progress_rate;
+ cache->evict_tune_workers_best =
conn->evict_threads.current_threads;
}
@@ -1057,8 +1068,8 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
* we will go back to the best observed number of workers and
* settle into a stable state.
*/
- if (conn->evict_tune_num_points >= conn->evict_tune_datapts_needed) {
- if (conn->evict_tune_workers_best ==
+ if (cache->evict_tune_num_points >= cache->evict_tune_datapts_needed) {
+ if (cache->evict_tune_workers_best ==
conn->evict_threads.current_threads &&
conn->evict_threads.current_threads <
conn->evict_threads_max) {
@@ -1066,7 +1077,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
* Keep adding workers. We will check again
* at the next check point.
*/
- conn->evict_tune_datapts_needed += WT_MIN(
+ cache->evict_tune_datapts_needed += WT_MIN(
EVICT_TUNE_DATAPT_MIN,
(conn->evict_threads_max -
conn->evict_threads.current_threads) /
@@ -1079,7 +1090,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
*/
thread_surplus =
(int32_t)conn->evict_threads.current_threads -
- (int32_t)conn->evict_tune_workers_best;
+ (int32_t)cache->evict_tune_workers_best;
for (i = 0; i < thread_surplus; i++) {
__wt_thread_group_stop_one(
@@ -1089,8 +1100,8 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
}
WT_STAT_CONN_SET(session,
cache_eviction_stable_state_workers,
- conn->evict_tune_workers_best);
- conn->evict_tune_stable = true;
+ cache->evict_tune_workers_best);
+ cache->evict_tune_stable = true;
WT_STAT_CONN_SET(session, cache_eviction_active_workers,
conn->evict_threads.current_threads);
goto done;
@@ -1103,8 +1114,8 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
* we must accumulate before deciding if we should keep adding workers
* or settle on a previously tried stable number of workers.
*/
- if (conn->evict_tune_last_action_time.tv_sec == 0)
- conn->evict_tune_datapts_needed = EVICT_TUNE_DATAPT_MIN;
+ if (cache->evict_tune_last_action_time.tv_sec == 0)
+ cache->evict_tune_datapts_needed = EVICT_TUNE_DATAPT_MIN;
if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) {
cur_threads = (int32_t)conn->evict_threads.current_threads;
@@ -1121,14 +1132,14 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
__wt_verbose(session,
WT_VERB_EVICTSERVER, "%s", "added worker thread");
}
- conn->evict_tune_last_action_time = current_time;
+ cache->evict_tune_last_action_time = current_time;
}
WT_STAT_CONN_SET(session, cache_eviction_active_workers,
conn->evict_threads.current_threads);
-done: conn->evict_tune_last_time = current_time;
- conn->evict_tune_pgs_last = pgs_evicted_cur;
+done: cache->evict_tune_last_time = current_time;
+ cache->evict_tune_progress_last = eviction_progress;
}
/*
@@ -2022,6 +2033,9 @@ __evict_get_ref(
uint32_t candidates;
bool is_app, server_only, urgent_ok;
+ *btreep = NULL;
+ *refp = NULL;
+
cache = S2C(session)->cache;
is_app = !F_ISSET(session, WT_SESSION_INTERNAL);
server_only = is_server && !WT_EVICT_HAS_WORKERS(session);
@@ -2029,8 +2043,6 @@ __evict_get_ref(
!WT_EVICT_HAS_WORKERS(session) ||
(is_app && __wt_cache_aggressive(session));
urgent_queue = cache->evict_urgent_queue;
- *btreep = NULL;
- *refp = NULL;
WT_STAT_CONN_INCR(session, cache_eviction_get_ref);
@@ -2255,7 +2267,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
WT_DECL_RET;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
- uint64_t init_evict_count, max_pages_evicted;
+ uint64_t initial_progress, max_progress;
bool timer;
conn = S2C(session);
@@ -2282,7 +2294,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
if (timer)
__wt_epoch(session, &enter);
- for (init_evict_count = cache->pages_evict;; ret = 0) {
+ for (initial_progress = cache->eviction_progress;; ret = 0) {
/*
* A pathological case: if we're the oldest transaction in the
* system and the eviction server is stuck trying to find space,
@@ -2307,12 +2319,12 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
if (!busy && txn_state->pinned_id != WT_TXN_NONE &&
txn_global->current != txn_global->oldest_id)
busy = true;
- max_pages_evicted = busy ? 5 : 20;
+ max_progress = busy ? 5 : 20;
/* See if eviction is still needed. */
if (!__wt_eviction_needed(session, busy, &pct_full) ||
- (pct_full < 100 &&
- cache->pages_evict > init_evict_count + max_pages_evicted))
+ (pct_full < 100 && cache->eviction_progress >
+ initial_progress + max_progress))
break;
/*
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 8ba6a240ace..158fcf87d29 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -589,8 +589,9 @@ struct __wt_page {
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
-#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
-#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */
+#define WT_PAGE_READ_NO_EVICT 0x20 /* Page read with eviction disabled */
+#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
+#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
uint8_t unused[2]; /* Unused padding */
diff --git a/src/include/btree.i b/src/include/btree.i
index 8803f3b907d..c62431fe057 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -454,17 +454,25 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite)
}
}
- /* Update pages and bytes evicted. */
+ /* Update bytes and pages evicted. */
(void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint);
+ (void)__wt_atomic_addv64(&cache->pages_evicted, 1);
/*
- * Don't count rewrites as eviction: there's no guarantee we are making
- * real progress.
+ * Track if eviction makes progress. This is used in various places to
+ * determine whether eviction is stuck.
+ *
+ * We don't count rewrites as progress.
+ *
+ * Further, if a page was read with eviction disabled, we don't count
+ * evicting it as progress. Since disabling eviction allows pages to
+ * be read even when the cache is full, we want to avoid workloads
+ * repeatedly reading a page with eviction disabled (e.g., from the
+ * metadata), then evicting that page and deciding that is a sign that
+ * eviction is unstuck.
*/
- if (rewrite)
- (void)__wt_atomic_sub64(&cache->pages_inmem, 1);
- else
- (void)__wt_atomic_addv64(&cache->pages_evict, 1);
+ if (!rewrite && !F_ISSET_ATOMIC(page, WT_PAGE_READ_NO_EVICT))
+ (void)__wt_atomic_addv64(&cache->eviction_progress, 1);
}
/*
diff --git a/src/include/cache.h b/src/include/cache.h
index 9d7489732aa..456cb0382e4 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -64,8 +64,7 @@ struct __wt_cache {
uint64_t bytes_dirty_leaf;
uint64_t pages_dirty_leaf;
uint64_t bytes_evict; /* Bytes/pages discarded by eviction */
- volatile uint64_t pages_evict;
- uint64_t pages_evicted; /* Pages evicted during a pass */
+ uint64_t pages_evicted;
uint64_t bytes_image; /* Bytes of disk images */
uint64_t bytes_inmem; /* Bytes/pages in memory */
uint64_t pages_inmem;
@@ -73,6 +72,9 @@ struct __wt_cache {
uint64_t bytes_read; /* Bytes read into memory */
uint64_t bytes_written;
+ volatile uint64_t eviction_progress; /* Eviction progress count */
+ uint64_t last_eviction_progress;/* Tracked eviction progress */
+
uint64_t app_waits; /* User threads waited for cache */
uint64_t app_evicts; /* Pages evicted by user threads */
uint64_t server_evicts; /* Pages evicted by server thread */
@@ -111,6 +113,18 @@ struct __wt_cache {
u_int overhead_pct; /* Cache percent adjustment */
/*
+ * Eviction thread tuning information.
+ */
+ uint32_t evict_tune_datapts_needed; /* Data needed to tune */
+ struct timespec evict_tune_last_action_time;/* Time of last action */
+ struct timespec evict_tune_last_time; /* Time of last check */
+ uint32_t evict_tune_num_points; /* Number of values tried */
+ uint64_t evict_tune_progress_last; /* Progress counter */
+ uint64_t evict_tune_progress_rate_max; /* Max progress rate */
+ bool evict_tune_stable; /* Are we stable? */
+ uint32_t evict_tune_workers_best; /* Best performing value */
+
+ /*
* Pass interrupt counter.
*/
volatile uint32_t pass_intr; /* Interrupt eviction pass. */
diff --git a/src/include/cache.i b/src/include/cache.i
index d51e58e471b..33a9e867b63 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -128,7 +128,7 @@ __wt_page_evict_soon(WT_SESSION_IMPL *session, WT_REF *ref)
static inline uint64_t
__wt_cache_pages_inuse(WT_CACHE *cache)
{
- return (cache->pages_inmem - cache->pages_evict);
+ return (cache->pages_inmem - cache->pages_evicted);
}
/*
diff --git a/src/include/connection.h b/src/include/connection.h
index 3f890a50d2b..2fa440e4e08 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -289,15 +289,6 @@ struct __wt_connection_impl {
uint32_t evict_threads_max;/* Max eviction threads */
uint32_t evict_threads_min;/* Min eviction threads */
- uint32_t evict_tune_datapts_needed;/* Data needed to tune */
- struct timespec evict_tune_last_action_time;/* Time of last action */
- struct timespec evict_tune_last_time; /* Time of last check */
- uint32_t evict_tune_num_points; /* Number of values tried */
- uint64_t evict_tune_pgs_last; /* Number of pages evicted */
- uint64_t evict_tune_pg_sec_max; /* Max throughput encountered */
- bool evict_tune_stable; /* Are we stable? */
- uint32_t evict_tune_workers_best;/* Best performing value */
-
#define WT_STATLOG_FILENAME "WiredTigerStat.%d.%H"
WT_SESSION_IMPL *stat_session; /* Statistics log session */
wt_thread_t stat_tid; /* Statistics log thread */
diff --git a/src/include/serial.i b/src/include/serial.i
index 5d2c9a22058..c5758ee605a 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -159,12 +159,12 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_DECL_RET;
WT_INSERT *new_ins = *new_insp;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/* Clear references to memory we now own and must free on error. */
*new_insp = NULL;
+ /* Check for page write generation wrap. */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
/*
* Acquire the page's spinlock unless we already have exclusive access.
* Then call the worker function.
@@ -210,12 +210,12 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
u_int i;
bool simple;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/* Clear references to memory we now own and must free on error. */
*new_insp = NULL;
+ /* Check for page write generation wrap. */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
simple = true;
for (i = 0; i < skipdepth; i++)
if (new_ins->next[i] == NULL)
@@ -265,12 +265,12 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_UPDATE *obsolete, *upd = *updp;
uint64_t txn;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/* Clear references to memory we now own and must free on error. */
*updp = NULL;
+ /* Check for page write generation wrap. */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
/*
* All structure setup must be flushed before the structure is entered
* into the list. We need a write barrier here, our callers depend on
diff --git a/src/include/txn.i b/src/include/txn.i
index 26dcd01fe5e..b0b71dbb3d0 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -152,9 +152,10 @@ __txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
{
WT_TXN *txn;
- txn = &session->txn;
*opp = NULL;
+ txn = &session->txn;
+
/*
* We're about to perform an update.
* Make sure we have allocated a transaction ID.
diff --git a/src/log/log.c b/src/log/log.c
index 89fe64c6f18..0b01b61ced3 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -345,14 +345,15 @@ __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp)
uint64_t dummy_txnid;
uint32_t dummy_fileid, dummy_optype, rectype;
- conn = S2C(session);
- log = conn->log;
-
/*
* Default is to run recovery always (regardless of whether this
* connection has logging enabled).
*/
*recp = true;
+
+ conn = S2C(session);
+ log = conn->log;
+
if (log == NULL)
return (0);
@@ -430,11 +431,11 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session,
*filesp = NULL;
*countp = 0;
+ *maxid = 0;
id = 0;
log = S2C(session)->log;
- *maxid = 0;
/*
* These may be files needed by backup. Force the current slot
* to get written to the file.
@@ -1659,10 +1660,11 @@ __log_has_hole(WT_SESSION_IMPL *session,
size_t bufsz, rdlen;
char *buf, *zerobuf;
+ *hole = false;
+
conn = S2C(session);
log = conn->log;
remainder = log_size - offset;
- *hole = false;
/*
* It can be very slow looking for the last real record in the log
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index f8b355dcde3..61dfb82083d 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -104,11 +104,11 @@ __log_slot_close(
int count;
#endif
+ *releasep = false;
+
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
- WT_ASSERT(session, releasep != NULL);
conn = S2C(session);
log = conn->log;
- *releasep = 0;
if (slot == NULL)
return (WT_NOTFOUND);
retry:
@@ -149,7 +149,7 @@ retry:
*/
WT_STAT_CONN_INCR(session, log_slot_closes);
if (WT_LOG_SLOT_DONE(new_state))
- *releasep = 1;
+ *releasep = true;
slot->slot_end_lsn = slot->slot_start_lsn;
/*
* A thread setting the unbuffered flag sets the unbuffered size after
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index 5fb397e3db9..62f2c7795be 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -593,9 +593,9 @@ __wt_lsm_manager_pop_entry(
WT_LSM_MANAGER *manager;
WT_LSM_WORK_UNIT *entry;
+ *entryp = entry = NULL;
+
manager = &S2C(session)->lsm_manager;
- *entryp = NULL;
- entry = NULL;
/*
* Pop the entry off the correct queue based on our work type.
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index 05e5fe5b07e..f6aea02e20d 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -77,6 +77,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
uint32_t i;
*chunkp = NULL;
+
chunk = evict_chunk = flush_chunk = NULL;
WT_ASSERT(session, lsm_tree->queue_ref > 0);
@@ -130,7 +131,6 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
}
err: __wt_lsm_tree_readunlock(session, lsm_tree);
-
*chunkp = chunk;
return (ret);
}
@@ -168,8 +168,8 @@ __wt_lsm_work_switch(
/* We've become responsible for freeing the work unit. */
entry = *entryp;
- *ran = false;
*entryp = NULL;
+ *ran = false;
if (entry->lsm_tree->need_switch) {
WT_WITH_SCHEMA_LOCK(session,
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index 21838d13e79..8e89cf39099 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -54,6 +54,7 @@ __lsm_worker_general_op(
bool force;
*completed = false;
+
/*
* Return if this thread cannot process a bloom, drop or flush.
*/
diff --git a/src/os_posix/os_dir.c b/src/os_posix/os_dir.c
index 8f77aba5f96..205ca389f43 100644
--- a/src/os_posix/os_dir.c
+++ b/src/os_posix/os_dir.c
@@ -28,11 +28,10 @@ __wt_posix_directory_list(WT_FILE_SYSTEM *file_system,
int tret;
char **entries;
- session = (WT_SESSION_IMPL *)wt_session;
-
*dirlistp = NULL;
*countp = 0;
+ session = (WT_SESSION_IMPL *)wt_session;
dirp = NULL;
dirallocsz = 0;
entries = NULL;