/*-
 * Copyright (c) 2014-2017 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Forward declarations for functions that are local to this file.
 */
static int  __evict_clear_all_walks(WT_SESSION_IMPL *);
static int  WT_CDECL __evict_lru_cmp(const void *, const void *);
static int  __evict_lru_pages(WT_SESSION_IMPL *, bool);
static int  __evict_lru_walk(WT_SESSION_IMPL *);
static int  __evict_page(WT_SESSION_IMPL *, bool);
static int  __evict_pass(WT_SESSION_IMPL *);
static int  __evict_server(WT_SESSION_IMPL *, bool *);
static void __evict_tune_workers(WT_SESSION_IMPL *session);
static int  __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *);
static int  __evict_walk_file(
    WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *);

/*
 * WT_EVICT_HAS_WORKERS --
 *	True when the connection's eviction thread group currently has more
 *	than one thread.
 */
#define	WT_EVICT_HAS_WORKERS(s)				\
	(S2C(s)->evict_threads.current_threads > 1)

/*
 * __evict_lock_handle_list --
 *	Acquire the data handle list lock for reading, retrying for as long as
 *	the lock is busy and no eviction pass interrupt has been requested.
 *
 *	Returns the result of the last lock attempt: EBUSY if we gave up
 *	because of an interrupt, otherwise whatever the try-lock returned.
 */
static int
__evict_lock_handle_list(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_RWLOCK *dh_lock;
	u_int attempts;

	conn = S2C(session);
	cache = conn->cache;
	dh_lock = &conn->dhandle_lock;

	/*
	 * Hand-rolled back off rather than a blocking lock call: the interrupt
	 * counter is checked after every failed attempt so the eviction server
	 * reacts to it quickly.
	 */
	attempts = 0;
	for (;;) {
		ret = __wt_try_readlock(session, dh_lock);
		if (ret != EBUSY || cache->pass_intr != 0)
			break;

		/* Yield for the first thousand attempts, then sleep. */
		if (attempts < WT_THOUSAND)
			__wt_yield();
		else
			__wt_sleep(0, WT_THOUSAND);
		++attempts;
	}
	return (ret);
}

/*
 * __evict_entry_priority --
 *	Return the adjusted read generation for the page referenced by an
 *	eviction entry.
 */
static inline uint64_t
__evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_PAGE *page;
	uint64_t priority;

	btree = S2BT(session);
	page = ref->page;

	/*
	 * Some pages always get the oldest read generation, checked in this
	 * order:
	 *  - pages already marked to be evicted soon;
	 *  - pages from a dead tree;
	 *  - empty pages (leaf or internal);
	 *  - pages whose in-memory footprint exceeds the tree's split size.
	 */
	if (WT_READGEN_EVICT_SOON(page->read_gen) ||
	    F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) ||
	    __wt_page_is_empty(page) ||
	    page->memory_footprint > btree->splitmempage)
		return (WT_READGEN_OLDEST);

	/*
	 * Pick the base value.  A modified page is ranked by its update
	 * transaction only when the cache wants dirty eviction and does not
	 * want clean eviction; every other page is ranked by read generation.
	 */
	cache = S2C(session)->cache;
	if (page->modify == NULL ||
	    !F_ISSET(cache, WT_CACHE_EVICT_DIRTY) ||
	    F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
		priority = page->read_gen;
	else
		priority = page->modify->update_txn;

	/* Skew the base value by the tree's eviction priority. */
	priority += btree->evict_priority;

	/* Internal pages get a further bump: we prefer to evict leaf pages. */
#define	WT_EVICT_INTL_SKEW 1000
	if (WT_PAGE_IS_INTERNAL(page))
		priority += WT_EVICT_INTL_SKEW;

	return (priority);
}

/*
 * __evict_lru_cmp --
 *	Qsort comparison function for the eviction array: entries are ordered
 *	by ascending score, and entries without a page reference compare as if
 *	they had the largest possible score.
 */
static int WT_CDECL
__evict_lru_cmp(const void *a_arg, const void *b_arg)
{
	const WT_EVICT_ENTRY *lhs, *rhs;
	uint64_t lhs_key, rhs_key;

	lhs = a_arg;
	rhs = b_arg;

	lhs_key = UINT64_MAX;
	if (lhs->ref != NULL)
		lhs_key = lhs->score;

	rhs_key = UINT64_MAX;
	if (rhs->ref != NULL)
		rhs_key = rhs->score;

	if (lhs_key < rhs_key)
		return (-1);
	if (lhs_key == rhs_key)
		return (0);
	return (1);
}

/*
 * __evict_list_clear --
 *	Reset a slot in the LRU eviction list.  If the slot references a page,
 *	the page's WT_PAGE_EVICT_LRU flag is cleared first.
 */
static inline void
__evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
{
	WT_REF *queued;

	queued = e->ref;
	if (queued != NULL) {
		/* A queued page must carry the LRU flag. */
		WT_ASSERT(session,
		    F_ISSET_ATOMIC(queued->page, WT_PAGE_EVICT_LRU));
		F_CLR_ATOMIC(queued->page, WT_PAGE_EVICT_LRU);
	}

	/* Mark the slot unused. */
	e->ref = NULL;
	e->btree = WT_DEBUG_POINT;
}

/*
 * __wt_evict_list_clear_page --
 *	Make sure a page is not in the LRU eviction list.  This is called from
 *	the page eviction code to make sure there is no attempt to evict a
 *	child page multiple times.
 */
void
__wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *slot;
	WT_EVICT_QUEUE *queue;
	uint32_t nslots, qi, si;
	bool found;

	WT_ASSERT(session,
	    __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);

	/* Nothing to do unless the page is flagged as being on a queue. */
	if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU))
		return;

	cache = S2C(session)->cache;
	__wt_spin_lock(session, &cache->evict_queue_lock);

	/*
	 * Search each queue in turn, holding that queue's lock while scanning
	 * it, and stop at the first slot referencing the page.
	 */
	found = false;
	for (qi = 0; !found && qi < WT_EVICT_QUEUE_MAX; qi++) {
		queue = &cache->evict_queues[qi];

		__wt_spin_lock(session, &queue->evict_lock);
		nslots = queue->evict_max;
		for (si = 0; si < nslots; si++) {
			slot = &queue->evict_queue[si];
			if (slot->ref != ref)
				continue;

			__evict_list_clear(session, slot);
			found = true;
			break;
		}
		__wt_spin_unlock(session, &queue->evict_lock);
	}
	WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));

	__wt_spin_unlock(session, &cache->evict_queue_lock);
}

/*
 * __evict_queue_empty --
 *	Report whether a queue has no more candidates to hand out.
 *
 *	When server_check is set the test is pessimistic: only half of the
 *	candidates are counted (when there is more than one), so a half
 *	consumed queue is already reported as empty.
 */
static inline bool
__evict_queue_empty(WT_EVICT_QUEUE *queue, bool server_check)
{
	uint32_t consumed, limit;

	/* A queue without a current position has nothing to offer. */
	if (queue->evict_current == NULL)
		return (true);

	limit = queue->evict_candidates;
	if (server_check && limit > 1)
		limit /= 2;

	/* Entries before the current position have already been taken. */
	consumed = (uint32_t)(queue->evict_current - queue->evict_queue);
	if (consumed >= limit)
		return (true);
	return (false);
}

/*
 * __evict_queue_full --
 *	Report whether a queue is full: it holds candidates and the current
 *	position is still at the start of the queue, that is, none of the
 *	candidates have been taken yet.
 */
static inline bool
__evict_queue_full(WT_EVICT_QUEUE *queue)
{
	/* Anything already taken means the queue is no longer full. */
	if (queue->evict_current != queue->evict_queue)
		return (false);

	/* An unpopulated queue isn't full either. */
	return (queue->evict_candidates != 0);
}

/*
 * __wt_evict_server_wake --
 *	Wake the eviction server thread by signalling the cache's eviction
 *	condition variable.  In verbose builds, optionally log how the bytes
 *	in use compare with the configured cache size.
 */
void
__wt_evict_server_wake(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);
	cache = conn->cache;

#ifdef HAVE_VERBOSE
	if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) {
		const char *relation;
		uint64_t inuse, limit;

		inuse = __wt_cache_bytes_inuse(cache);
		limit = conn->cache_size;

		/* The same comparison is printed twice in the message. */
		relation = inuse <= limit ? "<=" : ">";
		__wt_verbose(session, WT_VERB_EVICTSERVER,
		    "waking, bytes inuse %s max (%" PRIu64
		    "MB %s %" PRIu64 "MB)",
		    relation, inuse / WT_MEGABYTE,
		    relation, limit / WT_MEGABYTE);
	}
#endif

	__wt_cond_signal(session, cache->evict_cond);
}

/*
 * __wt_evict_thread_chk --
 *	Tell the caller whether an eviction thread should keep running: true
 *	for as long as the connection has WT_CONN_EVICTION_RUN set.
 */
bool
__wt_evict_thread_chk(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);
	return (F_ISSET(conn, WT_CONN_EVICTION_RUN));
}

/*
 * __wt_evict_thread_run --
 *	Entry function for an eviction thread.  This is called repeatedly
 *	from the thread group code so it does not need to loop itself.
 *
 *	Each call does one of two things.  If the eviction server is marked
 *	as running and this thread wins the try-lock on the pass lock, it
 *	runs one server pass (__evict_server) and then either spins until
 *	pending interrupts are processed or waits on the eviction condition.
 *	Otherwise the thread acts as a worker and evicts queued pages
 *	(__evict_lru_pages).
 *
 *	Returns 0 on success.  On error, control jumps to the err label,
 *	which calls WT_PANIC_MSG, and the error is returned.
 */
int
__wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	bool did_work, was_intr;

	conn = S2C(session);
	cache = conn->cache;

	/*
	 * The thread group code calls us repeatedly.  So each call is one pass
	 * through eviction.
	 */
	if (conn->evict_server_running &&
	    __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) {
		/*
		 * Cannot use WT_WITH_PASS_LOCK because this is a try lock.
		 * Fix when that is supported.  We set the flag on both sessions
		 * because we may call clear_walk when we are walking with
		 * the walk session, locked.
		 */
		F_SET(session, WT_SESSION_LOCKED_PASS);
		F_SET(cache->walk_session, WT_SESSION_LOCKED_PASS);
		ret = __evict_server(session, &did_work);
		F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS);
		F_CLR(session, WT_SESSION_LOCKED_PASS);
		/* Sample the interrupt state before giving up the pass lock. */
		was_intr = cache->pass_intr != 0;
		__wt_spin_unlock(session, &cache->evict_pass_lock);
		/* The error check is deferred until the lock is released. */
		WT_ERR(ret);

		/*
		 * If the eviction server was interrupted, wait until requests
		 * have been processed: the system may otherwise be busy so
		 * don't go to sleep.
		 */
		if (was_intr)
			while (cache->pass_intr != 0 &&
			    F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
			    F_ISSET(thread, WT_THREAD_RUN))
				__wt_yield();
		else {
			__wt_verbose(session,
			    WT_VERB_EVICTSERVER, "%s", "sleeping");

			/* Don't rely on signals: check periodically. */
			__wt_cond_auto_wait(session,
			    cache->evict_cond, did_work, NULL);
			__wt_verbose(session,
			    WT_VERB_EVICTSERVER, "%s", "waking");
		}
	} else
		WT_ERR(__evict_lru_pages(session, false));

	/* The err label is only reachable by a jump from WT_ERR. */
	if (0) {
err:		WT_PANIC_MSG(session, ret, "cache eviction thread error");
	}
	return (ret);
}

/*
 * __wt_evict_thread_stop --
 *	Shutdown function for an eviction thread.
 *
 *	Only the thread with ID 0 does any work here: it clears all eviction
 *	walk points while holding the pass lock.  Every other thread returns
 *	0 immediately.  On error, control jumps to the err label, which calls
 *	WT_PANIC_MSG, and the error is returned.
 */
int
__wt_evict_thread_stop(WT_SESSION_IMPL *session, WT_THREAD *thread)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	if (thread->id != 0)
		return (0);

	conn = S2C(session);
	/*
	 * NOTE(review): "cache" is not referenced directly below; presumably
	 * WT_WITH_PASS_LOCK expands to use it -- confirm before removing.
	 */
	cache = conn->cache;
	/*
	 * The only time the first eviction thread is stopped is on shutdown:
	 * in case any trees are still open, clear all walks now so that they
	 * can be closed.
	 */
	WT_WITH_PASS_LOCK(session, ret = __evict_clear_all_walks(session));
	WT_ERR(ret);
	/*
	 * The only two cases when the eviction server is expected to
	 * stop are when recovery is finished or when the connection is
	 * closing.
	 */
	WT_ASSERT(session, F_ISSET(conn, WT_CONN_CLOSING | WT_CONN_RECOVERING));

	__wt_verbose(session,
	    WT_VERB_EVICTSERVER, "%s", "cache eviction thread exiting");

	/* The err label is only reachable by a jump from WT_ERR. */
	if (0) {
err:		WT_PANIC_MSG(session, ret, "cache eviction thread error");
	}
	return (ret);
}

/*
 * __evict_server --
 *	Thread to evict pages from the cache.
 *
 *	Runs one eviction pass and then decides what state to leave behind:
 *	 - if eviction is shutting down or the pass was interrupted, return;
 *	 - if the cache isn't stuck, clear all walk points (under the handle
 *	   list read lock) and reset the progress marker;
 *	 - if the cache is stuck, report through did_work whether eviction
 *	   progress changed since the previous call and, in diagnostic or
 *	   verbose builds, track how long the cache has been stuck.
 *
 *	did_work is always set: it starts out false and is only updated in
 *	the stuck case.  Returns 0 on success or an error code; in diagnostic
 *	builds ETIMEDOUT is returned (possibly combined with dump errors)
 *	when the cache has been stuck for more than 300 seconds.
 */
static int
__evict_server(WT_SESSION_IMPL *session, bool *did_work)
{
#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
	struct timespec now;
#endif
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	/* Assume there has been no progress. */
	*did_work = false;

	conn = S2C(session);
	cache = conn->cache;

	/* Evict pages from the cache as needed. */
	WT_RET(__evict_pass(session));

	if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) || cache->pass_intr != 0)
		return (0);

	if (!__wt_cache_stuck(session)) {
		/*
		 * Try to get the handle list lock: if we give up, that
		 * indicates a session is waiting for us to clear walks.  Do
		 * that as part of a normal pass (without the handle list
		 * lock) to avoid deadlock.
		 */
		if ((ret = __evict_lock_handle_list(session)) == EBUSY)
			return (0);
		WT_RET(ret);

		/*
		 * Clear the walks so we don't pin pages while asleep,
		 * otherwise we can block applications evicting large pages.
		 */
		ret = __evict_clear_all_walks(session);

		/* Release the lock before acting on any error. */
		__wt_readunlock(session, &conn->dhandle_lock);
		WT_RET(ret);

		/* Make sure we'll notice next time we're stuck. */
		cache->last_eviction_progress = 0;
		return (0);
	}

	/* Track if work was done. */
	*did_work = cache->eviction_progress != cache->last_eviction_progress;
	cache->last_eviction_progress = cache->eviction_progress;

	/* Eviction is stuck, check if we have made progress. */
	if (*did_work) {
#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
		/* Progress was made: restart the stuck timer. */
		__wt_epoch(session, &cache->stuck_time);
#endif
		return (0);
	}

#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
	/*
	 * If we're stuck for 5 minutes in diagnostic mode, or the verbose
	 * evict_stuck flag is configured, log the cache and transaction state.
	 *
	 * If we're stuck for 5 minutes in diagnostic mode, give up.
	 *
	 * We don't do this check for in-memory workloads because application
	 * threads are not blocked by the cache being full. If the cache becomes
	 * full of clean pages, we can be servicing reads while the cache
	 * appears stuck to eviction.
	 */
	if (F_ISSET(conn, WT_CONN_IN_MEMORY))
		return (0);

	__wt_epoch(session, &now);
	if (WT_TIMEDIFF_SEC(now, cache->stuck_time) > 300) {
#if defined(HAVE_DIAGNOSTIC)
		__wt_err(session, ETIMEDOUT,
		    "Cache stuck for too long, giving up");
		ret = ETIMEDOUT;
		WT_TRET(__wt_verbose_dump_txn(session));
		WT_TRET(__wt_verbose_dump_cache(session));
		return (ret);
#elif defined(HAVE_VERBOSE)
		if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
			WT_RET(__wt_verbose_dump_txn(session));
			WT_RET(__wt_verbose_dump_cache(session));

			/* Reset the timer. */
			__wt_epoch(session, &cache->stuck_time);
		}
#endif
	}
#endif
	return (0);
}

/*
 * __wt_evict_create --
 *	Start the eviction server.
 *
 *	Sets the connection's WT_CONN_EVICTION_RUN flag, creates the eviction
 *	thread group sized by the connection's evict_threads_min and
 *	evict_threads_max settings, and finally marks the eviction server as
 *	running.  Returns 0 on success or the error from thread group
 *	creation; in the error case WT_CONN_EVICTION_RUN is left set and
 *	evict_server_running is not set.
 */
int
__wt_evict_create(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	uint32_t session_flags;

	conn = S2C(session);

	WT_ASSERT(session, conn->evict_threads_min > 0);
	/* Set first, the thread might run before we finish up. */
	F_SET(conn, WT_CONN_EVICTION_RUN);

	/*
	 * Create the eviction thread group.
	 * Set the group size to the maximum allowed sessions.
	 */
	session_flags = WT_THREAD_CAN_WAIT |
	    WT_THREAD_LOOKASIDE | WT_THREAD_PANIC_FAIL;
	WT_RET(__wt_thread_group_create(session, &conn->evict_threads,
	    "eviction-server", conn->evict_threads_min, conn->evict_threads_max,
	    session_flags, __wt_evict_thread_chk, __wt_evict_thread_run,
	    __wt_evict_thread_stop));

#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
	/*
	 * Ensure the cache stuck timer is initialized when starting eviction.
	 */
	__wt_epoch(session, &conn->cache->stuck_time);
#endif

	/*
	 * Allow queues to be populated now that the eviction threads
	 * are running.
	 */
	conn->evict_server_running = true;

	return (0);
}

/*
 * __wt_evict_destroy --
 *	Shut down and destroy the eviction threads.  Does nothing, and
 *	returns 0, if the eviction server was never marked as running;
 *	otherwise returns the result of destroying the thread group.
 */
int
__wt_evict_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	if (conn->evict_server_running) {
		/* Let any in-flight thread group changes settle first. */
		__wt_writelock(session, &conn->evict_threads.lock);

		/*
		 * Tell the threads to finish and stop the queues from being
		 * populated, then wake the server so it notices.
		 */
		F_CLR(conn, WT_CONN_EVICTION_RUN);
		conn->evict_server_running = false;
		__wt_evict_server_wake(session);

		__wt_verbose(session,
		    WT_VERB_EVICTSERVER, "%s", "waiting for helper threads");

		/*
		 * The write lock is deliberately still held: the destroy
		 * function assumes it is called locked.
		 */
		return (
		    __wt_thread_group_destroy(session, &conn->evict_threads));
	}

	/* The eviction server didn't start successfully: nothing to undo. */
	return (0);
}

/*
 * __evict_update_work --
 *	Configure eviction work state.
 *
 *	Recomputes the cache's flags from scratch based on current usage:
 *	the flags are zeroed on entry and then WT_CACHE_EVICT_* bits are set
 *	as the checks below dictate.  Returns true if any of the
 *	WT_CACHE_EVICT_ALL or WT_CACHE_EVICT_URGENT bits ended up set, and
 *	false otherwise, including when eviction is not running.
 */
static bool
__evict_update_work(WT_SESSION_IMPL *session)
{
	WT_BTREE *las_tree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	uint64_t bytes_inuse, bytes_max, dirty_inuse;

	conn = S2C(session);
	cache = conn->cache;

	/* Clear previous state. */
	cache->flags = 0;

	if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
		return (false);

	/* Anything waiting in the urgent queue is work to do. */
	if (!__evict_queue_empty(cache->evict_urgent_queue, false))
		F_SET(cache, WT_CACHE_EVICT_URGENT);

	/* Refresh the lookaside byte count from the lookaside tree. */
	if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN)) {
		WT_ASSERT(session,
		    F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR));

		las_tree = ((WT_CURSOR_BTREE *)session->las_cursor)->btree;
		cache->bytes_lookaside = las_tree->bytes_inmem;
	}

	/*
	 * If we need space in the cache, try to find clean pages to evict.
	 *
	 * Avoid division by zero if the cache size has not yet been set in a
	 * shared cache.
	 */
	bytes_max = conn->cache_size + 1;
	bytes_inuse = __wt_cache_bytes_inuse(cache);
	if (__wt_eviction_clean_needed(session, NULL))
		F_SET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
	else if (bytes_inuse > (cache->eviction_target * bytes_max) / 100)
		F_SET(cache, WT_CACHE_EVICT_CLEAN);

	/* The same two-level check, applied to dirty leaf bytes. */
	dirty_inuse = __wt_cache_dirty_leaf_inuse(cache);
	if (__wt_eviction_dirty_needed(session, NULL))
		F_SET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD);
	else if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100)
		F_SET(cache, WT_CACHE_EVICT_DIRTY);

	/*
	 * If application threads are blocked by the total volume of data in
	 * cache, try dirty pages as well.
	 */
	if (__wt_cache_aggressive(session) &&
	    F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD))
		F_SET(cache, WT_CACHE_EVICT_DIRTY);

	/*
	 * Scrub dirty pages and keep them in cache if we are less than half
	 * way to the clean or dirty trigger.
	 *
	 * The "half way" points are the averages of target and trigger,
	 * computed as (target + trigger) * bytes_max / 200.  Both the total
	 * and the dirty usage must be below their half way points.
	 */
	if (bytes_inuse < ((cache->eviction_target + cache->eviction_trigger) *
	    bytes_max) / 200 && dirty_inuse < (uint64_t)
	    ((cache->eviction_dirty_target + cache->eviction_dirty_trigger) *
	    bytes_max) / 200)
		F_SET(cache, WT_CACHE_EVICT_SCRUB);

	/*
	 * Try lookaside eviction when lookaside eviction has not been disabled
	 * on the connection and either:
	 * (1) the cache is stuck; or
	 * (2) the lookaside score is over 80 and the dirty bytes in use are
	 *     more than half way from the dirty target to the dirty trigger.
	 */
	if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) &&
	    (__wt_cache_stuck(session) ||
	    (__wt_cache_lookaside_score(cache) > 80 &&
	    dirty_inuse > (uint64_t)
	    ((cache->eviction_dirty_target + cache->eviction_dirty_trigger) *
	    bytes_max) / 200)))
		F_SET(cache, WT_CACHE_EVICT_LOOKASIDE);

	/*
	 * With an in-memory cache, we only do dirty eviction in order to scrub
	 * pages.
	 *
	 * Any clean eviction request is converted into the matching dirty
	 * request and the clean flags are then dropped.
	 */
	if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
		if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
			F_SET(cache, WT_CACHE_EVICT_DIRTY);
		if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD))
			F_SET(cache, WT_CACHE_EVICT_DIRTY_HARD);
		F_CLR(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
	}

	return (F_ISSET(cache, WT_CACHE_EVICT_ALL | WT_CACHE_EVICT_URGENT));
}

/*
 * __evict_pass --
 *	Evict pages from memory.
 *
 *	Loops until one of the following happens: an interrupt is requested
 *	(cache->pass_intr becomes non-zero), __evict_update_work reports
 *	there is nothing to do, or the pass gives up because no progress is
 *	being made.  Along the way the function tunes the number of workers,
 *	advances the read generation and the pass generation, updates the
 *	oldest transaction ID, and adjusts the cache's aggressive score.
 *
 *	Returns 0, or an error from one of the helpers called through WT_RET.
 */
static int
__evict_pass(WT_SESSION_IMPL *session)
{
	struct timespec now, prev;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_TXN_GLOBAL *txn_global;
	uint64_t eviction_progress, oldest_id, prev_oldest_id;
	u_int loop;

	conn = S2C(session);
	cache = conn->cache;
	txn_global = &conn->txn_global;

	/* Track whether pages are being evicted and progress is made. */
	eviction_progress = cache->eviction_progress;
	prev_oldest_id = txn_global->oldest_id;
	WT_CLEAR(prev);

	/* Evict pages from the cache. */
	for (loop = 0; cache->pass_intr == 0; loop++) {
		/* "prev" is the time of the last progress check. */
		__wt_epoch(session, &now);
		if (loop == 0)
			prev = now;

		__evict_tune_workers(session);
		/*
		 * Increment the shared read generation. Do this occasionally
		 * even if eviction is not currently required, so that pages
		 * have some relative read generation when the eviction server
		 * does need to do some work.
		 */
		__wt_cache_read_gen_incr(session);
		++cache->evict_pass_gen;

		/*
		 * Update the oldest ID: we use it to decide whether pages are
		 * candidates for eviction.  Without this, if all threads are
		 * blocked after a long-running transaction (such as a
		 * checkpoint) completes, we may never start evicting again.
		 *
		 * Do this every time the eviction server wakes up, regardless
		 * of whether the cache is full, to prevent the oldest ID
		 * falling too far behind.  Don't wait to lock the table: with
		 * highly threaded workloads, that creates a bottleneck.
		 */
		WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT));

		/* Recompute the work flags; stop if there is nothing to do. */
		if (!__evict_update_work(session))
			break;

		__wt_verbose(session, WT_VERB_EVICTSERVER,
		    "Eviction pass with: Max: %" PRIu64
		    " In use: %" PRIu64 " Dirty: %" PRIu64,
		    conn->cache_size, cache->bytes_inmem,
		    cache->bytes_dirty_intl + cache->bytes_dirty_leaf);

		if (F_ISSET(cache, WT_CACHE_EVICT_ALL))
			WT_RET(__evict_lru_walk(session));

		/*
		 * If the queue has been empty recently, keep queuing more
		 * pages to evict.  If the rate of queuing pages is high
		 * enough, this score will go to zero, in which case the
		 * eviction server might as well help out with eviction.
		 *
		 * Also, if there is a single eviction server thread with no
		 * workers, it must service the urgent queue in case all
		 * application threads are busy.
		 */
		if (!WT_EVICT_HAS_WORKERS(session) &&
		    (cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF ||
		    !__evict_queue_empty(cache->evict_urgent_queue, false)))
			WT_RET(__evict_lru_pages(session, true));

		if (cache->pass_intr != 0)
			break;

		/*
		 * If we're making progress, keep going; if we're not making
		 * any progress at all, mark the cache "stuck" and go back to
		 * sleep, it's not something we can fix.
		 *
		 * We check for progress every 20ms, the idea being that the
		 * aggressive score will reach 10 after 200ms if we aren't
		 * making progress and eviction will start considering more
		 * pages.  If there is still no progress after 2s, we will
		 * treat the cache as stuck and start rolling back
		 * transactions and writing updates to the lookaside table.
		 */
		if (eviction_progress == cache->eviction_progress) {
			if (WT_TIMEDIFF_MS(now, prev) >= 20 &&
			    F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD |
			    WT_CACHE_EVICT_DIRTY_HARD)) {
				/* The aggressive score is capped at 100. */
				if (cache->evict_aggressive_score < 100)
					++cache->evict_aggressive_score;
				/*
				 * Bump the score a second time if the oldest
				 * ID has not moved since the last check and
				 * differs from the current transaction ID.
				 */
				oldest_id = txn_global->oldest_id;
				if (prev_oldest_id == oldest_id &&
				    txn_global->current != oldest_id &&
				    cache->evict_aggressive_score < 100)
					++cache->evict_aggressive_score;
				prev = now;
				prev_oldest_id = oldest_id;
			}

			/*
			 * Keep trying for long enough that we should be able
			 * to evict a page if the server isn't interfering.
			 */
			if (loop < 100 || cache->evict_aggressive_score < 100) {
				/*
				 * Back off if we aren't making progress: walks
				 * hold the handle list lock, blocking other
				 * operations that can free space in cache,
				 * such as LSM discarding handles.
				 *
				 * Allow this wait to be interrupted (e.g. if a
				 * checkpoint completes): make sure we wait for
				 * a non-zero number of microseconds).
				 */
				WT_STAT_CONN_INCR(session,
				    cache_eviction_server_slept);
				__wt_cond_wait(session,
				    cache->evict_cond, WT_THOUSAND, NULL);
				continue;
			}

			/* Out of retries and fully aggressive: give up. */
			WT_STAT_CONN_INCR(session, cache_eviction_slow);
			__wt_verbose(session, WT_VERB_EVICTSERVER,
			    "%s", "unable to reach eviction goal");
			break;
		} else {
			/*
			 * Progress was made: relax the aggressive score,
			 * restart the retry counter and remember the new
			 * progress value.
			 */
			if (cache->evict_aggressive_score > 0)
				--cache->evict_aggressive_score;
			loop = 0;
			eviction_progress = cache->eviction_progress;
		}
	}
	return (0);
}

/*
 * __evict_clear_walk --
 *	Clear a single walk point.
 *
 *	Operates on the btree of the session's current data handle: forgets
 *	the handle as the next file to walk if it was recorded as such, and
 *	releases the tree's saved eviction walk reference, if any.  The
 *	caller must hold the pass lock (asserted below).  Returns 0 if there
 *	was no walk point, otherwise the result of releasing the page.
 */
static int
__evict_clear_walk(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_REF *ref;

	btree = S2BT(session);
	cache = S2C(session)->cache;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_PASS));
	if (session->dhandle == cache->evict_file_next)
		cache->evict_file_next = NULL;

	if ((ref = btree->evict_ref) == NULL)
		return (0);

	WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
	WT_STAT_DATA_INCR(session, cache_eviction_walks_abandoned);

	/*
	 * Clear evict_ref before releasing it in case that forces eviction (we
	 * assert that we never try to evict the current eviction walk point).
	 */
	btree->evict_ref = NULL;

	/*
	 * The release is done by the cache's walk session, switched to this
	 * session's data handle for the duration of the call.
	 */
	WT_WITH_DHANDLE(cache->walk_session, session->dhandle,
	    (ret = __wt_page_release(cache->walk_session,
	    ref, WT_READ_NO_EVICT)));
	return (ret);
}

/*
 * __evict_clear_all_walks --
 *	Clear the eviction walk point of every btree handle on the
 *	connection's data handle list.  All handles are visited even if
 *	clearing one of them fails; the accumulated return value is returned.
 */
static int
__evict_clear_all_walks(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;

	conn = S2C(session);

	TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
		/* Only btree handles have eviction walk points. */
		if (dhandle->type != WT_DHANDLE_TYPE_BTREE)
			continue;

		WT_WITH_DHANDLE(session, dhandle,
		    WT_TRET(__evict_clear_walk(session)));
	}
	return (ret);
}

/*
 * __wt_evict_file_exclusive_on --
 *	Get exclusive eviction access to a file and discard any of the file's
 *	blocks queued for eviction.
 *
 *	The request is counted in btree->evict_disabled: only the call that
 *	takes the count from zero to one does the work of clearing the walk
 *	point, emptying the queues of this tree's entries and waiting for
 *	in-progress eviction to drain.  Nested calls just bump the count and
 *	return 0.  On error the count is decremented again and the error is
 *	returned.
 */
int
__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_EVICT_ENTRY *evict;
	u_int i, elem, q;

	btree = S2BT(session);
	cache = S2C(session)->cache;

	/* Hold the walk lock to turn off eviction. */
	__wt_spin_lock(session, &cache->evict_walk_lock);
	if (++btree->evict_disabled > 1) {
		__wt_spin_unlock(session, &cache->evict_walk_lock);
		return (0);
	}

	/*
	 * Ensure no new pages from the file will be queued for eviction after
	 * this point, then clear any existing LRU eviction walk for the file.
	 *
	 * The interrupt counter is raised only while we wait for and hold the
	 * pass lock.
	 */
	(void)__wt_atomic_addv32(&cache->pass_intr, 1);
	WT_WITH_PASS_LOCK(session, ret = __evict_clear_walk(session));
	(void)__wt_atomic_subv32(&cache->pass_intr, 1);
	WT_ERR(ret);

	/*
	 * The eviction candidate list might reference pages from the file,
	 * clear it. Hold the evict lock to remove queued pages from a file.
	 */
	__wt_spin_lock(session, &cache->evict_queue_lock);

	for (q = 0; q < WT_EVICT_QUEUE_MAX; q++) {
		__wt_spin_lock(session, &cache->evict_queues[q].evict_lock);
		elem = cache->evict_queues[q].evict_max;
		for (i = 0, evict = cache->evict_queues[q].evict_queue;
		    i < elem; i++, evict++)
			if (evict->btree == btree)
				__evict_list_clear(session, evict);
		__wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
	}

	__wt_spin_unlock(session, &cache->evict_queue_lock);

	/*
	 * We have disabled further eviction: wait for concurrent LRU eviction
	 * activity to drain.
	 */
	while (btree->evict_busy > 0)
		__wt_yield();

	/*
	 * The err label is only reachable by a jump from WT_ERR: undo our
	 * increment of the disabled count.  Both the success and the error
	 * path then release the walk lock.
	 */
	if (0) {
err:		--btree->evict_disabled;
	}
	__wt_spin_unlock(session, &cache->evict_walk_lock);
	return (ret);
}

/*
 * __wt_evict_file_exclusive_off --
 *	Release exclusive eviction access to a file by atomically
 *	decrementing the btree's evict-disabled count.  Diagnostic builds
 *	additionally assert that the tree has no eviction walk point and that
 *	the count did not go negative.
 */
void
__wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;

	btree = S2BT(session);

	/*
	 * We have seen subtle bugs with multiple threads racing to turn
	 * eviction on/off.  Make races more likely in diagnostic builds.
	 */
	WT_DIAGNOSTIC_YIELD;

	/*
	 * Atomically decrement the evict-disabled count, without acquiring the
	 * eviction walk-lock. We can't acquire that lock here because there's
	 * a potential deadlock. When acquiring exclusive eviction access, we
	 * acquire the eviction walk-lock and then the cache's pass-intr lock.
	 * The current eviction implementation can hold the pass-intr lock and
	 * call into this function (see WT-3303 for the details), which might
	 * deadlock with another thread trying to get exclusive eviction access.
	 */
#if defined(HAVE_DIAGNOSTIC)
	{
	int32_t v;

	WT_ASSERT(session, btree->evict_ref == NULL);
	v = __wt_atomic_subi32(&btree->evict_disabled, 1);
	WT_ASSERT(session, v >= 0);
	}
#else
	(void)__wt_atomic_subi32(&btree->evict_disabled, 1);
#endif
}

#define	EVICT_TUNE_BATCH	1	/* Max workers to add each period */
/*
 * Data points needed before deciding if we should keep adding workers or settle
 * on an earlier value.
 */
#define	EVICT_TUNE_DATAPT_MIN   8
#define	EVICT_TUNE_PERIOD	60	/* Tune period in milliseconds */

/*
 * Once tuning has reached a stable state, force a fresh re-tune after this
 * many milliseconds so we adjust to significant phase changes.
 */
#define	EVICT_FORCE_RETUNE	25000

/*
 * __evict_tune_workers --
 *	Find the right number of eviction workers. Gradually ramp up the
 *	number of workers, increasing the number in batches of
 *	EVICT_TUNE_BATCH.  Store the number of workers that gave us the best
 *	throughput so far and the number of data points we have tried.
 *
 *	Every once in a while, when we have the minimum number of data points,
 *	we check whether the eviction throughput achieved with the current
 *	number of workers is the best we have seen so far. If so, we will keep
 *	increasing the number of workers.  If not, we are past the inflection
 *	point on the eviction throughput curve.  In that case, we will set the
 *	number of workers to the best observed so far and settle into a stable
 *	state.
 *
 *	The function does nothing when the minimum and maximum number of
 *	eviction threads are configured to the same value.  All tuning state
 *	is kept in the cache structure (the evict_tune_* fields).
 */
static void
__evict_tune_workers(WT_SESSION_IMPL *session)
{
	struct timespec current_time;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	uint64_t delta_msec, delta_pages;
	uint64_t eviction_progress, eviction_progress_rate, time_diff;
	int32_t cur_threads, i, target_threads, thread_surplus;

	conn = S2C(session);
	cache = conn->cache;

	/*
	 * If we have a fixed number of eviction threads, there is no value in
	 * calculating if we should do any tuning.
	 */
	if (conn->evict_threads_max == conn->evict_threads_min)
		return;

	/* Milliseconds since this function last recorded a data point. */
	__wt_epoch(session, &current_time);
	time_diff = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time);

	/*
	 * If we have reached the stable state and have not run long enough to
	 * surpass the forced re-tuning threshold, return.
	 */
	if (cache->evict_tune_stable) {
		if (time_diff < EVICT_FORCE_RETUNE)
			return;

		/*
		 * Stable state was reached a long time ago. Let's re-tune.
		 * Reset all the state.
		 */
		cache->evict_tune_stable = false;
		cache->evict_tune_last_action_time.tv_sec = 0;
		cache->evict_tune_progress_last = 0;
		cache->evict_tune_num_points = 0;
		cache->evict_tune_progress_rate_max = 0;

		/* Reduce the number of eviction workers by one */
		thread_surplus =
		    (int32_t)conn->evict_threads.current_threads -
		    (int32_t)conn->evict_threads_min;

		if (thread_surplus > 0) {
			__wt_thread_group_stop_one(
			    session, &conn->evict_threads);
			WT_STAT_CONN_INCR(session,
			    cache_eviction_worker_removed);
		}
		WT_STAT_CONN_INCR(session, cache_eviction_force_retune);
	} else
		if (time_diff < EVICT_TUNE_PERIOD)
			/*
			 * If we have not reached stable state, don't do
			 * anything unless enough time has passed since the last
			 * time we have taken any action in this function.
			 */
			return;

	/*
	 * Measure the evicted progress so far. Eviction rate correlates to
	 * performance, so this is our metric of success.
	 */
	eviction_progress = cache->eviction_progress;

	/*
	 * If we have recorded the number of pages evicted at the end of
	 * the previous measurement interval, we can compute the eviction
	 * rate in evicted pages per second achieved during the current
	 * measurement interval.
	 * Otherwise, we just record the number of evicted pages and return.
	 */
	if (cache->evict_tune_progress_last == 0)
		goto done;

	/*
	 * The early returns above mean at least EVICT_TUNE_PERIOD milliseconds
	 * have passed when we get here, so the divisor is expected to be
	 * non-zero.
	 */
	delta_msec = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time);
	delta_pages = eviction_progress - cache->evict_tune_progress_last;
	eviction_progress_rate = (delta_pages * WT_THOUSAND) / delta_msec;
	cache->evict_tune_num_points++;

	/*
	 * Keep track of the maximum eviction throughput seen and the number
	 * of workers corresponding to that throughput.
	 */
	if (eviction_progress_rate > cache->evict_tune_progress_rate_max) {
		cache->evict_tune_progress_rate_max = eviction_progress_rate;
		cache->evict_tune_workers_best =
		    conn->evict_threads.current_threads;
	}

	/*
	 * Compare the current number of data points with the number
	 * needed variable. If they are equal, we will check whether
	 * we are still going up on the performance curve, in which
	 * case we will increase the number of needed data points, to provide
	 * opportunity for further increasing the number of workers. Or
	 * we are past the inflection point on the curve, in which case
	 * we will go back to the best observed number of workers and
	 * settle into a stable state.
	 */
	if (cache->evict_tune_num_points >= cache->evict_tune_datapts_needed) {
		if (cache->evict_tune_workers_best ==
		    conn->evict_threads.current_threads &&
		    conn->evict_threads.current_threads <
		    conn->evict_threads_max) {
			/*
			 * Keep adding workers. We will check again
			 * at the next check point.
			 */
			cache->evict_tune_datapts_needed += WT_MIN(
			    EVICT_TUNE_DATAPT_MIN,
			    (conn->evict_threads_max -
			    conn->evict_threads.current_threads) /
			    EVICT_TUNE_BATCH);
		} else {
			/*
			 * We are past the inflection point. Choose the
			 * best number of eviction workers observed and
			 * settle into a stable state.
			 */
			thread_surplus =
			    (int32_t)conn->evict_threads.current_threads -
			    (int32_t)cache->evict_tune_workers_best;

			for (i = 0; i < thread_surplus; i++) {
				__wt_thread_group_stop_one(
				    session, &conn->evict_threads);
				WT_STAT_CONN_INCR(session,
				    cache_eviction_worker_removed);
			}
			cache->evict_tune_stable = true;
			goto done;
		}
	}

	/*
	 * If we have not added any worker threads in the past, we set the
	 * number of data points needed equal to the number of data points that
	 * we must accumulate before deciding if we should keep adding workers
	 * or settle on a previously tried stable number of workers.
	 */
	if (cache->evict_tune_last_action_time.tv_sec == 0)
		cache->evict_tune_datapts_needed = EVICT_TUNE_DATAPT_MIN;

	/* Only add workers while the cache has eviction work flagged. */
	if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) {
		cur_threads = (int32_t)conn->evict_threads.current_threads;
		target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH,
		    (int32_t)conn->evict_threads_max);
		/*
		 * Start the new threads.
		 */
		for (i = cur_threads; i < target_threads; ++i) {
			__wt_thread_group_start_one(session,
			    &conn->evict_threads, false);
			WT_STAT_CONN_INCR(session,
			    cache_eviction_worker_created);
			__wt_verbose(session,
			    WT_VERB_EVICTSERVER, "%s", "added worker thread");
		}
		cache->evict_tune_last_action_time = current_time;
	}

	/* Record this data point as the baseline for the next interval. */
done:	cache->evict_tune_last_time = current_time;
	cache->evict_tune_progress_last = eviction_progress;
}

/*
 * __evict_lru_pages --
 *	Evict pages from the LRU queue, one at a time, until eviction is shut
 *	down or evicting a page fails with something other than EBUSY.
 *
 *	WT_NOTFOUND from the page eviction function is not reported to the
 *	caller: the function returns 0 in that case, after a short wait if
 *	the caller is a worker thread (is_server is false) and eviction is
 *	still running.  Any other error is returned unchanged.
 */
static int
__evict_lru_pages(WT_SESSION_IMPL *session, bool is_server)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	conn = S2C(session);

	for (;;) {
		if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
			break;

		/*
		 * Reconcile and discard a page.  EBUSY means the page was
		 * unavailable: ignore it and move on to the next one.
		 */
		ret = __evict_page(session, is_server);
		if (ret == EBUSY)
			ret = 0;
		if (ret != 0)
			break;
	}

	if (ret != WT_NOTFOUND)
		return (ret);

	/* A worker thread that found the queue empty pauses briefly. */
	if (!is_server && F_ISSET(conn, WT_CONN_EVICTION_RUN))
		__wt_cond_wait(
		    session, conn->evict_threads.wait_cond, 10000, NULL);

	return (0);
}

/*
 * __evict_lru_walk --
 *	Refill one of the LRU queues with pages to be evicted from cache, sort
 *	it and decide how many of its entries are eviction candidates.
 */
static int
__evict_lru_walk(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_EVICT_QUEUE *alternate, *fill;
	uint64_t oldest_gen;
	uint32_t nentries, nsoon;

	cache = S2C(session)->cache;

	/* Decay the score tracking how often the queue was recently empty. */
	if (cache->evict_empty_score > 0)
		--cache->evict_empty_score;

	/*
	 * Pick the queue to fill (never the urgent queue) and arrange for the
	 * next walk to fill the other one.
	 */
	fill = cache->evict_fill_queue;
	alternate = cache->evict_queues + (1 - (fill - cache->evict_queues));
	cache->evict_fill_queue = alternate;

	/* Prefer the other queue if the chosen one is already full. */
	if (__evict_queue_full(fill) && !__evict_queue_full(alternate))
		fill = alternate;

	/*
	 * Nothing to do if both queues are full and they haven't been found
	 * empty on recent refills.
	 */
	if (__evict_queue_full(fill) &&
	    cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF)
		return (0);

	/*
	 * Gather more pages to consider for eviction.  EBUSY means an
	 * interrupt was requested: give up without reporting an error.
	 */
	ret = __evict_walk(cache->walk_session, fill);
	if (ret == EBUSY)
		return (0);
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the queue is still empty after the walk, pages are being
	 * requested faster than they are being queued.
	 */
	if (!__evict_queue_empty(fill, false)) {
		WT_STAT_CONN_INCR(session, cache_eviction_queue_not_empty);
	} else {
		if (F_ISSET(cache,
		    WT_CACHE_EVICT_CLEAN_HARD | WT_CACHE_EVICT_DIRTY_HARD))
			cache->evict_empty_score = WT_MIN(
			    cache->evict_empty_score + WT_EVICT_SCORE_BUMP,
			    WT_EVICT_SCORE_MAX);
		WT_STAT_CONN_INCR(session, cache_eviction_queue_empty);
	}

	/* Everything from here to the unlock happens under the queue lock. */
	__wt_spin_lock(session, &fill->evict_lock);

	/*
	 * In the (unusual) case where the queue being filled is also the
	 * current queue, mark it empty so subsequent requests switch to the
	 * other queue.
	 */
	if (fill == cache->evict_current_queue)
		fill->evict_current = NULL;

	/* Sort the queue into LRU order. */
	nentries = fill->evict_entries;
	qsort(fill->evict_queue,
	    nentries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);

	/* Trim trailing empty entries. */
	for (; nentries > 0; --nentries)
		if (fill->evict_queue[nentries - 1].ref != NULL)
			break;

	/*
	 * Clear anything beyond the maximum number of entries tracked between
	 * walks.  This is done before counting candidates so there are never
	 * more candidates than entries.
	 */
	for (; nentries > WT_EVICT_WALK_BASE; --nentries)
		__evict_list_clear(session, &fill->evict_queue[nentries - 1]);

	fill->evict_entries = nentries;

	if (nentries == 0) {
		/*
		 * No entries means no candidates.  Reset the candidate count
		 * and current pointer so application threads don't read past
		 * the end of the candidate list, or they may race with the
		 * next walk.
		 */
		fill->evict_candidates = 0;
		fill->evict_current = NULL;
		__wt_spin_unlock(session, &fill->evict_lock);
		return (0);
	}

	/* Decide how many of the entries we will try to evict. */
	if (__wt_cache_aggressive(session))
		fill->evict_candidates = nentries;
	else {
		/*
		 * The queue is sorted: skip over the leading entries whose
		 * score marks them for eviction soon, stopping at the first
		 * "normal" read generation.  That value is the oldest read
		 * generation in the queue, used to set the initial value for
		 * pages read into the system.
		 */
		oldest_gen = WT_READGEN_START_VALUE;
		nsoon = 0;
		while (nsoon < nentries) {
			oldest_gen = fill->evict_queue[nsoon].score;
			if (!WT_READGEN_EVICT_SOON(oldest_gen))
				break;
			++nsoon;
		}

		if (WT_READGEN_EVICT_SOON(oldest_gen)) {
			/* Every entry is marked evict-soon: take them all. */
			fill->evict_candidates = nentries;
		} else if (nsoon > nentries / 2) {
			/*
			 * Normally no more than 50% of the entries are taken,
			 * but if more than half are marked evict-soon, take
			 * all of those.
			 */
			fill->evict_candidates = nsoon;
		} else {
			/*
			 * Take all of the evict-soon pages plus a third of the
			 * ordinary entries (which could be expressed as
			 * WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE).  In the
			 * steady state, we want to get as many candidates as
			 * the eviction walk adds to the queue.
			 *
			 * The leading "1 +" ensures a lone entry, which is
			 * normal when populating an empty file, isn't
			 * excluded.
			 */
			fill->evict_candidates =
			    1 + nsoon + ((nentries - nsoon) - 1) / 3;
			cache->read_gen_oldest = oldest_gen;
		}
	}

	fill->evict_current = fill->evict_queue;
	__wt_spin_unlock(session, &fill->evict_lock);

	/*
	 * Wake any application or helper threads that may be waiting to help
	 * with eviction.
	 */
	__wt_cond_signal(session, S2C(session)->evict_threads.wait_cond);

	return (0);
}

/*
 * __evict_walk --
 *	Fill in the array by walking the next set of pages.
 *
 *	Iterates over the connection's data handle list, starting from the
 *	handle saved in cache->evict_file_next (if any), and calls
 *	__evict_walk_file on each eligible btree to append entries to the
 *	queue starting at queue->evict_entries.
 *
 *	Returns WT_NOTFOUND if no entries were added and no interrupt is
 *	pending when the walk finishes; otherwise returns 0, or the error that
 *	ended the walk (EBUSY when an interrupt was requested).  On those
 *	paths queue->evict_entries is updated to include the new entries.
 */
static int
__evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	u_int max_entries, retries, slot, start_slot, total_candidates;
	bool dhandle_locked, incr;

	conn = S2C(session);
	cache = conn->cache;
	btree = NULL;
	dhandle = NULL;
	dhandle_locked = incr = false;
	retries = 0;

	/*
	 * Set the starting slot in the queue and the maximum pages added
	 * per walk.
	 */
	start_slot = slot = queue->evict_entries;
	max_entries = WT_MIN(slot + WT_EVICT_WALK_INCR, cache->evict_slots);

	/*
	 * Another pathological case: if there are only a tiny number of
	 * candidate pages in cache, don't put all of them on one queue.
	 */
	total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ?
	    __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf);
	max_entries = WT_MIN(max_entries, 1 + total_candidates / 2);

retry:	while (slot < max_entries) {
		/*
		 * If another thread is waiting on the eviction server to clear
		 * the walk point in a tree, give up.
		 */
		if (cache->pass_intr != 0)
			WT_ERR(EBUSY);

		/*
		 * Lock the dhandle list to find the next handle and bump its
		 * reference count to keep it alive while we sweep.
		 */
		if (!dhandle_locked) {
			WT_ERR(__evict_lock_handle_list(session));
			dhandle_locked = true;
		}

		if (dhandle == NULL) {
			/*
			 * On entry, continue from wherever we got to in the
			 * scan last time through.  If we don't have a saved
			 * handle, start from the beginning of the list.
			 */
			if ((dhandle = cache->evict_file_next) != NULL)
				cache->evict_file_next = NULL;
			else
				dhandle = TAILQ_FIRST(&conn->dhqh);
		} else {
			/*
			 * Drop the reference on the handle we just finished
			 * with before stepping to the next one.
			 */
			if (incr) {
				WT_ASSERT(session, dhandle->session_inuse > 0);
				(void)__wt_atomic_subi32(
				    &dhandle->session_inuse, 1);
				incr = false;
				cache->evict_file_next = NULL;
			}
			dhandle = TAILQ_NEXT(dhandle, q);
		}

		/* If we reach the end of the list, we're done. */
		if (dhandle == NULL)
			break;

		/* Ignore non-btree handles, or handles that aren't open. */
		if (dhandle->type != WT_DHANDLE_TYPE_BTREE ||
		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
			continue;

		/* Skip files that don't allow eviction. */
		btree = dhandle->handle;
		if (btree->evict_disabled > 0)
			continue;

		/*
		 * Skip files that are checkpointing if we are only looking for
		 * dirty pages.
		 */
		if (btree->checkpointing != WT_CKPT_OFF &&
		    !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
			continue;

		/*
		 * Skip files that are configured to stick in cache until we
		 * become aggressive.
		 */
		if (btree->evict_priority != 0 &&
		    !__wt_cache_aggressive(session))
			continue;

		/*
		 * Skip files if we have too many active walks.
		 *
		 * This used to be limited by the configured maximum number of
		 * hazard pointers per session.  Even though that ceiling has
		 * been removed, we need to test eviction with huge numbers of
		 * active trees before allowing larger numbers of hazard
		 * pointers in the walk session.
		 */
		if (btree->evict_ref == NULL &&
		    session->nhazard > WT_EVICT_MAX_TREES)
			continue;

		/*
		 * If we are filling the queue, skip files that haven't been
		 * useful in the past.
		 */
		if (btree->evict_walk_period != 0 &&
		    btree->evict_walk_skips++ < btree->evict_walk_period)
			continue;
		btree->evict_walk_skips = 0;

		/*
		 * Take a reference on the handle before giving up the handle
		 * list lock; "incr" records that we owe a matching decrement.
		 */
		(void)__wt_atomic_addi32(&dhandle->session_inuse, 1);
		incr = true;
		__wt_readunlock(session, &conn->dhandle_lock);
		dhandle_locked = false;

		/*
		 * Re-check the "no eviction" flag, used to enforce exclusive
		 * access when a handle is being closed.
		 *
		 * Only try to acquire the lock and simply continue if we fail;
		 * the lock is held while the thread turning off eviction clears
		 * the tree's current eviction point, and part of the process is
		 * waiting on this thread to acknowledge that action.
		 *
		 * If a handle is being discarded, it will still be marked open,
		 * but won't have a root page.
		 */
		if (btree->evict_disabled == 0 &&
		    !__wt_spin_trylock(session, &cache->evict_walk_lock)) {
			if (btree->evict_disabled == 0 &&
			    btree->root.page != NULL) {
				/*
				 * Remember the file to visit first, next loop.
				 */
				cache->evict_file_next = dhandle;
				WT_WITH_DHANDLE(session, dhandle,
				    ret = __evict_walk_file(
				    session, queue, max_entries, &slot));

				WT_ASSERT(session, __wt_session_gen(
				    session, WT_GEN_SPLIT) == 0);
			}
			__wt_spin_unlock(session, &cache->evict_walk_lock);
			WT_ERR(ret);
		}
	}

	if (incr) {
		WT_ASSERT(session, dhandle->session_inuse > 0);
		(void)__wt_atomic_subi32(&dhandle->session_inuse, 1);
		incr = false;
	}

	/*
	 * Walk the list of files a few times if we don't find enough pages.
	 * Try two passes through all the files, give up when we have some
	 * candidates and we aren't finding more.
	 */
	if (slot < max_entries && (retries < 2 ||
	    (retries < 10 &&
	    (slot == queue->evict_entries || slot > start_slot)))) {
		start_slot = slot;
		++retries;
		goto retry;
	}

err:	if (dhandle_locked)
		__wt_readunlock(session, &conn->dhandle_lock);

	/*
	 * Error paths (an interrupt, a failure to lock the handle list or a
	 * failed file walk) jump here from inside the loop and may still hold
	 * the reference taken on the last handle walked.  Release it, or the
	 * handle's in-use count is never decremented.  On the normal path the
	 * reference has already been dropped above and this is a no-op.
	 */
	if (incr) {
		WT_ASSERT(session, dhandle->session_inuse > 0);
		(void)__wt_atomic_subi32(&dhandle->session_inuse, 1);
	}

	/*
	 * If we didn't find any entries on a walk when we weren't interrupted,
	 * let our caller know.
	 */
	if (queue->evict_entries == slot && cache->pass_intr == 0)
		return (WT_NOTFOUND);

	queue->evict_entries = slot;
	return (ret);
}

/*
 * __evict_push_candidate --
 *	Claim a page for the LRU queue and fill in its WT_EVICT_ENTRY slot.
 *	Returns false if the page was already flagged as queued, or another
 *	thread changed the page's flags first; true once the entry is set.
 */
static bool
__evict_push_candidate(WT_SESSION_IMPL *session,
    WT_EVICT_QUEUE *queue, WT_EVICT_ENTRY *evict, WT_REF *ref)
{
	size_t footprint;
	uint8_t flags_after, flags_before;
	u_int index;

	/*
	 * Threads can race to queue a page (e.g., an ordinary LRU walk can
	 * race with a page being queued for urgent eviction).  Whoever
	 * atomically sets the LRU flag owns the queueing; everyone else backs
	 * off.
	 */
	flags_before = ref->page->flags_atomic;
	flags_after = flags_before;
	FLD_SET(flags_after, WT_PAGE_EVICT_LRU);
	if (flags_before == flags_after)
		return (false);
	if (!__wt_atomic_cas8(
	    &ref->page->flags_atomic, flags_before, flags_after))
		return (false);

	/* Track the highest slot in use. */
	index = (u_int)(evict - queue->evict_queue);
	if (queue->evict_max <= index)
		queue->evict_max = index + 1;

	/* Discard whatever previously occupied the slot. */
	if (evict->ref != NULL)
		__evict_list_clear(session, evict);

	evict->btree = S2BT(session);
	evict->ref = ref;
	evict->score = __evict_entry_priority(session, ref);

	/*
	 * Adjust for size when doing dirty eviction: an unmodified page whose
	 * score isn't one of the special values has its score raised by the
	 * amount its footprint falls short of a megabyte.
	 */
	if (!F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY))
		return (true);
	if (evict->score == WT_READGEN_OLDEST || evict->score == UINT64_MAX)
		return (true);
	if (__wt_page_is_modified(ref->page))
		return (true);

	footprint = ref->page->memory_footprint;
	if (footprint < WT_MEGABYTE)
		evict->score += WT_MEGABYTE - footprint;

	return (true);
}

/*
 * __evict_walk_file --
 *	Get a few page eviction candidates from a single underlying file.
 *
 *	The tree walked is the session's current btree.  Candidates are
 *	written to the queue starting at slot *slotp, never beyond
 *	max_entries, and *slotp is advanced by the number of entries added.
 *	The walk resumes from btree->evict_ref and the position reached is
 *	saved back there, unless the walk gives up its position.
 *
 *	Returns 0 on success (including when nothing was queued), or an error
 *	from the tree walk or from releasing the walk position.
 */
static int
__evict_walk_file(WT_SESSION_IMPL *session,
    WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_EVICT_ENTRY *end, *evict, *start;
	WT_PAGE *last_parent, *page;
	WT_REF *ref;
	uint64_t btree_inuse, bytes_per_slot, cache_inuse, min_pages;
	uint64_t pages_seen, pages_queued, refs_walked;
	uint32_t remaining_slots, total_slots, walk_flags;
	uint32_t target_pages_clean, target_pages_dirty, target_pages;
	int restarts;
	bool give_up, modified, urgent_queued;

	conn = S2C(session);
	btree = S2BT(session);
	cache = conn->cache;
	last_parent = NULL;
	restarts = 0;
	give_up = urgent_queued = false;

	/*
	 * Figure out how many slots to fill from this tree.
	 * Note that some care is taken in the calculation to avoid overflow.
	 */
	start = queue->evict_queue + *slotp;
	remaining_slots = max_entries - *slotp;
	total_slots = max_entries - queue->evict_entries;
	target_pages_clean = target_pages_dirty = 0;

	/*
	 * The number of times we should fill the queue by the end of
	 * considering all trees.
	 */
#define	QUEUE_FILLS_PER_PASS	10

	/*
	 * The minimum number of pages we should consider per tree.
	 */
#define	MIN_PAGES_PER_TREE	10

	/*
	 * The target number of pages for this tree is proportional to the
	 * space it is taking up in cache.  Round to the nearest number of
	 * slots so we assign all of the slots to a tree filling 99+% of the
	 * cache (and only have to walk it once).
	 */
	if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
		btree_inuse = __wt_btree_bytes_evictable(session);
		cache_inuse = __wt_cache_bytes_inuse(cache);
		bytes_per_slot = 1 + cache_inuse / total_slots;
		target_pages_clean = (uint32_t)(
		    (btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
	}

	if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) {
		btree_inuse = __wt_btree_dirty_leaf_inuse(session);
		cache_inuse = __wt_cache_dirty_leaf_inuse(cache);
		bytes_per_slot = 1 + cache_inuse / total_slots;
		target_pages_dirty = (uint32_t)(
		    (btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
	}

	/*
	 * Weight the number of target pages by the number of times we want to
	 * fill the cache per pass through all the trees.  Note that we don't
	 * build this into the calculation above because we don't want to favor
	 * small trees, so round to a whole number of slots (zero for small
	 * trees) before multiplying.
	 */
	target_pages = WT_MAX(target_pages_clean, target_pages_dirty) *
	    QUEUE_FILLS_PER_PASS;

	/*
	 * If the tree is dead, fill the remaining slots.
	 */
	if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
		target_pages = remaining_slots;

	/*
	 * Walk trees with a small fraction of the cache in case there are so
	 * many trees that none of them use enough of the cache to be allocated
	 * slots.  Only skip a tree if it has no bytes of interest.
	 */
	if (target_pages == 0) {
		btree_inuse = F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ?
		    __wt_btree_bytes_evictable(session) :
		    __wt_btree_dirty_leaf_inuse(session);

		if (btree_inuse == 0)
			return (0);
	}

	/*
	 * There is some cost associated with walking a tree.  If we're going
	 * to visit this tree, always look for a minimum number of pages.
	 */
	if (target_pages < MIN_PAGES_PER_TREE)
		target_pages = MIN_PAGES_PER_TREE;

	/* Never target more pages than there are slots left to fill. */
	if (target_pages > remaining_slots)
		target_pages = remaining_slots;

	/*
	 * These statistics generate a histogram of the number of pages targeted
	 * for eviction each round. The range of values here start at
	 * MIN_PAGES_PER_TREE as this is the smallest number of pages we can
	 * target, unless there are fewer slots available. The aim is to cover
	 * the likely ranges of target pages in as few statistics as possible to
	 * reduce the overall overhead.
	 */
	if (target_pages < MIN_PAGES_PER_TREE) {
		WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt10);
		WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt10);
	} else if (target_pages < 32) {
		WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt32);
		WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt32);
	} else if (target_pages < 64) {
		WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt64);
		WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt64);
	} else if (target_pages < 128) {
		WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt128);
		WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt128);
	} else {
		WT_STAT_CONN_INCR(session, cache_eviction_target_page_ge128);
		WT_STAT_DATA_INCR(session, cache_eviction_target_page_ge128);
	}

	end = start + target_pages;

	/*
	 * Examine at least a reasonable number of pages before deciding
	 * whether to give up.  When we are only looking for dirty pages,
	 * search the tree for longer.
	 */
	min_pages = 10 * (uint64_t)target_pages;
	if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY) &&
	    !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
		min_pages *= 10;

	if (btree->evict_ref == NULL) {
		WT_STAT_CONN_INCR(session, cache_eviction_walk_from_root);
		WT_STAT_DATA_INCR(session, cache_eviction_walk_from_root);
	} else {
		WT_STAT_CONN_INCR(session, cache_eviction_walk_saved_pos);
		WT_STAT_DATA_INCR(session, cache_eviction_walk_saved_pos);
	}

	walk_flags =
	    WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;

	/*
	 * Choose a random point in the tree if looking for candidates in a
	 * tree with no starting point set. This is mostly aimed at ensuring
	 * eviction fairly visits all pages in trees with a lot of in-cache
	 * content.
	 */
	switch (btree->evict_start_type) {
	case WT_EVICT_WALK_NEXT:
		break;
	case WT_EVICT_WALK_PREV:
		FLD_SET(walk_flags, WT_READ_PREV);
		break;
	case WT_EVICT_WALK_RAND_PREV:
		FLD_SET(walk_flags, WT_READ_PREV);
		/* FALLTHROUGH */
	case WT_EVICT_WALK_RAND_NEXT:
		if (btree->evict_ref == NULL) {
			/* Ensure internal pages indexes remain valid */
			WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(
			    session, &btree->evict_ref, true));
			WT_RET_NOTFOUND_OK(ret);
		}
		break;
	}

	/*
	 * Get some more eviction candidate pages, starting at the last saved
	 * point. Clear the saved point immediately, we assert when discarding
	 * pages we're not discarding an eviction point, so this clear must be
	 * complete before the page is released.
	 */
	ref = btree->evict_ref;
	btree->evict_ref = NULL;

	/*
	 * !!! Take care terminating this loop.
	 *
	 * Don't make an extra call to __wt_tree_walk after we hit the end of a
	 * tree: that will leave a page pinned, which may prevent any work from
	 * being done.
	 *
	 * Once we hit the page limit, do one more step through the walk in
	 * case we are appending and only the last page in the file is live.
	 */
	for (evict = start, pages_queued = pages_seen = refs_walked = 0;
	    evict < end && (ret == 0 || ret == WT_NOTFOUND);
	    last_parent = ref == NULL ? NULL : ref->home,
	    ret = __wt_tree_walk_count(
	    session, &ref, &refs_walked, walk_flags)) {
		/*
		 * Check whether we're finding a good ratio of candidates vs
		 * pages seen.  Some workloads create "deserts" in trees where
		 * no good eviction candidates can be found.  Abandon the walk
		 * if we get into that situation.
		 */
		give_up = !__wt_cache_aggressive(session) &&
		    !F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
		    pages_seen > min_pages &&
		    (pages_queued == 0 || (pages_seen / pages_queued) >
		    (min_pages / target_pages));
		if (give_up) {
			/*
			 * Try a different walk start point next time if a
			 * walk gave up.
			 */
			switch (btree->evict_start_type) {
			case WT_EVICT_WALK_NEXT:
				btree->evict_start_type = WT_EVICT_WALK_PREV;
				break;
			case WT_EVICT_WALK_PREV:
				btree->evict_start_type =
				    WT_EVICT_WALK_RAND_PREV;
				break;
			case WT_EVICT_WALK_RAND_PREV:
				btree->evict_start_type =
				    WT_EVICT_WALK_RAND_NEXT;
				break;
			case WT_EVICT_WALK_RAND_NEXT:
				btree->evict_start_type = WT_EVICT_WALK_NEXT;
				break;
			}

			/*
			 * We differentiate the reasons we gave up on this walk
			 * and increment the stats accordingly.
			 */
			if (pages_queued == 0) {
				WT_STAT_CONN_INCR(session,
				    cache_eviction_walks_gave_up_no_targets);
				WT_STAT_DATA_INCR(session,
				    cache_eviction_walks_gave_up_no_targets);
			} else {
				WT_STAT_CONN_INCR(session,
				    cache_eviction_walks_gave_up_ratio);
				WT_STAT_DATA_INCR(session,
				    cache_eviction_walks_gave_up_ratio);
			}
			break;
		}

		/*
		 * A NULL reference means the walk has no current page (it
		 * either hasn't started or ran off the end of the tree).
		 * Stop the second time that happens in a single call.
		 */
		if (ref == NULL) {
			WT_STAT_CONN_INCR(session, cache_eviction_walks_ended);
			WT_STAT_DATA_INCR(session, cache_eviction_walks_ended);

			if (++restarts == 2) {
				WT_STAT_CONN_INCR(
				    session, cache_eviction_walks_stopped);
				WT_STAT_DATA_INCR(
				    session, cache_eviction_walks_stopped);
				break;
			}
			WT_STAT_CONN_INCR(
			    session, cache_eviction_walks_started);
			continue;
		}

		++pages_seen;

		/* Ignore root pages entirely. */
		if (__wt_ref_is_root(ref))
			continue;

		page = ref->page;
		modified = __wt_page_is_modified(page);
		page->evict_pass_gen = cache->evict_pass_gen;

		/*
		 * Use the EVICT_LRU flag to avoid putting pages onto the list
		 * multiple times.
		 */
		if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
			continue;

		/*
		 * It's possible (but unlikely) to visit a page without a read
		 * generation, if we race with the read instantiating the page.
		 * Set the page's read generation here to ensure a bug doesn't
		 * somehow leave a page without a read generation.
		 */
		if (page->read_gen == WT_READGEN_NOTSET)
			__wt_cache_read_gen_new(session, page);

		/* Pages being forcibly evicted go on the urgent queue. */
		if (page->read_gen == WT_READGEN_OLDEST ||
		    page->memory_footprint >= btree->splitmempage) {
			WT_STAT_CONN_INCR(
			    session, cache_eviction_pages_queued_oldest);
			if (__wt_page_evict_urgent(session, ref))
				urgent_queued = true;
			continue;
		}

		/*
		 * Pages that are empty or from dead trees are fast-tracked.
		 *
		 * Also evict lookaside table pages without further filtering:
		 * the cache is under pressure by definition and we want to
		 * free space.
		 */
		if (__wt_page_is_empty(page) ||
		    F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
		    F_ISSET(btree, WT_BTREE_LOOKASIDE))
			goto fast;

		/*
		 * If application threads are blocked on eviction of clean
		 * pages, and the only thing preventing a clean leaf page from
		 * being evicted is it contains historical data, mark it dirty
		 * so we can do lookaside eviction.  We also mark the tree
		 * dirty to avoid an assertion that we don't discard dirty
		 * pages from a clean tree.
		 */
		if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD) &&
		    !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) &&
		    !WT_PAGE_IS_INTERNAL(page) &&
		    !modified && page->modify != NULL &&
		    !__wt_txn_visible_all(session, page->modify->rec_max_txn,
		    WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))) {
			__wt_page_modify_set(session, page);
			goto fast;
		}

		/* Skip clean pages if appropriate. */
		if (!modified && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
			continue;

		/* Skip dirty pages if appropriate. */
		if (modified && !F_ISSET(cache, WT_CACHE_EVICT_DIRTY))
			continue;

		/*
		 * Don't attempt eviction of internal pages with children in
		 * cache (indicated by seeing an internal page that is the
		 * parent of the last page we saw).
		 *
		 * Also skip internal page unless we get aggressive or the tree
		 * is idle (indicated by the tree being skipped for walks).
		 * The goal here is that if trees become completely idle, we
		 * eventually push them out of cache completely.
		 */
		if (WT_PAGE_IS_INTERNAL(page)) {
			if (page == last_parent)
				continue;
			if (btree->evict_walk_period == 0 &&
			    !__wt_cache_aggressive(session))
				continue;
		}

		/* If eviction gets aggressive, anything else is fair game. */
		if (__wt_cache_aggressive(session))
			goto fast;

		/*
		 * If there are active transaction and oldest transaction
		 * hasn't changed since the last time this page was written,
		 * it's unlikely we can make progress.  Similarly, if the most
		 * recent update on the page is not yet globally visible,
		 * eviction will fail.  This heuristic avoids repeated attempts
		 * to evict the same page.
		 */
		if (modified && (!__wt_page_evict_retry(session, page) ||
		    !__txn_visible_all_id(session, page->modify->update_txn)))
			continue;

fast:		/* If the page can't be evicted, give up. */
		if (!__wt_page_can_evict(session, ref, NULL))
			continue;

		WT_ASSERT(session, evict->ref == NULL);
		if (!__evict_push_candidate(session, queue, evict, ref))
			continue;
		++evict;
		++pages_queued;

		__wt_verbose(session, WT_VERB_EVICTSERVER,
		    "select: %p, size %" WT_SIZET_FMT,
		    (void *)page, page->memory_footprint);
	}
	WT_RET_NOTFOUND_OK(ret);

	*slotp += (u_int)(evict - start);
	WT_STAT_CONN_INCRV(
	    session, cache_eviction_pages_queued, (u_int)(evict - start));

	__wt_verbose(session, WT_VERB_EVICTSERVER,
	    "%s walk: seen %" PRIu64 ", queued %" PRIu64,
	    session->dhandle->name, pages_seen, pages_queued);

	/*
	 * If we couldn't find the number of pages we were looking for, skip
	 * the tree next time.
	 */
	if (pages_queued < target_pages / 2 && !urgent_queued)
		btree->evict_walk_period = WT_MIN(
		    WT_MAX(1, 2 * btree->evict_walk_period), 100);
	else if (pages_queued == target_pages)
		btree->evict_walk_period = 0;
	else if (btree->evict_walk_period > 0)
		btree->evict_walk_period /= 2;

	/*
	 * Give up the walk occasionally.
	 *
	 * If we happen to end up on the root page or a page large enough to
	 * require forced eviction, clear it.  We have to track hazard
	 * pointers, and the root page complicates that calculation.
	 *
	 * Likewise if we found no new candidates during the walk, or the walk
	 * gave up: there is no point keeping a page pinned, since it may be
	 * the only candidate in an idle tree.
	 *
	 * If we land on a page whose read generation marks it for eviction
	 * soon, move on to the next page: we want this page evicted as quickly
	 * as possible.  The read generation is deliberately not tested in the
	 * release condition, otherwise this second case could never be
	 * reached.
	 */
	if (ref != NULL) {
		if (__wt_ref_is_root(ref) || evict == start || give_up ||
		    ref->page->memory_footprint >= btree->splitmempage) {
			if (restarts == 0)
				WT_STAT_CONN_INCR(
				    session, cache_eviction_walks_abandoned);
			WT_RET(__wt_page_release(
			    cache->walk_session, ref, walk_flags));
			ref = NULL;
		} else if (WT_READGEN_EVICT_SOON(ref->page->read_gen))
			WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
			    session, &ref, &refs_walked, walk_flags));
		btree->evict_ref = ref;
	}

	WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
	WT_STAT_CONN_INCRV(session, cache_eviction_pages_seen, pages_seen);
	WT_STAT_DATA_INCRV(session, cache_eviction_pages_seen, pages_seen);
	WT_STAT_CONN_INCRV(session, cache_eviction_walk_passes, 1);
	WT_STAT_DATA_INCRV(session, cache_eviction_walk_passes, 1);

	return (0);
}

/*
 * __evict_get_ref --
 *	Get a page for eviction.
 *
 *	On success, returns 0 with *btreep and *refp set: the reference has
 *	been switched from WT_REF_MEM to WT_REF_LOCKED, the tree's evict_busy
 *	count has been incremented (the caller must decrement it), and the
 *	entry has been removed from its queue.  Returns WT_NOTFOUND, with
 *	*btreep and *refp set to NULL, if no page was chosen.
 */
static int
__evict_get_ref(
    WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_REF **refp)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *evict;
	WT_EVICT_QUEUE *queue, *other_queue, *urgent_queue;
	uint32_t candidates;
	bool is_app, server_only, urgent_ok;

	*btreep = NULL;
	*refp = NULL;

	cache = S2C(session)->cache;
	is_app = !F_ISSET(session, WT_SESSION_INTERNAL);
	server_only = is_server && !WT_EVICT_HAS_WORKERS(session);
	/*
	 * The urgent queue may be used by eviction worker threads, by any
	 * thread when there are no workers, and by application threads once
	 * the cache is in aggressive mode.
	 */
	urgent_ok = (!is_app && !is_server) ||
	    !WT_EVICT_HAS_WORKERS(session) ||
	    (is_app && __wt_cache_aggressive(session));
	urgent_queue = cache->evict_urgent_queue;

	WT_STAT_CONN_INCR(session, cache_eviction_get_ref);

	/* Avoid the LRU lock if no pages are available. */
	if (__evict_queue_empty(cache->evict_current_queue, is_server) &&
	    __evict_queue_empty(cache->evict_other_queue, is_server) &&
	    (!urgent_ok || __evict_queue_empty(urgent_queue, false))) {
		WT_STAT_CONN_INCR(session, cache_eviction_get_ref_empty);
		return (WT_NOTFOUND);
	}

	/*
	 * The server repopulates whenever the other queue is not full, as long
	 * as at least one page has been evicted out of the current queue.
	 *
	 * Note that there are pathological cases where there are only enough
	 * eviction candidates in the cache to fill one queue.  In that case,
	 * we will continually evict one page and attempt to refill the queues.
	 * Such cases are extremely rare in real applications.
	 */
	if (is_server &&
	    (!urgent_ok || __evict_queue_empty(urgent_queue, false)) &&
	    !__evict_queue_full(cache->evict_current_queue) &&
	    !__evict_queue_full(cache->evict_fill_queue) &&
	    (cache->evict_empty_score > WT_EVICT_SCORE_CUTOFF ||
	    __evict_queue_empty(cache->evict_fill_queue, false)))
		return (WT_NOTFOUND);

	__wt_spin_lock(session, &cache->evict_queue_lock);

	/* Check the urgent queue first. */
	if (urgent_ok && !__evict_queue_empty(urgent_queue, false))
		queue = urgent_queue;
	else {
		/*
		 * Check if the current queue needs to change.
		 *
		 * The server will only evict half of the pages before looking
		 * for more, but should only switch queues if there are no
		 * other eviction workers.
		 *
		 * Note "queue" keeps referring to the queue that was current
		 * on entry, even if the current and other queues are swapped
		 * here.
		 */
		queue = cache->evict_current_queue;
		other_queue = cache->evict_other_queue;
		if (__evict_queue_empty(queue, server_only) &&
		    !__evict_queue_empty(other_queue, server_only)) {
			cache->evict_current_queue = other_queue;
			cache->evict_other_queue = queue;
		}
	}

	__wt_spin_unlock(session, &cache->evict_queue_lock);

	/*
	 * We got the queue lock, which should be fast, and chose a queue.
	 * Now we want to get the lock on the individual queue.
	 *
	 * The server never blocks on the queue's lock: it retries the try-lock,
	 * re-checking each time around that the queue still has pages.  Other
	 * threads block until they get the lock.
	 */
	for (;;) {
		/* Verify there are still pages available. */
		if (__evict_queue_empty(
		    queue, is_server && queue != urgent_queue)) {
			WT_STAT_CONN_INCR(
			    session, cache_eviction_get_ref_empty2);
			return (WT_NOTFOUND);
		}
		if (!is_server)
			__wt_spin_lock(session, &queue->evict_lock);
		else if (__wt_spin_trylock(session, &queue->evict_lock) != 0)
			continue;
		break;
	}

	/*
	 * Only evict half of the pages before looking for more. The remainder
	 * are left to eviction workers (if configured), or application thread
	 * if necessary.
	 */
	candidates = queue->evict_candidates;
	if (is_server && queue != urgent_queue && candidates > 1)
		candidates /= 2;

	/*
	 * Get the next page queued for eviction.
	 *
	 * The lower bound check in the loop condition also ends the loop
	 * immediately when the queue's current pointer is NULL.
	 */
	for (evict = queue->evict_current;
	    evict >= queue->evict_queue &&
	    evict < queue->evict_queue + candidates;
	    ++evict) {
		/* Skip slots that have already been cleared. */
		if (evict->ref == NULL)
			continue;
		WT_ASSERT(session, evict->btree != NULL);

		/*
		 * Evicting a dirty page in the server thread could stall
		 * during a write and prevent eviction from finding new work.
		 *
		 * However, we can't skip entries in the urgent queue or they
		 * may never be found again.
		 *
		 * Don't force application threads to evict dirty pages if they
		 * aren't stalled by the amount of dirty data in cache.
		 *
		 * Step back one entry before leaving the loop so the "move to
		 * the next item" code below leaves the queue's current pointer
		 * on this entry rather than past it.
		 */
		if (!urgent_ok && (is_server ||
		    !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) &&
		    __wt_page_is_modified(evict->ref->page)) {
			--evict;
			break;
		}

		/*
		 * Lock the page while holding the eviction mutex to prevent
		 * multiple attempts to evict it.  For pages that are already
		 * being evicted, this operation will fail and we will move on.
		 */
		if (!__wt_atomic_casv32(
		    &evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
			__evict_list_clear(session, evict);
			continue;
		}

		/*
		 * Increment the busy count in the btree handle to prevent it
		 * from being closed under us.
		 */
		(void)__wt_atomic_addv32(&evict->btree->evict_busy, 1);

		*btreep = evict->btree;
		*refp = evict->ref;

		/*
		 * Remove the entry so we never try to reconcile the same page
		 * on reconciliation error.
		 */
		__evict_list_clear(session, evict);
		break;
	}

	/*
	 * Move to the next item.  Note the bound used here is the queue's full
	 * candidate count, not the (possibly halved) local limit used by the
	 * loop above.
	 */
	if (evict != NULL &&
	    evict + 1 < queue->evict_queue + queue->evict_candidates)
		queue->evict_current = evict + 1;
	else /* Clear the current pointer if there are no more candidates. */
		queue->evict_current = NULL;

	__wt_spin_unlock(session, &queue->evict_lock);

	return (*refp == NULL ? WT_NOTFOUND : 0);
}

/*
 * __evict_page --
 *	Take one page from the eviction queues and evict it.  Called by the
 *	eviction server, eviction workers and application threads.  Returns
 *	WT_NOTFOUND if no page is available, otherwise the result of the
 *	eviction attempt.
 */
static int
__evict_page(WT_SESSION_IMPL *session, bool is_server)
{
	struct timespec begin, finish;
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_REF *ref;
	bool is_app, timed;

	WT_RET(__evict_get_ref(session, is_server, &btree, &ref));
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	cache = S2C(session)->cache;
	timed = false;

	/*
	 * Classify the caller: the server is identified by our argument, an
	 * internal session that isn't the server is an eviction worker, and
	 * anything else is an application thread.
	 */
	is_app = !is_server && !F_ISSET(session, WT_SESSION_INTERNAL);

	if (is_app) {
		if (__wt_page_is_modified(ref->page))
			WT_STAT_CONN_INCR(session, cache_eviction_app_dirty);
		WT_STAT_CONN_INCR(session, cache_eviction_app);
		cache->app_evicts++;

		/* Time application evictions when statistics are enabled. */
		if (WT_STAT_ENABLED(session)) {
			timed = true;
			__wt_epoch(session, &begin);
		}
	} else if (is_server) {
		WT_STAT_CONN_INCR(session, cache_eviction_server_evicting);
		cache->server_evicts++;
	} else {
		WT_STAT_CONN_INCR(session, cache_eviction_worker_evicting);
		cache->worker_evicts++;
	}

	/*
	 * Bump the page's read generation before attempting eviction so that,
	 * if something goes wrong, we don't pick the same set of pages every
	 * time.
	 *
	 * This can't wait until eviction has failed: by then eviction has
	 * already unlocked the page, and some other thread may have evicted
	 * it by the time we look at it.
	 */
	__wt_cache_read_gen_bump(session, ref->page);

	WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false));

	/* Release the busy count taken when the page was chosen. */
	(void)__wt_atomic_subv32(&btree->evict_busy, 1);

	if (timed) {
		__wt_epoch(session, &finish);
		WT_STAT_CONN_INCRV(session,
		    application_evict_time, WT_TIMEDIFF_US(finish, begin));
	}
	return (ret);
}

/*
 * __wt_cache_eviction_worker --
 *	Worker function for __wt_cache_eviction_check: evict pages on behalf of
 * the caller while the cache is over its boundaries.  Returns 0, WT_ROLLBACK
 * if the caller's transaction should be aborted, or an eviction error.
 */
int
__wt_cache_eviction_worker(
    WT_SESSION_IMPL *session, bool busy, bool readonly, u_int pct_full)
{
	struct timespec start, stop;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;
	uint64_t progress_cap, progress_start;
	bool app_timing;

	conn = S2C(session);
	cache = conn->cache;
	txn_global = &conn->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);

	/*
	 * There is nothing to do if the eviction server threads aren't set up
	 * yet (it is not safe to proceed), or if a busy caller arrives with
	 * the cache less than 100% full.
	 */
	if (!conn->evict_server_running || (busy && pct_full < 100))
		return (0);

	/* We have work to do, make sure the eviction server is awake. */
	__wt_evict_server_wake(session);

	/*
	 * Track how long application threads spend doing eviction; internal
	 * sessions are not timed.
	 */
	app_timing =
	    !F_ISSET(session, WT_SESSION_INTERNAL) && WT_STAT_ENABLED(session);
	if (app_timing)
		__wt_epoch(session, &start);

	progress_start = cache->eviction_progress;
	for (;; ret = 0) {
		/*
		 * A pathological case: if the cache is stuck and we are the
		 * oldest transaction in the system, fail with a rollback so
		 * the transaction gives up all of its hazard pointers before
		 * trying again.
		 */
		if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session)) {
			--cache->evict_aggressive_score;
			WT_STAT_CONN_INCR(session, txn_fail_cache);
			WT_ERR(WT_ROLLBACK);
		}

		/*
		 * Check if we have become busy: either our caller said so (it
		 * is waiting on a longer-than-usual event such as a page read)
		 * or this session has a pinned transaction ID while the global
		 * current ID differs from the oldest ID.  Busy callers are
		 * allowed less eviction progress before returning.
		 */
		if (!busy)
			busy = txn_state->pinned_id != WT_TXN_NONE &&
			    txn_global->current != txn_global->oldest_id;
		if (busy)
			progress_cap = 5;
		else
			progress_cap = 20;

		/* Stop as soon as eviction is no longer needed. */
		if (!__wt_eviction_needed(session, busy, readonly, &pct_full))
			break;

		/*
		 * Also stop once enough progress has been made, provided the
		 * cache is under 100% full or a scrub limit is set.
		 */
		if ((pct_full < 100 || cache->eviction_scrub_limit > 0.0) &&
		    cache->eviction_progress > progress_start + progress_cap)
			break;

		/*
		 * Don't make application threads participate in scrubbing for
		 * checkpoints.  Just throttle updates instead.
		 */
		if (WT_EVICT_HAS_WORKERS(session) &&
		    cache->eviction_scrub_limit > 0.0 &&
		    !F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) {
			__wt_yield();
			continue;
		}

		/*
		 * Evict a page.  An empty queue or a busy page means go around
		 * again; a busy caller is done after one successful eviction;
		 * any other error ends the loop with that error.
		 */
		ret = __evict_page(session, false);
		if (ret == WT_NOTFOUND) {
			/* Allow the queue to re-populate before retrying. */
			__wt_cond_wait(session,
			    conn->evict_threads.wait_cond, 10000, NULL);
			cache->app_waits++;
		} else if (ret == 0) {
			if (busy)
				break;
		} else if (ret != EBUSY) {
			break;
		}
	}

err:	if (app_timing) {
		__wt_epoch(session, &stop);
		WT_STAT_CONN_INCRV(session,
		    application_cache_time, WT_TIMEDIFF_US(stop, start));
	}

	return (ret);
}

/*
 * __wt_page_evict_urgent --
 *	Try to append a page to the urgent eviction queue so it is evicted as
 * soon as possible.  Returns true if the page was queued.
 */
bool
__wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *slot;
	WT_EVICT_QUEUE *urgent;
	WT_PAGE *page;
	bool queued;

	/* Root pages should never be evicted via LRU. */
	WT_ASSERT(session, !__wt_ref_is_root(ref));

	/*
	 * Unlocked fast path: give up if the page has WT_PAGE_EVICT_LRU set
	 * (presumably it is already queued) or the tree has eviction disabled.
	 */
	page = ref->page;
	if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) ||
	    S2BT(session)->evict_disabled > 0)
		return (false);

	cache = S2C(session)->cache;
	urgent = &cache->evict_queues[WT_EVICT_URGENT_QUEUE];
	queued = false;

	__wt_spin_lock(session, &cache->evict_queue_lock);

	/* Repeat the check now that the queue lock is held. */
	if (!F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) &&
	    !(S2BT(session)->evict_disabled > 0)) {
		__wt_spin_lock(session, &urgent->evict_lock);

		/* An empty queue is refilled from its first slot. */
		if (__evict_queue_empty(urgent, false)) {
			urgent->evict_candidates = 0;
			urgent->evict_current = urgent->evict_queue;
		}

		/* Append after the last candidate if a slot is available. */
		slot = &urgent->evict_queue[urgent->evict_candidates];
		if (slot < &urgent->evict_queue[cache->evict_slots] &&
		    __evict_push_candidate(session, urgent, slot, ref)) {
			queued = true;
			++urgent->evict_candidates;
		}

		__wt_spin_unlock(session, &urgent->evict_lock);
	}

	__wt_spin_unlock(session, &cache->evict_queue_lock);

	if (!queued)
		return (false);

	/*
	 * The page is queued: signal the eviction threads' condition if there
	 * are workers, otherwise wake the eviction server.
	 */
	WT_STAT_CONN_INCR(session, cache_eviction_pages_queued_urgent);
	if (WT_EVICT_HAS_WORKERS(session))
		__wt_cond_signal(
		    session, S2C(session)->evict_threads.wait_cond);
	else
		__wt_evict_server_wake(session);

	return (true);
}

/*
 * __wt_evict_priority_set --
 *	Store an eviction priority on the session's current tree.
 */
void
__wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v)
{
	WT_BTREE *btree;

	btree = S2BT(session);
	btree->evict_priority = v;
}

/*
 * __wt_evict_priority_clear --
 *	Reset the eviction priority of the session's current tree to zero.
 */
void
__wt_evict_priority_clear(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;

	btree = S2BT(session);
	btree->evict_priority = 0;
}

/*
 * __verbose_dump_cache_single --
 *	Walk the pages of the session's current tree, print page and byte
 * counts for its internal and leaf pages, and add the tree's byte totals to
 * the caller's running totals.
 */
static int
__verbose_dump_cache_single(WT_SESSION_IMPL *session,
    uint64_t *total_bytesp, uint64_t *total_dirty_bytesp)
{
	WT_BTREE *btree;
	WT_DATA_HANDLE *dhandle;
	WT_PAGE *page;
	WT_REF *next_walk;
	size_t size;
	uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
	uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
	uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
	uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
	const char *ckpt_name, *ckpt_prefix;
	bool dirty;

	intl_pages = intl_dirty_pages = 0;
	intl_bytes = intl_dirty_bytes = 0;
	intl_bytes_max = intl_dirty_bytes_max = 0;
	leaf_pages = leaf_dirty_pages = 0;
	leaf_bytes = leaf_dirty_bytes = 0;
	leaf_bytes_max = leaf_dirty_bytes_max = 0;

	/*
	 * Visit every page the walk hands back.  The loop ends when the walk
	 * reports an error or returns no further page; a walk error is not
	 * propagated to the caller.
	 */
	for (next_walk = NULL;
	    __wt_tree_walk(session, &next_walk,
	    WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
	    next_walk != NULL;) {
		page = next_walk->page;
		size = page->memory_footprint;
		dirty = __wt_page_is_modified(page);

		if (WT_PAGE_IS_INTERNAL(page)) {
			++intl_pages;
			intl_bytes += size;
			if (intl_bytes_max < size)
				intl_bytes_max = size;
			if (dirty) {
				++intl_dirty_pages;
				intl_dirty_bytes += size;
				if (intl_dirty_bytes_max < size)
					intl_dirty_bytes_max = size;
			}
		} else {
			++leaf_pages;
			leaf_bytes += size;
			if (leaf_bytes_max < size)
				leaf_bytes_max = size;
			if (dirty) {
				++leaf_dirty_pages;
				leaf_dirty_bytes += size;
				if (leaf_dirty_bytes_max < size)
					leaf_dirty_bytes_max = size;
			}
		}
	}

	/* Header line: handle name, checkpoint or live, eviction state. */
	dhandle = session->dhandle;
	btree = dhandle->handle;
	if (dhandle->checkpoint != NULL) {
		ckpt_prefix = "checkpoint=";
		ckpt_name = dhandle->checkpoint;
	} else {
		ckpt_prefix = "";
		ckpt_name = "<live>";
	}
	WT_RET(__wt_msg(session, "%s(%s%s)%s%s:",
	    dhandle->name, ckpt_prefix, ckpt_name,
	    btree->evict_disabled != 0 ?  "eviction disabled" : "",
	    btree->evict_disabled_open ? " at open" : ""));

	/* Internal and leaf pages are only reported if any were seen. */
	if (intl_pages != 0)
		WT_RET(__wt_msg(session,
		    "internal: "
		    "%" PRIu64 " pages, "
		    "%" PRIu64 "MB, "
		    "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
		    "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
		    "%" PRIu64 "MB max page, "
		    "%" PRIu64 "MB max dirty page",
		    intl_pages,
		    intl_bytes / WT_MEGABYTE,
		    intl_pages - intl_dirty_pages,
		    intl_dirty_pages,
		    (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE,
		    intl_dirty_bytes / WT_MEGABYTE,
		    intl_bytes_max / WT_MEGABYTE,
		    intl_dirty_bytes_max / WT_MEGABYTE));
	if (leaf_pages != 0)
		WT_RET(__wt_msg(session,
		    "leaf: "
		    "%" PRIu64 " pages, "
		    "%" PRIu64 "MB, "
		    "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
		    "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
		    "%" PRIu64 "MB max page, "
		    "%" PRIu64 "MB max dirty page",
		    leaf_pages,
		    leaf_bytes / WT_MEGABYTE,
		    leaf_pages - leaf_dirty_pages,
		    leaf_dirty_pages,
		    (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE,
		    leaf_dirty_bytes / WT_MEGABYTE,
		    leaf_bytes_max / WT_MEGABYTE,
		    leaf_dirty_bytes_max / WT_MEGABYTE));

	/* Fold this tree into the caller's totals. */
	*total_bytesp += intl_bytes + leaf_bytes;
	*total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes;

	return (0);
}

/*
 * __wt_verbose_dump_cache --
 *	Output diagnostic information about the cache: whether it is full, the
 * clean and dirty eviction checks, a per-tree summary for every open btree
 * handle, and the totals found compared with the tracked bytes in use.
 * Returns 0 on success or the first error from message output or from a
 * per-tree dump.
 */
int
__wt_verbose_dump_cache(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	uint64_t total_bytes, total_dirty_bytes;
	u_int pct;
	bool needed;

	conn = S2C(session);
	total_bytes = total_dirty_bytes = 0;
	pct = 0;				/* [-Werror=uninitialized] */

	WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
	WT_RET(__wt_msg(session, "cache dump"));

	WT_RET(__wt_msg(session,
	    "cache full: %s", __wt_cache_full(session) ? "yes" : "no"));

	/*
	 * Run each check before formatting its message.  C does not specify
	 * the order in which function arguments are evaluated, so making the
	 * check and reading "pct" in the same argument list could print the
	 * percentage as it was before the check stored its result.
	 */
	needed = __wt_eviction_clean_needed(session, &pct);
	WT_RET(__wt_msg(session, "cache clean check: %s (%u%%)",
	    needed ? "yes" : "no", pct));
	needed = __wt_eviction_dirty_needed(session, &pct);
	WT_RET(__wt_msg(session, "cache dirty check: %s (%u%%)",
	    needed ? "yes" : "no", pct));

	/*
	 * Step through the connection's handle list, taking the handle list
	 * read lock only while advancing, and dump every open btree handle.
	 *
	 * NOTE(review): if WT_DHANDLE_NEXT keeps a reference on the handle it
	 * returns until the next step, breaking out on error leaves that
	 * reference held -- confirm against the macro.
	 */
	for (dhandle = NULL;;) {
		WT_WITH_HANDLE_LIST_READ_LOCK(session,
		    WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
		if (dhandle == NULL)
			break;
		if (dhandle->type != WT_DHANDLE_TYPE_BTREE ||
		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
			continue;

		WT_WITH_DHANDLE(session, dhandle,
		    ret = __verbose_dump_cache_single(
		    session, &total_bytes, &total_dirty_bytes));
		if (ret != 0)
			break;
	}
	WT_RET(ret);

	/*
	 * Apply the overhead percentage so our total bytes are comparable with
	 * the tracked value.
	 */
	total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes);

	WT_RET(__wt_msg(session,
	    "cache dump: "
	    "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB",
	    total_bytes / WT_MEGABYTE,
	    __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE));
	WT_RET(__wt_msg(session,
	    "total dirty bytes: %" PRIu64 "MB",
	    total_dirty_bytes / WT_MEGABYTE));

	return (0);
}