/*-
 * Copyright (c) 2014-2016 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int  __evict_clear_all_walks(WT_SESSION_IMPL *);
static int  __evict_helper(WT_SESSION_IMPL *);
static int  WT_CDECL __evict_lru_cmp(const void *, const void *);
static int  __evict_lru_pages(WT_SESSION_IMPL *, bool);
static int  __evict_lru_walk(WT_SESSION_IMPL *);
static int  __evict_page(WT_SESSION_IMPL *, bool);
static int  __evict_pass(WT_SESSION_IMPL *);
static int  __evict_server(WT_SESSION_IMPL *, bool *);
static int  __evict_walk(WT_SESSION_IMPL *, uint32_t);
static int  __evict_walk_file(WT_SESSION_IMPL *, uint32_t, u_int, u_int *);

/*
 * __evict_read_gen --
 *	Compute the score used to order an eviction entry: the page's read
 *	generation, adjusted by the tree's eviction priority and by a skew
 *	for internal pages.
 */
static inline uint64_t
__evict_read_gen(const WT_EVICT_ENTRY *entry)
{
	WT_BTREE *tree;
	WT_PAGE *pg;
	uint64_t gen;

	tree = entry->btree;
	pg = entry->ref->page;

	/*
	 * Three kinds of page are scored as the oldest possible generation:
	 *  - pages already marked with the oldest generation;
	 *  - leaf pages in a dead tree (internal pages are excluded, they
	 *    may have children and are not yet evictable);
	 *  - empty pages, leaf or internal.
	 */
	if (pg->read_gen == WT_READGEN_OLDEST ||
	    (!WT_PAGE_IS_INTERNAL(pg) &&
	    F_ISSET(tree->dhandle, WT_DHANDLE_DEAD)) ||
	    __wt_page_is_empty(pg))
		return (WT_READGEN_OLDEST);

	/*
	 * Otherwise start from the page's own read generation skewed by the
	 * tree's eviction priority; internal pages get an additional skew
	 * because leaf pages are the preferred eviction candidates.
	 */
	gen = pg->read_gen + tree->evict_priority;
	if (WT_PAGE_IS_INTERNAL(pg))
		return (gen + WT_EVICT_INT_SKEW);
	return (gen);
}

/*
 * __evict_lru_cmp --
 *	Qsort comparison function for the eviction array: order entries by
 *	ascending score, with empty slots (no reference) sorted last.
 */
static int WT_CDECL
__evict_lru_cmp(const void *a_arg, const void *b_arg)
{
	const WT_EVICT_ENTRY *lhs, *rhs;
	uint64_t lhs_score, rhs_score;

	lhs = a_arg;
	rhs = b_arg;

	/* An empty slot compares as the largest possible score. */
	lhs_score = lhs->ref == NULL ? UINT64_MAX : lhs->score;
	rhs_score = rhs->ref == NULL ? UINT64_MAX : rhs->score;

	if (lhs_score < rhs_score)
		return (-1);
	if (lhs_score == rhs_score)
		return (0);
	return (1);
}

/*
 * __evict_list_clear --
 *	Empty one slot of an LRU eviction queue, clearing the page's
 *	"on the LRU queue" flag if the slot was in use.
 */
static inline void
__evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
{
	WT_PAGE *queued;

	if (e->ref != NULL) {
		queued = e->ref->page;
		WT_ASSERT(session,
		    F_ISSET_ATOMIC(queued, WT_PAGE_EVICT_LRU));
		F_CLR_ATOMIC(queued, WT_PAGE_EVICT_LRU);
	}
	e->ref = NULL;
	e->btree = WT_DEBUG_POINT;
}

/*
 * __wt_evict_list_clear_page --
 *	Make sure a page is not in the LRU eviction list. This is called
 *	from the page eviction code to make sure there is no attempt to
 *	evict a child page multiple times.
 */
void
__wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *entry, *limit;
	uint32_t qi;
	bool cleared;

	WT_ASSERT(session,
	    __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);

	/* Fast path: a page without the LRU flag isn't queued anywhere. */
	if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU))
		return;

	cache = S2C(session)->cache;
	__wt_spin_lock(session, &cache->evict_queue_lock);

	/* Search each queue in turn, stopping at the first match. */
	cleared = false;
	for (qi = 0; qi < WT_EVICT_QUEUE_MAX && !cleared; qi++) {
		__wt_spin_lock(session, &cache->evict_queues[qi].evict_lock);

		entry = cache->evict_queues[qi].evict_queue;
		limit = entry + cache->evict_queues[qi].evict_max;
		for (; entry < limit; entry++) {
			if (entry->ref != ref)
				continue;
			cleared = true;
			__evict_list_clear(session, entry);
			break;
		}

		__wt_spin_unlock(session, &cache->evict_queues[qi].evict_lock);
	}
	WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));

	__wt_spin_unlock(session, &cache->evict_queue_lock);
}

/*
 * __wt_evict_server_wake --
 *	Wake the eviction server thread.
*/ int __wt_evict_server_wake(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; conn = S2C(session); cache = conn->cache; #ifdef HAVE_VERBOSE if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) { uint64_t bytes_inuse, bytes_max; bytes_inuse = __wt_cache_bytes_inuse(cache); bytes_max = conn->cache_size; WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking, bytes inuse %s max (%" PRIu64 "MB %s %" PRIu64 "MB)", bytes_inuse <= bytes_max ? "<=" : ">", bytes_inuse / WT_MEGABYTE, bytes_inuse <= bytes_max ? "<=" : ">", bytes_max / WT_MEGABYTE)); } #endif return (__wt_cond_auto_signal(session, cache->evict_cond)); } /* * __evict_thread_run -- * General wrapper for any eviction thread. */ static WT_THREAD_RET __evict_thread_run(void *arg) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; bool did_work; session = arg; conn = S2C(session); cache = conn->cache; #ifdef HAVE_DIAGNOSTIC if (session == conn->evict_session) WT_ERR(__wt_epoch( session, &cache->stuck_ts)); /* -Wuninitialized */ #endif while (F_ISSET(conn, WT_CONN_EVICTION_RUN)) { if (conn->evict_tid_set && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) { /* * Cannot use WT_WITH_PASS_LOCK because this is a try * lock. Fix when that is supported. We set the flag * on both sessions because we may call clear_walk when * we are walking with the walk session, locked. */ F_SET(session, WT_SESSION_LOCKED_PASS); F_SET(cache->walk_session, WT_SESSION_LOCKED_PASS); ret = __evict_server(session, &did_work); F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS); F_CLR(session, WT_SESSION_LOCKED_PASS); __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); WT_ERR(__wt_verbose( session, WT_VERB_EVICTSERVER, "sleeping")); /* Don't rely on signals: check periodically. 
*/ WT_ERR(__wt_cond_auto_wait( session, cache->evict_cond, did_work)); WT_ERR(__wt_verbose( session, WT_VERB_EVICTSERVER, "waking")); } else WT_ERR(__evict_helper(session)); } if (session == conn->evict_session) { /* * The eviction server is shutting down: in case any trees are * still open, clear all walks now so that they can be closed. */ WT_WITH_PASS_LOCK(session, ret, ret = __evict_clear_all_walks(session)); WT_ERR(ret); } WT_ERR(__wt_verbose( session, WT_VERB_EVICTSERVER, "cache eviction thread exiting")); /* * The only two cases when eviction workers are expected to stop are * when recovery is finished or when the connection is closing. Check * otherwise fewer eviction worker threads may be running than * expected. */ WT_ASSERT(session, F_ISSET(conn, WT_CONN_CLOSING | WT_CONN_RECOVERING)); if (0) { err: WT_PANIC_MSG(session, ret, "cache eviction thread error"); } return (WT_THREAD_RET_VALUE); } /* * __evict_server -- * Thread to evict pages from the cache. */ static int __evict_server(WT_SESSION_IMPL *session, bool *did_work) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; #ifdef HAVE_DIAGNOSTIC struct timespec now; #endif uint64_t orig_pages_evicted; u_int spins; conn = S2C(session); cache = conn->cache; WT_ASSERT(session, did_work != NULL); *did_work = false; orig_pages_evicted = cache->pages_evicted; /* Evict pages from the cache as needed. */ WT_RET(__evict_pass(session)); if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) return (0); /* * Clear the walks so we don't pin pages while asleep, * otherwise we can block applications evicting large pages. */ if (!F_ISSET(cache, WT_CACHE_STUCK)) { for (spins = 0; (ret = __wt_spin_trylock( session, &conn->dhandle_lock)) == EBUSY && cache->pass_intr == 0; spins++) { if (spins < WT_THOUSAND) __wt_yield(); else __wt_sleep(0, WT_THOUSAND); } /* * If we gave up acquiring the lock, that indicates a * session is waiting for us to clear walks. 
Do that * as part of a normal pass (without the handle list * lock) to avoid deadlock. */ if (ret == EBUSY) return (0); WT_RET(ret); ret = __evict_clear_all_walks(session); __wt_spin_unlock(session, &conn->dhandle_lock); WT_RET(ret); /* Next time we wake up, reverse the sweep direction. */ cache->flags ^= WT_CACHE_WALK_REVERSE; cache->pages_evicted = 0; } else if (cache->pages_evicted != cache->pages_evict) { cache->pages_evicted = cache->pages_evict; #ifdef HAVE_DIAGNOSTIC WT_RET(__wt_epoch(session, &cache->stuck_ts)); } else { /* After being stuck for 5 minutes, give up. */ WT_RET(__wt_epoch(session, &now)); if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) { __wt_err(session, ETIMEDOUT, "Cache stuck for too long, giving up"); (void)__wt_cache_dump(session, NULL); WT_RET(ETIMEDOUT); } #endif } *did_work = cache->pages_evicted != orig_pages_evicted; return (0); } /* * __evict_workers_resize -- * Resize the array of eviction workers (as needed after a reconfigure). * We don't do this during the reconfigure because the eviction server * thread owns these structures. */ static int __evict_workers_resize(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_WORKER *workers; size_t alloc; uint32_t i, session_flags; conn = S2C(session); workers = NULL; /* -Wconditional-uninitialized */ if (conn->evict_workers_alloc < conn->evict_workers_max) { alloc = conn->evict_workers_alloc * sizeof(*workers); WT_RET(__wt_realloc(session, &alloc, conn->evict_workers_max * sizeof(*workers), &conn->evict_workctx)); workers = conn->evict_workctx; } for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) { /* * Eviction worker threads get their own session. * Eviction worker threads may be called upon to perform slow * operations for the block manager. * * Eviction worker threads get their own lookaside table cursor * if the lookaside table is open. Note that eviction is also * started during recovery, before the lookaside table is * created. 
*/ session_flags = WT_SESSION_CAN_WAIT; if (F_ISSET(conn, WT_CONN_LAS_OPEN)) FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR); WT_ERR(__wt_open_internal_session(conn, "eviction-worker", false, session_flags, &workers[i].session)); workers[i].id = i; if (i < conn->evict_workers_min) { ++conn->evict_workers; F_SET(&workers[i], WT_EVICT_WORKER_RUN); WT_ERR(__wt_thread_create(workers[i].session, &workers[i].tid, __evict_thread_run, workers[i].session)); } } err: conn->evict_workers_alloc = conn->evict_workers_max; return (ret); } /* * __wt_evict_create -- * Start the eviction server thread. */ int __wt_evict_create(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; uint32_t session_flags; conn = S2C(session); /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_EVICTION_RUN); /* * We need a session handle because we're reading/writing pages. * * The eviction server gets its own lookaside table cursor. * * If there's only a single eviction thread, it may be called upon to * perform slow operations for the block manager. (The flag is not * reset if reconfigured later, but I doubt that's a problem.) */ session_flags = F_ISSET(conn, WT_CONN_LAS_OPEN) ? WT_SESSION_LOOKASIDE_CURSOR : 0; if (conn->evict_workers_max == 0) FLD_SET(session_flags, WT_SESSION_CAN_WAIT); WT_RET(__wt_open_internal_session(conn, "eviction-server", false, session_flags, &conn->evict_session)); session = conn->evict_session; /* * If eviction workers were configured, allocate sessions for them now. * This is done to reduce the chance that we will open new eviction * sessions after WT_CONNECTION::close is called. */ if (conn->evict_workers_max > 0) WT_RET(__evict_workers_resize(session)); /* * Start the primary eviction server thread after the worker threads * have started to avoid it starting additional worker threads before * the worker's sessions are created. 
*/ WT_RET(__wt_thread_create( session, &conn->evict_tid, __evict_thread_run, session)); conn->evict_tid_set = true; return (0); } /* * __wt_evict_destroy -- * Destroy the eviction threads. */ int __wt_evict_destroy(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_WORKER *workers; WT_SESSION *wt_session; uint32_t i; conn = S2C(session); cache = conn->cache; workers = conn->evict_workctx; F_CLR(conn, WT_CONN_EVICTION_RUN); /* * Wait for the main eviction thread to exit before waiting on the * helpers. The eviction server spawns helper threads, so we can't * safely know how many helpers are running until the main thread is * done. */ WT_TRET(__wt_verbose( session, WT_VERB_EVICTSERVER, "waiting for main thread")); if (conn->evict_tid_set) { WT_TRET(__wt_evict_server_wake(session)); WT_TRET(__wt_thread_join(session, conn->evict_tid)); conn->evict_tid_set = false; } WT_TRET(__wt_verbose( session, WT_VERB_EVICTSERVER, "waiting for helper threads")); for (i = 0; i < conn->evict_workers; i++) { WT_TRET(__wt_cond_signal(session, cache->evict_waiter_cond)); WT_TRET(__wt_thread_join(session, workers[i].tid)); } conn->evict_workers = 0; /* Handle shutdown when cleaning up after a failed open. */ if (conn->evict_workctx != NULL) { for (i = 0; i < conn->evict_workers_alloc; i++) { wt_session = &conn->evict_workctx[i].session->iface; if (wt_session != NULL) WT_TRET(wt_session->close(wt_session, NULL)); } __wt_free(session, conn->evict_workctx); } conn->evict_workers_alloc = 0; if (conn->evict_session != NULL) { wt_session = &conn->evict_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); conn->evict_session = NULL; } return (ret); } /* * __evict_helper -- * Thread to help evict pages from the cache. 
*/ static int __evict_helper(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; cache = S2C(session)->cache; if ((ret = __evict_lru_pages(session, false)) == WT_NOTFOUND) WT_RET(__wt_cond_wait( session, cache->evict_waiter_cond, 10000)); else WT_RET(ret); return (0); } /* * __evict_update_work -- * Configure eviction work state. */ static bool __evict_update_work(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; uint64_t bytes_inuse, bytes_max, dirty_inuse; conn = S2C(session); cache = conn->cache; WT_STAT_FAST_CONN_SET(session, cache_eviction_aggressive_set, 0); /* Clear previous state. */ cache->state = 0; if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) return (false); /* * Setup the number of refs to consider in each handle, depending * on how many handles are open. We want to consider less candidates * from each file as more files are open. Handle the case where there * are no files open by adding 1. */ cache->evict_max_refs_per_file = WT_MAX(100, WT_MILLION / (conn->open_file_count + 1)); if (cache->evict_queues[WT_EVICT_URGENT_QUEUE].evict_current != NULL) FLD_SET(cache->state, WT_EVICT_STATE_URGENT); /* * If we need space in the cache, try to find clean pages to evict. * * Avoid division by zero if the cache size has not yet been set in a * shared cache. */ bytes_max = conn->cache_size + 1; bytes_inuse = __wt_cache_bytes_inuse(cache); if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) FLD_SET(cache->state, WT_EVICT_STATE_CLEAN); /* * Scrub dirty pages and keep them in cache if we are less than half * way between the cache target and trigger. */ if (bytes_inuse < ((cache->eviction_target + cache->eviction_trigger) * bytes_max) / 200) FLD_SET(cache->state, WT_EVICT_STATE_SCRUB); dirty_inuse = __wt_cache_dirty_leaf_inuse(cache); if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) FLD_SET(cache->state, WT_EVICT_STATE_DIRTY); /* * If the cache has been stuck and is now under control, clear the * stuck flag. 
*/ if (bytes_inuse < bytes_max && dirty_inuse < (cache->eviction_dirty_trigger * bytes_max) / 100) F_CLR(cache, WT_CACHE_STUCK); if (F_ISSET(cache, WT_CACHE_STUCK)) { WT_ASSERT(session, cache->state != 0); WT_STAT_FAST_CONN_SET(session, cache_eviction_aggressive_set, 1); FLD_SET(cache->state, WT_EVICT_STATE_AGGRESSIVE); } return (FLD_ISSET(cache->state, WT_EVICT_STATE_ALL | WT_EVICT_STATE_URGENT)); } /* * __evict_pass -- * Evict pages from memory. */ static int __evict_pass(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_EVICT_WORKER *worker; uint64_t pages_evicted; u_int loop; conn = S2C(session); cache = conn->cache; /* Track whether pages are being evicted and progress is made. */ pages_evicted = cache->pages_evict; /* Evict pages from the cache. */ for (loop = 0;; loop++) { /* * If there is a request to clear eviction walks, do that now, * before checking if the cache is full. */ if (cache->pass_intr != 0) break; /* * Increment the shared read generation. Do this occasionally * even if eviction is not currently required, so that pages * have some relative read generation when the eviction server * does need to do some work. */ __wt_cache_read_gen_incr(session); /* * Update the oldest ID: we use it to decide whether pages are * candidates for eviction. Without this, if all threads are * blocked after a long-running transaction (such as a * checkpoint) completes, we may never start evicting again. * * Do this every time the eviction server wakes up, regardless * of whether the cache is full, to prevent the oldest ID * falling too far behind. Don't wait to lock the table: with * highly threaded workloads, that creates a bottleneck. 
*/ WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT)); if (!__evict_update_work(session)) break; if (loop > 10) { WT_STAT_FAST_CONN_SET(session, cache_eviction_aggressive_set, 1); FLD_SET(cache->state, WT_EVICT_STATE_AGGRESSIVE); } /* * Start a worker if we have capacity and we haven't reached * the eviction targets. */ if (FLD_ISSET(cache->state, WT_EVICT_STATE_ALL) && conn->evict_workers < conn->evict_workers_max) { WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "Starting evict worker: %"PRIu32"\n", conn->evict_workers)); if (conn->evict_workers >= conn->evict_workers_alloc) WT_RET(__evict_workers_resize(session)); worker = &conn->evict_workctx[conn->evict_workers++]; F_SET(worker, WT_EVICT_WORKER_RUN); WT_RET(__wt_thread_create(session, &worker->tid, __evict_thread_run, worker->session)); } WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "Eviction pass with: Max: %" PRIu64 " In use: %" PRIu64 " Dirty: %" PRIu64, conn->cache_size, cache->bytes_inmem, cache->bytes_dirty_intl + cache->bytes_dirty_leaf)); WT_RET(__evict_lru_walk(session)); WT_RET_NOTFOUND_OK(__evict_lru_pages(session, true)); /* * If we're making progress, keep going; if we're not making * any progress at all, mark the cache "stuck" and go back to * sleep, it's not something we can fix. */ if (pages_evicted == cache->pages_evict) { /* * Back off if we aren't making progress: walks hold * the handle list lock, blocking other operations that * can free space in cache, such as LSM discarding * handles. * * Allow this wait to be interrupted (e.g. if a * checkpoint completes): make sure we wait for a * non-zero number of microseconds). */ WT_STAT_FAST_CONN_INCR(session, cache_eviction_server_slept); WT_RET(__wt_cond_wait(session, cache->evict_cond, WT_THOUSAND * WT_MAX(loop, 1))); if (loop == 100) { /* * Mark the cache as stuck if we need space * and aren't evicting any pages. 
*/ F_SET(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR( session, cache_eviction_slow); WT_RET(__wt_verbose( session, WT_VERB_EVICTSERVER, "unable to reach eviction goal")); break; } } else { loop = 0; pages_evicted = cache->pages_evict; } } return (0); } /* * __evict_clear_walk -- * Clear a single walk point. */ static int __evict_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_REF *ref; btree = S2BT(session); cache = S2C(session)->cache; WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_PASS)); if (session->dhandle == cache->evict_file_next) cache->evict_file_next = NULL; if ((ref = btree->evict_ref) == NULL) return (0); /* * Clear evict_ref first, in case releasing it forces eviction (we * assert we never try to evict the current eviction walk point). */ btree->evict_ref = NULL; WT_WITH_DHANDLE(cache->walk_session, session->dhandle, (ret = __wt_page_release(cache->walk_session, ref, WT_READ_NO_EVICT))); return (ret); } /* * __evict_clear_all_walks -- * Clear the eviction walk points for all files a session is waiting on. */ static int __evict_clear_all_walks(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; conn = S2C(session); TAILQ_FOREACH(dhandle, &conn->dhqh, q) if (WT_PREFIX_MATCH(dhandle->name, "file:")) WT_WITH_DHANDLE(session, dhandle, WT_TRET(__evict_clear_walk(session))); return (ret); } /* * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's * blocks queued for eviction. */ int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_EVICT_ENTRY *evict; u_int i, elem, q; btree = S2BT(session); cache = S2C(session)->cache; /* * Hold the walk lock to set the no-eviction flag. * * The no-eviction flag can be set permanently, in which case we never * increment the no-eviction count. 
*/ __wt_spin_lock(session, &cache->evict_walk_lock); if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { if (btree->evict_disabled != 0) ++btree->evict_disabled; __wt_spin_unlock(session, &cache->evict_walk_lock); return (0); } ++btree->evict_disabled; /* * Ensure no new pages from the file will be queued for eviction after * this point. */ F_SET(btree, WT_BTREE_NO_EVICTION); (void)__wt_atomic_add32(&cache->pass_intr, 1); WT_FULL_BARRIER(); /* Clear any existing LRU eviction walk for the file. */ WT_WITH_PASS_LOCK(session, ret, ret = __evict_clear_walk(session)); (void)__wt_atomic_sub32(&cache->pass_intr, 1); WT_ERR(ret); /* * The eviction candidate list might reference pages from the file, * clear it. Hold the evict lock to remove queued pages from a file. */ __wt_spin_lock(session, &cache->evict_queue_lock); for (q = 0; q < WT_EVICT_QUEUE_MAX; q++) { __wt_spin_lock(session, &cache->evict_queues[q].evict_lock); elem = cache->evict_queues[q].evict_max; for (i = 0, evict = cache->evict_queues[q].evict_queue; i < elem; i++, evict++) if (evict->btree == btree) __evict_list_clear(session, evict); __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } __wt_spin_unlock(session, &cache->evict_queue_lock); /* * We have disabled further eviction: wait for concurrent LRU eviction * activity to drain. */ while (btree->evict_busy > 0) __wt_yield(); if (0) { err: --btree->evict_disabled; F_CLR(btree, WT_BTREE_NO_EVICTION); } __wt_spin_unlock(session, &cache->evict_walk_lock); return (ret); } /* * __wt_evict_file_exclusive_off -- * Release exclusive eviction access to a file. */ void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; btree = S2BT(session); cache = S2C(session)->cache; /* * We have seen subtle bugs with multiple threads racing to turn * eviction on/off. Make races more likely in diagnostic builds. 
*/ WT_DIAGNOSTIC_YIELD; WT_ASSERT(session, btree->evict_ref == NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION)); /* * The no-eviction flag can be set permanently, in which case we never * increment the no-eviction count. */ __wt_spin_lock(session, &cache->evict_walk_lock); if (btree->evict_disabled > 0 && --btree->evict_disabled == 0) F_CLR(btree, WT_BTREE_NO_EVICTION); __wt_spin_unlock(session, &cache->evict_walk_lock); } #define APP_EVICT_THRESHOLD 3 /* Threshold to help evict */ /* * __evict_lru_pages -- * Get pages from the LRU queue to evict. */ static int __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server) { WT_CACHE *cache; WT_DECL_RET; uint64_t app_evict_percent, total_evict; /* * The server will not help evict if the workers are coping with * eviction workload, that is, if fewer than the threshold of the * pages are evicted by application threads. */ if (is_server && S2C(session)->evict_workers > 1) { cache = S2C(session)->cache; total_evict = cache->app_evicts + cache->server_evicts + cache->worker_evicts; app_evict_percent = (100 * cache->app_evicts) / (total_evict + 1); if (app_evict_percent < APP_EVICT_THRESHOLD) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_server_not_evicting); return (0); } } /* * Reconcile and discard some pages: EBUSY is returned if a page fails * eviction because it's unavailable, continue in that case. */ while ((ret = __evict_page(session, is_server)) == 0 || ret == EBUSY) ; return (ret); } /* * __evict_lru_walk -- * Add pages to the LRU queue to be evicted from cache. */ static int __evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; WT_EVICT_QUEUE *queue; uint64_t read_gen_oldest; uint32_t candidates, entries, queue_index; cache = S2C(session)->cache; /* Fill the next queue (that isn't the urgent queue). */ queue_index = 1 + (cache->evict_queue_fill++ % (WT_EVICT_QUEUE_MAX - 1)); queue = &cache->evict_queues[queue_index]; /* Get some more pages to consider for eviction. 
*/ if ((ret = __evict_walk(cache->walk_session, queue_index)) != 0) return (ret == EBUSY ? 0 : ret); /* Sort the list into LRU order and restart. */ __wt_spin_lock(session, &queue->evict_lock); entries = queue->evict_entries; qsort(queue->evict_queue, entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp); while (entries > 0 && queue->evict_queue[entries - 1].ref == NULL) --entries; /* * If we have more entries than the maximum tracked between walks, * clear them. Do this before figuring out how many of the entries are * candidates so we never end up with more candidates than entries. */ while (entries > WT_EVICT_WALK_BASE) __evict_list_clear(session, &queue->evict_queue[--entries]); queue->evict_entries = entries; if (entries == 0) { /* * If there are no entries, there cannot be any candidates. * Make sure application threads don't read past the end of the * candidate list, or they may race with the next walk. */ queue->evict_candidates = 0; queue->evict_current = NULL; __wt_spin_unlock(session, &queue->evict_lock); return (0); } /* Decide how many of the candidates we're going to try and evict. */ if (FLD_ISSET(cache->state, WT_EVICT_STATE_AGGRESSIVE)) { /* * Take all candidates if we only gathered pages with an oldest * read generation set. */ queue->evict_candidates = entries; } else { /* * Find the oldest read generation we have in the queue, used * to set the initial value for pages read into the system. * The queue is sorted, find the first "normal" generation. */ read_gen_oldest = WT_READGEN_OLDEST; for (candidates = 0; candidates < entries; ++candidates) { read_gen_oldest = queue->evict_queue[candidates].score; if (read_gen_oldest != WT_READGEN_OLDEST) break; } /* * Take all candidates if we only gathered pages with an oldest * read generation set. * * We normally never take more than 50% of the entries but if * 50% of the entries were at the oldest read generation, take * all of them. 
*/ if (read_gen_oldest == WT_READGEN_OLDEST) queue->evict_candidates = entries; else if (candidates > entries / 2) queue->evict_candidates = candidates; else { /* * Take all of the urgent pages plus a third of * ordinary candidates (which could be expressed as * WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the * steady state, we want to get as many candidates as * the eviction walk adds to the queue. * * That said, if there is only one entry, which is * normal when populating an empty file, don't exclude * it. */ queue->evict_candidates = 1 + candidates + ((entries - candidates) - 1) / 3; cache->read_gen_oldest = read_gen_oldest; } } queue->evict_current = queue->evict_queue; __wt_spin_unlock(session, &queue->evict_lock); /* * Now we can set the next queue. */ __wt_spin_lock(session, &cache->evict_queue_lock); if (cache->evict_current_queue->evict_current == NULL) WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty); else WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_not_empty); cache->evict_current_queue = queue; __wt_spin_unlock(session, &cache->evict_queue_lock); /* * Signal any application or helper threads that may be waiting * to help with eviction. */ WT_RET(__wt_cond_signal(session, cache->evict_waiter_cond)); return (0); } /* * __evict_walk -- * Fill in the array by walking the next set of pages. */ static int __evict_walk(WT_SESSION_IMPL *session, uint32_t queue_index) { WT_BTREE *btree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_EVICT_QUEUE *queue; u_int max_entries, prev_slot, retries, slot, start_slot, spins; bool dhandle_locked, incr; conn = S2C(session); cache = S2C(session)->cache; btree = NULL; dhandle = NULL; dhandle_locked = incr = false; retries = 0; /* * Set the starting slot in the queue and the maximum pages added * per walk. 
*/ queue = &cache->evict_queues[queue_index]; start_slot = slot = queue->evict_entries; max_entries = WT_MIN(slot + WT_EVICT_WALK_INCR, cache->evict_slots); retry: while (slot < max_entries && ret == 0) { /* * If another thread is waiting on the eviction server to clear * the walk point in a tree, give up. */ if (cache->pass_intr != 0) break; /* * Lock the dhandle list to find the next handle and bump its * reference count to keep it alive while we sweep. */ if (!dhandle_locked) { for (spins = 0; (ret = __wt_spin_trylock( session, &conn->dhandle_lock)) == EBUSY && cache->pass_intr == 0; spins++) { if (spins < WT_THOUSAND) __wt_yield(); else __wt_sleep(0, WT_THOUSAND); } if (ret != 0) break; dhandle_locked = true; } if (dhandle == NULL) { /* * On entry, continue from wherever we got to in the * scan last time through. If we don't have a saved * handle, start from the beginning of the list. */ if ((dhandle = cache->evict_file_next) != NULL) cache->evict_file_next = NULL; else dhandle = TAILQ_FIRST(&conn->dhqh); } else { if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32( &dhandle->session_inuse, 1); incr = false; cache->evict_file_next = NULL; } dhandle = TAILQ_NEXT(dhandle, q); } /* If we reach the end of the list, we're done. */ if (dhandle == NULL) break; /* Ignore non-file handles, or handles that aren't open. */ if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; /* Skip files that don't allow eviction. */ btree = dhandle->handle; if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) continue; /* * Also skip files that are checkpointing or configured to * stick in cache until we get aggressive. */ if ((btree->checkpointing != WT_CKPT_OFF || btree->evict_priority != 0) && !FLD_ISSET(cache->state, WT_EVICT_STATE_AGGRESSIVE)) continue; /* Skip files if we have used all available hazard pointers. 
*/ if (btree->evict_ref == NULL && session->nhazard >= conn->hazard_max - WT_MIN(conn->hazard_max / 2, 10)) continue; /* * If we are filling the queue, skip files that haven't been * useful in the past. */ if (btree->evict_walk_period != 0 && btree->evict_walk_skips++ < btree->evict_walk_period) continue; btree->evict_walk_skips = 0; prev_slot = slot; (void)__wt_atomic_addi32(&dhandle->session_inuse, 1); incr = true; __wt_spin_unlock(session, &conn->dhandle_lock); dhandle_locked = false; /* * Re-check the "no eviction" flag, used to enforce exclusive * access when a handle is being closed. If not set, remember * the file to visit first, next loop. * * Only try to acquire the lock and simply continue if we fail; * the lock is held while the thread turning off eviction clears * the tree's current eviction point, and part of the process is * waiting on this thread to acknowledge that action. */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) && !__wt_spin_trylock(session, &cache->evict_walk_lock)) { if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { cache->evict_file_next = dhandle; WT_WITH_DHANDLE(session, dhandle, ret = __evict_walk_file(session, queue_index, max_entries, &slot)); WT_ASSERT(session, session->split_gen == 0); } __wt_spin_unlock(session, &cache->evict_walk_lock); } /* * If we didn't find any candidates in the file, skip it next * time. */ if (slot == prev_slot) btree->evict_walk_period = WT_MIN( WT_MAX(1, 2 * btree->evict_walk_period), 100); else btree->evict_walk_period = 0; } if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32(&dhandle->session_inuse, 1); incr = false; } if (dhandle_locked) { __wt_spin_unlock(session, &conn->dhandle_lock); dhandle_locked = false; } /* * Walk the list of files a few times if we don't find enough pages. * Try two passes through all the files, give up when we have some * candidates and we aren't finding more. 
*/ if (cache->pass_intr == 0 && ret == 0 && slot < max_entries && (retries < 2 || (retries < 10 && (slot == queue->evict_entries || slot > start_slot)))) { start_slot = slot; ++retries; goto retry; } queue->evict_entries = slot; return (ret); } /* * __evict_push_candidate -- * Initialize a WT_EVICT_ENTRY structure with a given page. */ static bool __evict_push_candidate(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, WT_EVICT_ENTRY *evict, WT_REF *ref) { u_int slot; uint8_t orig_flags, new_flags; /* * Threads can race to queue a page (e.g., an ordinary LRU walk can * race with a page being queued for urgent eviction. */ orig_flags = new_flags = ref->page->flags_atomic; FLD_SET(new_flags, WT_PAGE_EVICT_LRU); if (orig_flags == new_flags || !__wt_atomic_cas8(&ref->page->flags_atomic, orig_flags, new_flags)) return (false); /* Keep track of the maximum slot we are using. */ slot = (u_int)(evict - queue->evict_queue); if (slot >= queue->evict_max) queue->evict_max = slot + 1; if (evict->ref != NULL) __evict_list_clear(session, evict); evict->btree = S2BT(session); evict->ref = ref; evict->score = __evict_read_gen(evict); return (true); } /* * __evict_walk_file -- * Get a few page eviction candidates from a single underlying file. */ static int __evict_walk_file(WT_SESSION_IMPL *session, uint32_t queue_index, u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_EVICT_QUEUE *queue; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *ref; uint64_t btree_inuse, bytes_per_slot, cache_inuse; uint64_t pages_seen, refs_walked; uint32_t remaining_slots, target_pages, total_slots, walk_flags; int internal_pages, restarts; bool enough, modified; conn = S2C(session); btree = S2BT(session); cache = conn->cache; queue = &cache->evict_queues[queue_index]; internal_pages = restarts = 0; enough = false; /* * Figure out how many slots to fill from this tree. 
* Note that some care is taken in the calculation to avoid overflow. */ start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; btree_inuse = __wt_btree_bytes_inuse(session); cache_inuse = __wt_cache_bytes_inuse(cache); total_slots = max_entries - queue->evict_entries; /* * The target number of pages for this tree is proportional to the * space it is taking up in cache. Round to the nearest number of * slots so we assign all of the slots to a tree filling 99+% of the * cache (and only have to walk it once). */ bytes_per_slot = cache_inuse / total_slots; target_pages = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); if (target_pages == 0) { /* * Randomly walk trees with a tiny fraction of the cache in * case there are so many trees that none of them use enough of * the cache to be allocated slots. * * Map a random number into the range [0..1], and if the result * is greater than the fraction of the cache used by this tree, * give up. In other words, there is a small chance we will * visit trees that use a small fraction of the cache. Arrange * this calculation to avoid overflow (e.g., don't multiply * anything by UINT32_MAX). */ if (__wt_random(&session->rnd) / (double)UINT32_MAX > btree_inuse / (double)cache_inuse) return (0); target_pages = 10; } if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || target_pages > remaining_slots) target_pages = remaining_slots; end = start + target_pages; walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; if (F_ISSET(cache, WT_CACHE_WALK_REVERSE)) walk_flags |= WT_READ_PREV; /* * Get some more eviction candidate pages. * * !!! Take care terminating this loop. * * Don't make an extra call to __wt_tree_walk after we hit the end of a * tree: that will leave a page pinned, which may prevent any work from * being done. * * Once we hit the page limit, do one more step through the walk in * case we are appending and only the last page in the file is live. 
*/ for (evict = start, pages_seen = refs_walked = 0; evict < end && !enough && (ret == 0 || ret == WT_NOTFOUND); ret = __wt_tree_walk_count( session, &btree->evict_ref, &refs_walked, walk_flags)) { enough = refs_walked > cache->evict_max_refs_per_file; if ((ref = btree->evict_ref) == NULL) { if (++restarts == 2 || enough) break; WT_STAT_FAST_CONN_INCR( session, cache_eviction_walks_started); continue; } ++pages_seen; /* Ignore root pages entirely. */ if (__wt_ref_is_root(ref)) continue; page = ref->page; modified = __wt_page_is_modified(page); /* * Use the EVICT_LRU flag to avoid putting pages onto the list * multiple times. */ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) continue; /* * It's possible (but unlikely) to visit a page without a read * generation, if we race with the read instantiating the page. * Set the page's read generation here to ensure a bug doesn't * somehow leave a page without a read generation. */ if (page->read_gen == WT_READGEN_NOTSET) __wt_cache_read_gen_new(session, page); /* Pages we no longer need (clean or dirty), are found money. */ if (page->read_gen == WT_READGEN_OLDEST) { WT_STAT_FAST_CONN_INCR( session, cache_eviction_pages_queued_oldest); goto fast; } if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || FLD_ISSET(cache->state, WT_EVICT_STATE_AGGRESSIVE)) goto fast; /* Skip clean pages if appropriate. */ if (!modified && (F_ISSET(conn, WT_CONN_IN_MEMORY) || !FLD_ISSET(cache->state, WT_EVICT_STATE_CLEAN))) continue; /* Skip dirty pages if appropriate. */ if (modified && !FLD_ISSET(cache->state, WT_EVICT_STATE_DIRTY)) continue; /* Limit internal pages to 50% of the total. */ if (WT_PAGE_IS_INTERNAL(page) && internal_pages >= (int)(evict - start) / 2) continue; fast: /* If the page can't be evicted, give up. */ if (!__wt_page_can_evict(session, ref, NULL)) continue; /* * Note: take care with ordering: if we detected that * the page is modified above, we expect mod != NULL. 
*/ mod = page->modify; /* * Additional tests if eviction is likely to succeed. * * If eviction is stuck or we are helping with forced eviction, * try anyway: maybe a transaction that was running last time * we wrote the page has since rolled back, or we can help the * checkpoint complete sooner. Additionally, being stuck will * configure lookaside table writes in reconciliation, allowing * us to evict pages we can't usually evict. */ if (!FLD_ISSET(cache->state, WT_EVICT_STATE_AGGRESSIVE)) { /* * If the page is clean but has modifications that * appear too new to evict, skip it. */ if (!modified && mod != NULL && !__wt_txn_visible_all(session, mod->rec_max_txn)) continue; } WT_ASSERT(session, evict->ref == NULL); if (!__evict_push_candidate(session, queue, evict, ref)) continue; ++evict; if (WT_PAGE_IS_INTERNAL(page)) ++internal_pages; WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" PRIu64, page, page->memory_footprint)); } WT_RET_NOTFOUND_OK(ret); *slotp += (u_int)(evict - start); WT_STAT_FAST_CONN_INCRV( session, cache_eviction_pages_queued, (u_int)(evict - start)); /* * If we happen to end up on the root page, clear it. We have to track * hazard pointers, and the root page complicates that calculation. * * Likewise if we found no new candidates during the walk: there is no * point keeping a page pinned, since it may be the only candidate in an * idle tree. * * If we land on a page requiring forced eviction, move on to the next * page: we want this page evicted as quickly as possible. 
*/ if ((ref = btree->evict_ref) != NULL) { if (__wt_ref_is_root(ref) || evict == start) WT_RET(__evict_clear_walk(session)); else if (ref->page->read_gen == WT_READGEN_OLDEST) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( session, &btree->evict_ref, &refs_walked, walk_flags)); } WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, refs_walked); WT_STAT_FAST_CONN_INCRV(session, cache_eviction_pages_seen, pages_seen); return (0); } /* * __evict_check_entry_size -- * Check if the size of an entry is too large for this thread to evict. * We use this so that the server thread doesn't get stalled evicting * a very large page. */ static bool __evict_check_entry_size(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *entry) { WT_CACHE *cache; WT_PAGE *page; WT_REF *ref; uint64_t max; cache = S2C(session)->cache; if (cache->pages_evict == 0 || cache->bytes_evict < WT_MEGABYTE) return (true); max = (cache->bytes_evict / cache->pages_evict) * 4; if ((ref = entry->ref) != NULL) { if ((page = ref->page) == NULL) return (true); /* * If this page is dirty and more than four times the average * evicted page size then return false. Return true in all * other cases. */ if (__wt_page_is_modified(page) && page->memory_footprint > max) { WT_STAT_FAST_CONN_INCR( session, cache_eviction_server_toobig); return (false); } } return (true); } /* * __evict_get_ref -- * Get a page for eviction. */ static int __evict_get_ref( WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_REF **refp) { WT_CACHE *cache; WT_EVICT_ENTRY *evict; WT_EVICT_QUEUE *queue, *urgent_queue; uint32_t candidates; cache = S2C(session)->cache; urgent_queue = &cache->evict_queues[WT_EVICT_URGENT_QUEUE]; *btreep = NULL; *refp = NULL; /* Avoid the LRU lock if no pages are available. 
*/ WT_STAT_FAST_CONN_INCR(session, cache_eviction_get_ref); if (cache->evict_current_queue->evict_current == NULL && urgent_queue->evict_current == NULL) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_get_ref_empty); return (WT_NOTFOUND); } __wt_spin_lock(session, &cache->evict_queue_lock); /* Check the urgent queue first. */ queue = urgent_queue->evict_current != NULL && (FLD_ISSET(cache->state, WT_EVICT_STATE_AGGRESSIVE) || (F_ISSET(session, WT_SESSION_INTERNAL) && (!is_server || S2C(session)->evict_workers <= 1))) ? urgent_queue : cache->evict_current_queue; __wt_spin_unlock(session, &cache->evict_queue_lock); /* * Only evict half of the pages before looking for more. The remainder * are left to eviction workers (if configured), or application threads * if necessary. */ candidates = queue->evict_candidates; if (is_server && queue != urgent_queue && candidates > 1) candidates /= 2; /* * We got the queue lock, which should be fast, and chose a queue. * Now we want to get the lock on the individual queue. */ for (;;) { /* Verify there are still pages available. */ if (queue->evict_current == NULL || (uint32_t) (queue->evict_current - queue->evict_queue) >= candidates) { WT_STAT_FAST_CONN_INCR( session, cache_eviction_get_ref_empty2); return (WT_NOTFOUND); } if (!is_server) __wt_spin_lock(session, &queue->evict_lock); else if (__wt_spin_trylock(session, &queue->evict_lock) != 0) continue; break; } /* Get the next page queued for eviction. */ for (evict = queue->evict_current; evict >= queue->evict_queue && evict < queue->evict_queue + candidates; ++evict) { if (evict->ref == NULL) continue; WT_ASSERT(session, evict->btree != NULL); /* * If the server is helping out and encounters an entry that is * too large, it stops helping. Evicting a very large page in * the server thread could stall eviction from finding new * work. * * However, we can't skip entries in the urgent queue or they * may never be found again. 
*/ if (is_server && queue != urgent_queue && S2C(session)->evict_workers > 1 && !__evict_check_entry_size(session, evict)) { --evict; break; } /* * Lock the page while holding the eviction mutex to prevent * multiple attempts to evict it. For pages that are already * being evicted, this operation will fail and we will move on. */ if (!__wt_atomic_casv32( &evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) { __evict_list_clear(session, evict); continue; } /* * Increment the busy count in the btree handle to prevent it * from being closed under us. */ (void)__wt_atomic_addv32(&evict->btree->evict_busy, 1); *btreep = evict->btree; *refp = evict->ref; /* * Remove the entry so we never try to reconcile the same page * on reconciliation error. */ __evict_list_clear(session, evict); break; } /* Move to the next item. */ if (evict != NULL && evict + 1 < queue->evict_queue + queue->evict_candidates) queue->evict_current = evict + 1; else /* Clear the current pointer if there are no more candidates. */ queue->evict_current = NULL; __wt_spin_unlock(session, &queue->evict_lock); return ((*refp == NULL) ? WT_NOTFOUND : 0); } /* * __evict_page -- * Called by both eviction and application threads to evict a page. */ static int __evict_page(WT_SESSION_IMPL *session, bool is_server) { WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_REF *ref; WT_RET(__evict_get_ref(session, is_server, &btree, &ref)); WT_ASSERT(session, ref->state == WT_REF_LOCKED); cache = S2C(session)->cache; /* * An internal session flags either the server itself or an eviction * worker thread. 
*/ if (is_server) { WT_STAT_FAST_CONN_INCR( session, cache_eviction_server_evicting); cache->server_evicts++; } else if (F_ISSET(session, WT_SESSION_INTERNAL)) { WT_STAT_FAST_CONN_INCR( session, cache_eviction_worker_evicting); cache->worker_evicts++; } else { if (__wt_page_is_modified(ref->page)) WT_STAT_FAST_CONN_INCR( session, cache_eviction_app_dirty); WT_STAT_FAST_CONN_INCR(session, cache_eviction_app); cache->app_evicts++; } /* * In case something goes wrong, don't pick the same set of pages every * time. * * We used to bump the page's read generation only if eviction failed, * but that isn't safe: at that point, eviction has already unlocked * the page and some other thread may have evicted it by the time we * look at it. */ __wt_cache_read_gen_bump(session, ref->page); WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false)); (void)__wt_atomic_subv32(&btree->evict_busy, 1); return (ret); } /* * __wt_cache_eviction_worker -- * Worker function for __wt_cache_eviction_check: evict pages if the cache * crosses its boundaries. */ int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; uint64_t init_evict_count, max_pages_evicted; bool txn_busy; conn = S2C(session); cache = conn->cache; /* * If the current transaction is keeping the oldest ID pinned, it is in * the middle of an operation. This may prevent the oldest ID from * moving forward, leading to deadlock, so only evict what we can. * Otherwise, we are at a transaction boundary and we can work harder * to make sure there is free space in the cache. 
*/ txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); txn_busy = txn_state->id != WT_TXN_NONE || session->nhazard > 0 || (txn_state->snap_min != WT_TXN_NONE && txn_global->current != txn_global->oldest_id); if (txn_busy && pct_full < 100) return (0); if (busy) txn_busy = true; /* Wake the eviction server if we need to do work. */ WT_RET(__wt_evict_server_wake(session)); /* * If we're busy, either because of the transaction check we just did, * or because our caller is waiting on a longer-than-usual event (such * as a page read), limit the work to a single eviction and return. If * that's not the case, we can do more. */ init_evict_count = cache->pages_evict; for (;;) { max_pages_evicted = txn_busy ? 5 : 20; /* * A pathological case: if we're the oldest transaction in the * system and the eviction server is stuck trying to find space, * abort the transaction to give up all hazard pointers before * trying again. */ if (F_ISSET(cache, WT_CACHE_STUCK) && __wt_txn_am_oldest(session)) { F_CLR(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR(session, txn_fail_cache); return (WT_ROLLBACK); } /* See if eviction is still needed. */ if (!__wt_eviction_needed(session, &pct_full) || (pct_full < 100 && cache->pages_evict > init_evict_count + max_pages_evicted)) return (0); /* Evict a page. */ switch (ret = __evict_page(session, false)) { case 0: if (txn_busy) return (0); /* FALLTHROUGH */ case EBUSY: break; case WT_NOTFOUND: /* Allow the queue to re-populate before retrying. */ WT_RET(__wt_cond_wait( session, cache->evict_waiter_cond, 100000)); cache->app_waits++; break; default: return (ret); } /* Check if we have become busy. */ if (!txn_busy && txn_state->snap_min != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) txn_busy = true; } /* NOTREACHED */ } /* * __wt_page_evict_soon -- * Set a page to be evicted as soon as possible. 
*/ int __wt_page_evict_soon(WT_SESSION_IMPL *session, WT_REF *ref) { WT_CACHE *cache; WT_EVICT_ENTRY *evict; WT_EVICT_QUEUE *urgent_queue; WT_PAGE *page; bool queued; /* Root pages should never be evicted via LRU. */ WT_ASSERT(session, !__wt_ref_is_root(ref)); page = ref->page; page->read_gen = WT_READGEN_OLDEST; if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) || F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) return (0); /* Append to the urgent queue if we can. */ cache = S2C(session)->cache; urgent_queue = &cache->evict_queues[WT_EVICT_URGENT_QUEUE]; queued = false; __wt_spin_lock(session, &cache->evict_queue_lock); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) || F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) goto done; __wt_spin_lock(session, &urgent_queue->evict_lock); if (urgent_queue->evict_current == NULL) { urgent_queue->evict_current = urgent_queue->evict_queue; urgent_queue->evict_candidates = 0; } evict = urgent_queue->evict_queue + urgent_queue->evict_candidates; if (evict < urgent_queue->evict_queue + WT_EVICT_QUEUE_MAX && __evict_push_candidate(session, urgent_queue, evict, ref)) { ++urgent_queue->evict_candidates; queued = true; } __wt_spin_unlock(session, &urgent_queue->evict_lock); done: __wt_spin_unlock(session, &cache->evict_queue_lock); if (queued) { WT_STAT_FAST_CONN_INCR( session, cache_eviction_pages_queued_urgent); if (S2C(session)->evict_workers > 1) WT_RET(__wt_cond_signal( session, cache->evict_waiter_cond)); else WT_RET(__wt_evict_server_wake(session)); } return (0); } /* * __wt_evict_priority_set -- * Set a tree's eviction priority. */ void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) { S2BT(session)->evict_priority = v; } /* * __wt_evict_priority_clear -- * Clear a tree's eviction priority. 
*/ void __wt_evict_priority_clear(WT_SESSION_IMPL *session) { S2BT(session)->evict_priority = 0; } #ifdef HAVE_DIAGNOSTIC /* * __wt_cache_dump -- * Dump debugging information to a file (default stderr) about the size of * the files in the cache. */ int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) { FILE *fp; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle, *saved_dhandle; WT_PAGE *page; WT_REF *next_walk; uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages; uint64_t total_bytes, total_dirty_bytes; size_t size; conn = S2C(session); total_bytes = total_dirty_bytes = 0; if (ofile == NULL) fp = stderr; else if ((fp = fopen(ofile, "w")) == NULL) return (EIO); /* Note: odd string concatenation avoids spelling errors. */ (void)fprintf(fp, "==========\n" "cache dump\n"); saved_dhandle = session->dhandle; TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; next_walk = NULL; session->dhandle = dhandle; while (__wt_tree_walk(session, &next_walk, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && next_walk != NULL) { page = next_walk->page; size = page->memory_footprint; if (WT_PAGE_IS_INTERNAL(page)) { ++intl_pages; intl_bytes += size; intl_bytes_max = WT_MAX(intl_bytes_max, size); if (__wt_page_is_modified(page)) { ++intl_dirty_pages; intl_dirty_bytes += size; intl_dirty_bytes_max = WT_MAX(intl_dirty_bytes_max, size); } } else { ++leaf_pages; leaf_bytes += size; leaf_bytes_max = WT_MAX(leaf_bytes_max, size); if (__wt_page_is_modified(page)) { ++leaf_dirty_pages; 
leaf_dirty_bytes += size; leaf_dirty_bytes_max = WT_MAX(leaf_dirty_bytes_max, size); } } } session->dhandle = NULL; if (dhandle->checkpoint == NULL) (void)fprintf(fp, "%s(): \n", dhandle->name); else (void)fprintf(fp, "%s(checkpoint=%s): \n", dhandle->name, dhandle->checkpoint); if (intl_pages != 0) (void)fprintf(fp, "\t" "internal: " "%" PRIu64 " pages, " "%" PRIu64 "MB, " "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " "%" PRIu64 "MB max page, " "%" PRIu64 "MB max dirty page\n", intl_pages, intl_bytes >> 20, intl_pages - intl_dirty_pages, intl_dirty_pages, (intl_bytes - intl_dirty_bytes) >> 20, intl_dirty_bytes >> 20, intl_bytes_max >> 20, intl_dirty_bytes_max >> 20); if (leaf_pages != 0) (void)fprintf(fp, "\t" "leaf: " "%" PRIu64 " pages, " "%" PRIu64 "MB, " "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " "%" PRIu64 "MB max page, " "%" PRIu64 "MB max dirty page\n", leaf_pages, leaf_bytes >> 20, leaf_pages - leaf_dirty_pages, leaf_dirty_pages, (leaf_bytes - leaf_dirty_bytes) >> 20, leaf_dirty_bytes >> 20, leaf_bytes_max >> 20, leaf_dirty_bytes_max >> 20); total_bytes += intl_bytes + leaf_bytes; total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes; } session->dhandle = saved_dhandle; /* * Apply the overhead percentage so our total bytes are comparable with * the tracked value. */ if (conn->cache->overhead_pct != 0) total_bytes += (total_bytes * (uint64_t)conn->cache->overhead_pct) / 100; (void)fprintf(fp, "cache dump: " "total found = %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n" "total dirty bytes = %" PRIu64 "MB\n", total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20, total_dirty_bytes >> 20); (void)fprintf(fp, "==========\n"); if (ofile != NULL && fclose(fp) != 0) return (EIO); return (0); } #endif