/*- * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #include "wt_internal.h" static int __evict_clear_all_walks(WT_SESSION_IMPL *); static int WT_CDECL __evict_lru_cmp(const void *, const void *); static int __evict_lru_pages(WT_SESSION_IMPL *, bool); static int __evict_lru_walk(WT_SESSION_IMPL *); static int __evict_page(WT_SESSION_IMPL *, bool); static int __evict_pass(WT_SESSION_IMPL *); static int __evict_server(WT_SESSION_IMPL *, bool *); static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *); static int __evict_walk_file( WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *); #define WT_EVICT_HAS_WORKERS(s) \ (S2C(s)->evict_threads.current_threads > 1) /* * __evict_entry_priority -- * Get the adjusted read generation for an eviction entry. */ static inline uint64_t __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_PAGE *page; uint64_t read_gen; btree = S2BT(session); page = ref->page; /* Any page set to the oldest generation should be discarded. */ if (page->read_gen == WT_READGEN_OLDEST) return (WT_READGEN_OLDEST); /* * Any leaf page from a dead tree is a great choice (not internal pages, * they may have children and are not yet evictable). */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) return (WT_READGEN_OLDEST); /* Any empty page (leaf or internal), is a good choice. */ if (__wt_page_is_empty(page)) return (WT_READGEN_OLDEST); /* Any large page in memory is likewise a good choice. */ if (page->memory_footprint > btree->splitmempage) return (WT_READGEN_OLDEST); /* * The base read-generation is skewed by the eviction priority. * Internal pages are also adjusted, we prefer to evict leaf pages. */ if (page->modify != NULL && F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY) && !F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_CLEAN)) read_gen = page->modify->update_txn; else read_gen = page->read_gen; read_gen += btree->evict_priority; if (WT_PAGE_IS_INTERNAL(page)) read_gen += WT_EVICT_INT_SKEW; return (read_gen); } /* * __evict_lru_cmp -- * Qsort function: sort the eviction array. */ static int WT_CDECL __evict_lru_cmp(const void *a_arg, const void *b_arg) { const WT_EVICT_ENTRY *a = a_arg, *b = b_arg; uint64_t a_score, b_score; a_score = (a->ref == NULL ? UINT64_MAX : a->score); b_score = (b->ref == NULL ? UINT64_MAX : b->score); return ((a_score < b_score) ? -1 : (a_score == b_score) ? 0 : 1); } /* * __evict_list_clear -- * Clear an entry in the LRU eviction list. */ static inline void __evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e) { if (e->ref != NULL) { WT_ASSERT(session, F_ISSET_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU)); F_CLR_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU); } e->ref = NULL; e->btree = WT_DEBUG_POINT; } /* * __wt_evict_list_clear_page -- * Make sure a page is not in the LRU eviction list. This called from the * page eviction code to make sure there is no attempt to evict a child * page multiple times. */ void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref) { WT_CACHE *cache; WT_EVICT_ENTRY *evict; uint32_t i, elem, q; bool found; WT_ASSERT(session, __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED); /* Fast path: if the page isn't on the queue, don't bother searching. 
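 * The WT_PAGE_EVICT_LRU flag is set when a page is pushed onto a queue and
 * cleared when its entry is removed, so a page with the flag clear cannot
 * have a queue entry pointing at it and the search can be skipped.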
*/ if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)) return; cache = S2C(session)->cache; __wt_spin_lock(session, &cache->evict_queue_lock); found = false; for (q = 0; q < WT_EVICT_QUEUE_MAX && !found; q++) { __wt_spin_lock(session, &cache->evict_queues[q].evict_lock); elem = cache->evict_queues[q].evict_max; for (i = 0, evict = cache->evict_queues[q].evict_queue; i < elem; i++, evict++) if (evict->ref == ref) { found = true; __evict_list_clear(session, evict); break; } __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); __wt_spin_unlock(session, &cache->evict_queue_lock); } /* * __evict_queue_empty -- * Is the queue empty? * * Note that the eviction server is pessimistic and treats a half full * queue as empty. */ static inline bool __evict_queue_empty(WT_EVICT_QUEUE *queue, bool server_check) { uint32_t candidates, used; if (queue->evict_current == NULL) return (true); /* The eviction server only considers half of the candidates. */ candidates = queue->evict_candidates; if (server_check && candidates > 1) candidates /= 2; used = (uint32_t)(queue->evict_current - queue->evict_queue); return (used >= candidates); } /* * __evict_queue_full -- * Is the queue full (i.e., it has been populated with candidates and none * of them have been evicted yet)? */ static inline bool __evict_queue_full(WT_EVICT_QUEUE *queue) { return (queue->evict_current == queue->evict_queue && queue->evict_candidates != 0); } /* * __wt_evict_server_wake -- * Wake the eviction server thread. */ void __wt_evict_server_wake(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; conn = S2C(session); cache = conn->cache; #ifdef HAVE_VERBOSE if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) { uint64_t bytes_inuse, bytes_max; bytes_inuse = __wt_cache_bytes_inuse(cache); bytes_max = conn->cache_size; __wt_verbose(session, WT_VERB_EVICTSERVER, "waking, bytes inuse %s max (%" PRIu64 "MB %s %" PRIu64 "MB)", bytes_inuse <= bytes_max ? "<=" : ">", bytes_inuse / WT_MEGABYTE, bytes_inuse <= bytes_max ? "<=" : ">", bytes_max / WT_MEGABYTE); } #endif __wt_cond_auto_signal(session, cache->evict_cond); } /* * __wt_evict_thread_run -- * Starting point for an eviction thread. */ int __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; bool did_work; conn = S2C(session); cache = conn->cache; #ifdef HAVE_DIAGNOSTIC /* * Ensure the cache stuck timer is initialized when starting eviction. */ if (thread->id == 0) __wt_epoch(session, &cache->stuck_ts); #endif while (F_ISSET(conn, WT_CONN_EVICTION_RUN) && F_ISSET(thread, WT_THREAD_RUN)) { if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) { /* * Cannot use WT_WITH_PASS_LOCK because this is a try * lock. Fix when that is supported. We set the flag * on both sessions because we may call clear_walk when * we are walking with the walk session, locked. */ F_SET(session, WT_SESSION_LOCKED_PASS); F_SET(cache->walk_session, WT_SESSION_LOCKED_PASS); ret = __evict_server(session, &did_work); F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS); F_CLR(session, WT_SESSION_LOCKED_PASS); __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); __wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"); /* Don't rely on signals: check periodically. 
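 * Passing did_work also lets the auto-adjusting condition wait back off
 * its timeout when recent passes found nothing to do.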
*/ __wt_cond_auto_wait( session, cache->evict_cond, did_work); __wt_verbose(session, WT_VERB_EVICTSERVER, "waking"); } else WT_ERR(__evict_lru_pages(session, false)); } /* * The only time the first eviction thread is stopped is on shutdown: * in case any trees are still open, clear all walks now so that they * can be closed. */ if (thread->id == 0) { WT_WITH_PASS_LOCK(session, ret = __evict_clear_all_walks(session)); WT_ERR(ret); /* * The only two cases when the eviction server is expected to * stop are when recovery is finished or when the connection is * closing. */ WT_ASSERT(session, F_ISSET(conn, WT_CONN_CLOSING | WT_CONN_RECOVERING)); } __wt_verbose( session, WT_VERB_EVICTSERVER, "cache eviction thread exiting"); if (0) { err: WT_PANIC_MSG(session, ret, "cache eviction thread error"); } return (ret); } /* * __evict_server -- * Thread to evict pages from the cache. */ static int __evict_server(WT_SESSION_IMPL *session, bool *did_work) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; #ifdef HAVE_DIAGNOSTIC struct timespec now; #endif uint64_t orig_pages_evicted; u_int spins; conn = S2C(session); cache = conn->cache; WT_ASSERT(session, did_work != NULL); *did_work = false; orig_pages_evicted = cache->pages_evicted; /* Evict pages from the cache as needed. */ WT_RET(__evict_pass(session)); if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) return (0); /* * Clear the walks so we don't pin pages while asleep, * otherwise we can block applications evicting large pages. */ if (!__wt_cache_stuck(session)) { for (spins = 0; (ret = __wt_spin_trylock( session, &conn->dhandle_lock)) == EBUSY && cache->pass_intr == 0; spins++) { if (spins < WT_THOUSAND) __wt_yield(); else __wt_sleep(0, WT_THOUSAND); } /* * If we gave up acquiring the lock, that indicates a * session is waiting for us to clear walks. Do that * as part of a normal pass (without the handle list * lock) to avoid deadlock. */ if (ret == EBUSY) return (0); WT_RET(ret); ret = __evict_clear_all_walks(session); __wt_spin_unlock(session, &conn->dhandle_lock); WT_RET(ret); cache->pages_evicted = 0; } else if (cache->pages_evicted != cache->pages_evict) { cache->pages_evicted = cache->pages_evict; #ifdef HAVE_DIAGNOSTIC __wt_epoch(session, &cache->stuck_ts); } else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) { /* * After being stuck for 5 minutes, give up. * * We don't do this check for in-memory workloads because * application threads are not blocked by the cache being full. * If the cache becomes full of clean pages, we can be * servicing reads while the cache appears stuck to eviction. */ __wt_epoch(session, &now); if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) { ret = ETIMEDOUT; __wt_err(session, ret, "Cache stuck for too long, giving up"); WT_TRET(__wt_cache_dump(session, NULL)); return (ret); } #endif } *did_work = cache->pages_evicted != orig_pages_evicted; return (0); } /* * __wt_evict_create -- * Start the eviction server. */ int __wt_evict_create(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; conn = S2C(session); WT_ASSERT(session, conn->evict_threads_min > 0); /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_EVICTION_RUN); /* Create the eviction thread group */ WT_RET(__wt_thread_group_create(session, &conn->evict_threads, "eviction-server", conn->evict_threads_min, conn->evict_threads_max, WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, __wt_evict_thread_run)); /* * Allow queues to be populated now that the eviction threads * are running. 
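 * Application threads check evict_server_running as well (in
 * __wt_cache_eviction_worker) before they try to help with eviction.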
*/ conn->evict_server_running = true; return (0); } /* * __wt_evict_destroy -- * Destroy the eviction threads. */ int __wt_evict_destroy(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; conn = S2C(session); /* We are done if the eviction server didn't start successfully. */ if (!conn->evict_server_running) return (0); /* Wait for any eviction thread group changes to stabilize. */ __wt_writelock(session, conn->evict_threads.lock); /* * Signal the threads to finish and stop populating the queue. */ F_CLR(conn, WT_CONN_EVICTION_RUN); conn->evict_server_running = false; __wt_evict_server_wake(session); __wt_verbose( session, WT_VERB_EVICTSERVER, "waiting for helper threads"); /* * We call the destroy function still holding the write lock. * It assumes it is called locked. */ WT_RET(__wt_thread_group_destroy(session, &conn->evict_threads)); return (0); } /* * __evict_update_work -- * Configure eviction work state. */ static bool __evict_update_work(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; uint64_t bytes_inuse, bytes_max, dirty_inuse; conn = S2C(session); cache = conn->cache; /* Clear previous state. */ F_CLR(cache, WT_CACHE_EVICT_MASK); if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) return (false); if (!__evict_queue_empty(cache->evict_urgent_queue, false)) F_SET(cache, WT_CACHE_EVICT_URGENT); /* * If we need space in the cache, try to find clean pages to evict. * * Avoid division by zero if the cache size has not yet been set in a * shared cache. */ bytes_max = conn->cache_size + 1; bytes_inuse = __wt_cache_bytes_inuse(cache); if (__wt_eviction_clean_needed(session, NULL)) F_SET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); else if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) F_SET(cache, WT_CACHE_EVICT_CLEAN); dirty_inuse = __wt_cache_dirty_leaf_inuse(cache); if (__wt_eviction_dirty_needed(session, NULL)) F_SET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); else if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) F_SET(cache, WT_CACHE_EVICT_DIRTY); /* * If application threads are blocked by the total volume of data in * cache, try dirty pages as well. */ if (__wt_cache_aggressive(session) && F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) F_SET(cache, WT_CACHE_EVICT_DIRTY); /* * Scrub dirty pages and keep them in cache if we are less than half * way to the clean or dirty trigger. */ if (bytes_inuse < ((cache->eviction_target + cache->eviction_trigger) * bytes_max) / 200 && dirty_inuse < (uint64_t) ((cache->eviction_dirty_target + cache->eviction_dirty_trigger) * bytes_max) / 200) F_SET(cache, WT_CACHE_EVICT_SCRUB); /* * With an in-memory cache, we only do dirty eviction in order to scrub * pages. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) { if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) F_SET(cache, WT_CACHE_EVICT_DIRTY); if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) F_SET(cache, WT_CACHE_EVICT_DIRTY_HARD); F_CLR(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); } WT_STAT_CONN_SET(session, cache_eviction_state, F_MASK(cache, WT_CACHE_EVICT_MASK)); return (F_ISSET(cache, WT_CACHE_EVICT_ALL | WT_CACHE_EVICT_URGENT)); } /* * __evict_pass -- * Evict pages from memory. */ static int __evict_pass(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_TXN_GLOBAL *txn_global; struct timespec now, prev; uint64_t oldest_id, pages_evicted, prev_oldest_id; u_int loop; conn = S2C(session); cache = conn->cache; txn_global = &conn->txn_global; /* Track whether pages are being evicted and progress is made. 
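 * pages_evicted holds the eviction count as of the last iteration that made
 * progress; if it still matches cache->pages_evict at the bottom of the
 * loop below, no pages were evicted since then.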
*/ pages_evicted = cache->pages_evict; prev_oldest_id = txn_global->oldest_id; WT_CLEAR(prev); /* Evict pages from the cache. */ for (loop = 0; cache->pass_intr == 0; loop++) { __wt_epoch(session, &now); if (loop == 0) prev = now; /* * Increment the shared read generation. Do this occasionally * even if eviction is not currently required, so that pages * have some relative read generation when the eviction server * does need to do some work. */ __wt_cache_read_gen_incr(session); ++cache->evict_pass_gen; /* * Update the oldest ID: we use it to decide whether pages are * candidates for eviction. Without this, if all threads are * blocked after a long-running transaction (such as a * checkpoint) completes, we may never start evicting again. * * Do this every time the eviction server wakes up, regardless * of whether the cache is full, to prevent the oldest ID * falling too far behind. Don't wait to lock the table: with * highly threaded workloads, that creates a bottleneck. */ WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT)); if (!__evict_update_work(session)) break; /* * Try to start a new thread if we have capacity and haven't * reached the eviction targets. */ if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) WT_RET(__wt_thread_group_start_one( session, &conn->evict_threads, false)); __wt_verbose(session, WT_VERB_EVICTSERVER, "Eviction pass with: Max: %" PRIu64 " In use: %" PRIu64 " Dirty: %" PRIu64, conn->cache_size, cache->bytes_inmem, cache->bytes_dirty_intl + cache->bytes_dirty_leaf); if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) WT_RET(__evict_lru_walk(session)); /* * If the queue has been empty recently, keep queuing more * pages to evict. If the rate of queuing pages is high * enough, this score will go to zero, in which case the * eviction server might as well help out with eviction. * * Also, if there is a single eviction server thread with no * workers, it must service the urgent queue in case all * application threads are busy. */ if (cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF || (!WT_EVICT_HAS_WORKERS(session) && !__evict_queue_empty(cache->evict_urgent_queue, false))) WT_RET(__evict_lru_pages(session, true)); if (cache->pass_intr != 0) break; /* * If we're making progress, keep going; if we're not making * any progress at all, mark the cache "stuck" and go back to * sleep, it's not something we can fix. * * We check for progress every 20ms, the idea being that the * aggressive score will reach 10 after 200ms if we aren't * making progress and eviction will start considering more * pages. If there is still no progress after 2s, we will * treat the cache as stuck and start rolling back * transactions and writing updates to the lookaside table. */ if (pages_evicted == cache->pages_evict) { if (WT_TIMEDIFF_MS(now, prev) >= 20 && F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD | WT_CACHE_EVICT_DIRTY_HARD)) { if (cache->evict_aggressive_score < 100) ++cache->evict_aggressive_score; oldest_id = txn_global->oldest_id; if (prev_oldest_id == oldest_id && txn_global->current != oldest_id && cache->evict_aggressive_score < 100) ++cache->evict_aggressive_score; WT_STAT_CONN_SET(session, cache_eviction_aggressive_set, cache->evict_aggressive_score); prev = now; prev_oldest_id = oldest_id; } /* * Keep trying for long enough that we should be able * to evict a page if the server isn't interfering. 
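 * Giving up requires both at least 100 iterations of this loop and an
 * aggressive score of 100; since the score is only bumped when 20ms have
 * passed without progress, reaching 100 takes on the order of the two
 * seconds described above.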
*/ if (loop < 100 || cache->evict_aggressive_score < 100) { /* * Back off if we aren't making progress: walks * hold the handle list lock, blocking other * operations that can free space in cache, * such as LSM discarding handles. * * Allow this wait to be interrupted (e.g. if a * checkpoint completes): make sure we wait for * a non-zero number of microseconds). */ WT_STAT_CONN_INCR(session, cache_eviction_server_slept); __wt_cond_wait( session, cache->evict_cond, WT_THOUSAND); continue; } WT_STAT_CONN_INCR(session, cache_eviction_slow); __wt_verbose(session, WT_VERB_EVICTSERVER, "unable to reach eviction goal"); break; } else { if (cache->evict_aggressive_score > 0) { --cache->evict_aggressive_score; WT_STAT_CONN_SET(session, cache_eviction_aggressive_set, cache->evict_aggressive_score); } loop = 0; pages_evicted = cache->pages_evict; } } return (0); } /* * __evict_clear_walk -- * Clear a single walk point. */ static int __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) { WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_REF *ref; btree = S2BT(session); cache = S2C(session)->cache; WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_PASS)); if (session->dhandle == cache->evict_file_next) cache->evict_file_next = NULL; if ((ref = btree->evict_ref) == NULL) return (0); if (count_stat) WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); /* * Clear evict_ref first, in case releasing it forces eviction (we * assert we never try to evict the current eviction walk point). */ btree->evict_ref = NULL; WT_WITH_DHANDLE(cache->walk_session, session->dhandle, (ret = __wt_page_release(cache->walk_session, ref, WT_READ_NO_EVICT))); return (ret); } /* * __evict_clear_all_walks -- * Clear the eviction walk points for all files a session is waiting on. */ static int __evict_clear_all_walks(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; conn = S2C(session); TAILQ_FOREACH(dhandle, &conn->dhqh, q) if (WT_PREFIX_MATCH(dhandle->name, "file:")) WT_WITH_DHANDLE(session, dhandle, WT_TRET(__evict_clear_walk(session, true))); return (ret); } /* * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's * blocks queued for eviction. */ int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_EVICT_ENTRY *evict; u_int i, elem, q; btree = S2BT(session); cache = S2C(session)->cache; /* * Hold the walk lock to set the no-eviction flag. * * The no-eviction flag can be set permanently, in which case we never * increment the no-eviction count. */ __wt_spin_lock(session, &cache->evict_walk_lock); if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { if (btree->evict_disabled != 0) ++btree->evict_disabled; __wt_spin_unlock(session, &cache->evict_walk_lock); return (0); } ++btree->evict_disabled; /* * Ensure no new pages from the file will be queued for eviction after * this point. */ F_SET(btree, WT_BTREE_NO_EVICTION); (void)__wt_atomic_addv32(&cache->pass_intr, 1); /* Clear any existing LRU eviction walk for the file. */ WT_WITH_PASS_LOCK(session, ret = __evict_clear_walk(session, true)); (void)__wt_atomic_subv32(&cache->pass_intr, 1); WT_ERR(ret); /* * The eviction candidate list might reference pages from the file, * clear it. Hold the evict lock to remove queued pages from a file. 
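 * Entries are cleared in place rather than compacted: __evict_get_ref
 * skips entries with a NULL ref, so holes left in a queue are harmless.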
*/ __wt_spin_lock(session, &cache->evict_queue_lock); for (q = 0; q < WT_EVICT_QUEUE_MAX; q++) { __wt_spin_lock(session, &cache->evict_queues[q].evict_lock); elem = cache->evict_queues[q].evict_max; for (i = 0, evict = cache->evict_queues[q].evict_queue; i < elem; i++, evict++) if (evict->btree == btree) __evict_list_clear(session, evict); __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } __wt_spin_unlock(session, &cache->evict_queue_lock); /* * We have disabled further eviction: wait for concurrent LRU eviction * activity to drain. */ while (btree->evict_busy > 0) __wt_yield(); if (0) { err: --btree->evict_disabled; F_CLR(btree, WT_BTREE_NO_EVICTION); } __wt_spin_unlock(session, &cache->evict_walk_lock); return (ret); } /* * __wt_evict_file_exclusive_off -- * Release exclusive eviction access to a file. */ void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; btree = S2BT(session); cache = S2C(session)->cache; /* * We have seen subtle bugs with multiple threads racing to turn * eviction on/off. Make races more likely in diagnostic builds. */ WT_DIAGNOSTIC_YIELD; WT_ASSERT(session, btree->evict_ref == NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION)); /* * The no-eviction flag can be set permanently, in which case we never * increment the no-eviction count. */ __wt_spin_lock(session, &cache->evict_walk_lock); if (btree->evict_disabled > 0 && --btree->evict_disabled == 0) F_CLR(btree, WT_BTREE_NO_EVICTION); __wt_spin_unlock(session, &cache->evict_walk_lock); } /* * __evict_lru_pages -- * Get pages from the LRU queue to evict. */ static int __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; conn = S2C(session); /* * Reconcile and discard some pages: EBUSY is returned if a page fails * eviction because it's unavailable, continue in that case. */ while (F_ISSET(S2C(session), WT_CONN_EVICTION_RUN) && ret == 0) if ((ret = __evict_page(session, is_server)) == EBUSY) ret = 0; /* If a worker thread found the queue empty, pause. */ if (ret == WT_NOTFOUND && !is_server && F_ISSET(S2C(session), WT_CONN_EVICTION_RUN)) __wt_cond_wait(session, conn->evict_threads.wait_cond, 10000); return (ret == WT_NOTFOUND ? 0 : ret); } /* * __evict_lru_walk -- * Add pages to the LRU queue to be evicted from cache. */ static int __evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; WT_EVICT_QUEUE *queue, *other_queue; uint64_t read_gen_oldest; uint32_t candidates, entries; cache = S2C(session)->cache; /* Age out the score of how much the queue has been empty recently. */ if (cache->evict_empty_score > 0) { --cache->evict_empty_score; WT_STAT_CONN_SET(session, cache_eviction_empty_score, cache->evict_empty_score); } /* Fill the next queue (that isn't the urgent queue). */ queue = cache->evict_fill_queue; other_queue = cache->evict_queues + (1 - (queue - cache->evict_queues)); cache->evict_fill_queue = other_queue; /* If this queue is full, try the other one. */ if (__evict_queue_full(queue) && !__evict_queue_full(other_queue)) queue = other_queue; /* * If both queues are full and haven't been empty on recent refills, * we're done. */ if (__evict_queue_full(queue) && cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF) return (0); /* Get some more pages to consider for eviction. */ if ((ret = __evict_walk(cache->walk_session, queue)) == EBUSY) return (0); /* An interrupt was requested, give up. 
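 * (__evict_walk returns EBUSY when cache->pass_intr is set, e.g. by a
 * thread that needs exclusive eviction access to a file.)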
*/ WT_RET_NOTFOUND_OK(ret); /* * If the queue we are filling is empty, pages are being requested * faster than they are being queued. */ if (__evict_queue_empty(queue, false)) { if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD | WT_CACHE_EVICT_DIRTY_HARD)) { cache->evict_empty_score = WT_MIN( cache->evict_empty_score + WT_EVICT_SCORE_BUMP, WT_EVICT_SCORE_MAX); WT_STAT_CONN_SET(session, cache_eviction_empty_score, cache->evict_empty_score); } WT_STAT_CONN_INCR(session, cache_eviction_queue_empty); } else WT_STAT_CONN_INCR(session, cache_eviction_queue_not_empty); /* Sort the list into LRU order and restart. */ __wt_spin_lock(session, &queue->evict_lock); /* * We have locked the queue: in the (unusual) case where we are filling * the current queue, mark it empty so that subsequent requests switch * to the other queue. */ if (queue == cache->evict_current_queue) queue->evict_current = NULL; entries = queue->evict_entries; qsort(queue->evict_queue, entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp); /* Trim empty entries from the end. */ while (entries > 0 && queue->evict_queue[entries - 1].ref == NULL) --entries; /* * If we have more entries than the maximum tracked between walks, * clear them. Do this before figuring out how many of the entries are * candidates so we never end up with more candidates than entries. */ while (entries > WT_EVICT_WALK_BASE) __evict_list_clear(session, &queue->evict_queue[--entries]); queue->evict_entries = entries; if (entries == 0) { /* * If there are no entries, there cannot be any candidates. * Make sure application threads don't read past the end of the * candidate list, or they may race with the next walk. */ queue->evict_candidates = 0; queue->evict_current = NULL; __wt_spin_unlock(session, &queue->evict_lock); return (0); } /* Decide how many of the candidates we're going to try and evict. */ if (__wt_cache_aggressive(session)) queue->evict_candidates = entries; else { /* * Find the oldest read generation apart that we have in the * queue, used to set the initial value for pages read into the * system. The queue is sorted, find the first "normal" * generation. */ read_gen_oldest = WT_READGEN_OLDEST; for (candidates = 0; candidates < entries; ++candidates) { read_gen_oldest = queue->evict_queue[candidates].score; if (read_gen_oldest != WT_READGEN_OLDEST) break; } /* * Take all candidates if we only gathered pages with an oldest * read generation set. * * We normally never take more than 50% of the entries but if * 50% of the entries were at the oldest read generation, take * all of them. */ if (read_gen_oldest == WT_READGEN_OLDEST) queue->evict_candidates = entries; else if (candidates > entries / 2) queue->evict_candidates = candidates; else { /* * Take all of the urgent pages plus a third of * ordinary candidates (which could be expressed as * WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the * steady state, we want to get as many candidates as * the eviction walk adds to the queue. * * That said, if there is only one entry, which is * normal when populating an empty file, don't exclude * it. */ queue->evict_candidates = 1 + candidates + ((entries - candidates) - 1) / 3; cache->read_gen_oldest = read_gen_oldest; } } queue->evict_current = queue->evict_queue; __wt_spin_unlock(session, &queue->evict_lock); /* * Signal any application or helper threads that may be waiting * to help with eviction. */ __wt_cond_signal(session, S2C(session)->evict_threads.wait_cond); return (0); } /* * __evict_walk -- * Fill in the array by walking the next set of pages. 
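 * Handles are visited in order, resuming from the handle saved in
 * cache->evict_file_next when a previous walk stopped partway through the
 * list, so no single tree monopolizes the queue.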
*/ static int __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) { WT_BTREE *btree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; u_int max_entries, retries, slot, spins, start_slot, total_candidates; bool dhandle_locked, incr; conn = S2C(session); cache = S2C(session)->cache; btree = NULL; dhandle = NULL; dhandle_locked = incr = false; retries = 0; /* * Set the starting slot in the queue and the maximum pages added * per walk. */ start_slot = slot = queue->evict_entries; max_entries = WT_MIN(slot + WT_EVICT_WALK_INCR, cache->evict_slots); /* * Another pathological case: if there are only a tiny number of * candidate pages in cache, don't put all of them on one queue. */ total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ? __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf); max_entries = WT_MIN(max_entries, 1 + total_candidates / 2); retry: while (slot < max_entries) { /* * If another thread is waiting on the eviction server to clear * the walk point in a tree, give up. */ if (cache->pass_intr != 0) WT_ERR(EBUSY); /* * Lock the dhandle list to find the next handle and bump its * reference count to keep it alive while we sweep. */ if (!dhandle_locked) { for (spins = 0; (ret = __wt_spin_trylock( session, &conn->dhandle_lock)) == EBUSY && cache->pass_intr == 0; spins++) { if (spins < WT_THOUSAND) __wt_yield(); else __wt_sleep(0, WT_THOUSAND); } WT_ERR(ret); dhandle_locked = true; } if (dhandle == NULL) { /* * On entry, continue from wherever we got to in the * scan last time through. If we don't have a saved * handle, start from the beginning of the list. */ if ((dhandle = cache->evict_file_next) != NULL) cache->evict_file_next = NULL; else dhandle = TAILQ_FIRST(&conn->dhqh); } else { if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32( &dhandle->session_inuse, 1); incr = false; cache->evict_file_next = NULL; } dhandle = TAILQ_NEXT(dhandle, q); } /* If we reach the end of the list, we're done. */ if (dhandle == NULL) break; /* Ignore non-file handles, or handles that aren't open. */ if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; /* Skip files that don't allow eviction. */ btree = dhandle->handle; if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) continue; /* * Skip files that are checkpointing if we are only looking for * dirty pages. */ if (btree->checkpointing != WT_CKPT_OFF && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) continue; /* * Skip files that are configured to stick in cache until we * become aggressive. */ if (btree->evict_priority != 0 && !__wt_cache_aggressive(session)) continue; /* Skip files if we have used all available hazard pointers. */ if (btree->evict_ref == NULL && session->nhazard >= conn->hazard_max - WT_MIN(conn->hazard_max / 2, 10)) continue; /* * If we are filling the queue, skip files that haven't been * useful in the past. */ if (btree->evict_walk_period != 0 && btree->evict_walk_skips++ < btree->evict_walk_period) continue; btree->evict_walk_skips = 0; (void)__wt_atomic_addi32(&dhandle->session_inuse, 1); incr = true; __wt_spin_unlock(session, &conn->dhandle_lock); dhandle_locked = false; /* * Re-check the "no eviction" flag, used to enforce exclusive * access when a handle is being closed. If not set, remember * the file to visit first, next loop. 
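 * (A thread closing the handle sets the flag in
 * __wt_evict_file_exclusive_on and then clears any walk point we hold in
 * the tree.)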
* * Only try to acquire the lock and simply continue if we fail; * the lock is held while the thread turning off eviction clears * the tree's current eviction point, and part of the process is * waiting on this thread to acknowledge that action. */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) && !__wt_spin_trylock(session, &cache->evict_walk_lock)) { if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { cache->evict_file_next = dhandle; WT_WITH_DHANDLE(session, dhandle, ret = __evict_walk_file(session, queue, max_entries, &slot)); WT_ASSERT(session, session->split_gen == 0); } __wt_spin_unlock(session, &cache->evict_walk_lock); WT_ERR(ret); } } if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32(&dhandle->session_inuse, 1); incr = false; } /* * Walk the list of files a few times if we don't find enough pages. * Try two passes through all the files, give up when we have some * candidates and we aren't finding more. */ if (slot < max_entries && (retries < 2 || (retries < 10 && (slot == queue->evict_entries || slot > start_slot)))) { start_slot = slot; ++retries; goto retry; } err: if (dhandle_locked) { __wt_spin_unlock(session, &conn->dhandle_lock); dhandle_locked = false; } /* * If we didn't find any entries on a walk when we weren't interrupted, * let our caller know. */ if (queue->evict_entries == slot && cache->pass_intr == 0) return (WT_NOTFOUND); queue->evict_entries = slot; return (ret); } /* * __evict_push_candidate -- * Initialize a WT_EVICT_ENTRY structure with a given page. */ static bool __evict_push_candidate(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, WT_EVICT_ENTRY *evict, WT_REF *ref) { u_int slot; uint8_t orig_flags, new_flags; /* * Threads can race to queue a page (e.g., an ordinary LRU walk can * race with a page being queued for urgent eviction). */ orig_flags = new_flags = ref->page->flags_atomic; FLD_SET(new_flags, WT_PAGE_EVICT_LRU); if (orig_flags == new_flags || !__wt_atomic_cas8(&ref->page->flags_atomic, orig_flags, new_flags)) return (false); /* Keep track of the maximum slot we are using. */ slot = (u_int)(evict - queue->evict_queue); if (slot >= queue->evict_max) queue->evict_max = slot + 1; if (evict->ref != NULL) __evict_list_clear(session, evict); evict->btree = S2BT(session); evict->ref = ref; evict->score = __evict_entry_priority(session, ref); /* Adjust for size when doing dirty eviction. */ if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY) && evict->score != WT_READGEN_OLDEST && evict->score != UINT64_MAX && !__wt_page_is_modified(ref->page)) evict->score += WT_MEGABYTE - WT_MIN(WT_MEGABYTE, ref->page->memory_footprint); return (true); } /* * __evict_walk_file -- * Get a few page eviction candidates from a single underlying file. */ static int __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *ref; WT_TXN_GLOBAL *txn_global; uint64_t btree_inuse, bytes_per_slot, cache_inuse, min_pages; uint64_t pages_seen, pages_queued, refs_walked; uint32_t remaining_slots, total_slots, walk_flags; uint32_t target_pages_clean, target_pages_dirty, target_pages; int internal_pages, restarts; bool give_up, modified, urgent_queued; conn = S2C(session); btree = S2BT(session); cache = conn->cache; txn_global = &conn->txn_global; internal_pages = restarts = 0; give_up = urgent_queued = false; /* * Figure out how many slots to fill from this tree. 
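 * Each tree's share is proportional to its share of the cache; as a rough
 * illustration (hypothetical numbers), a tree holding a quarter of the
 * cached bytes is assigned about a quarter of the slots available for this
 * walk.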
* Note that some care is taken in the calculation to avoid overflow. */ start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; total_slots = max_entries - queue->evict_entries; /* * The target number of pages for this tree is proportional to the * space it is taking up in cache. Round to the nearest number of * slots so we assign all of the slots to a tree filling 99+% of the * cache (and only have to walk it once). */ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { btree_inuse = __wt_btree_bytes_inuse(session); cache_inuse = __wt_cache_bytes_inuse(cache); bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_clean = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } else target_pages_clean = 0; if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) { btree_inuse = __wt_btree_dirty_leaf_inuse(session); cache_inuse = __wt_cache_dirty_leaf_inuse(cache); bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_dirty = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } else target_pages_dirty = 0; target_pages = WT_MAX(target_pages_clean, target_pages_dirty); if (target_pages == 0) { /* * Randomly walk trees with a tiny fraction of the cache in * case there are so many trees that none of them use enough of * the cache to be allocated slots. Walk small trees 1% of the * time. */ if (__wt_random(&session->rnd) > UINT32_MAX / 100) return (0); target_pages = 10; } if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || target_pages > remaining_slots) target_pages = remaining_slots; end = start + target_pages; walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; /* Randomize the walk direction. */ if (btree->evict_walk_reverse) FLD_SET(walk_flags, WT_READ_PREV); /* * Examine at least a reasonable number of pages before deciding * whether to give up. When we are only looking for dirty pages, * search the tree for longer. */ min_pages = 10 * target_pages; if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY) && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) min_pages *= 10; /* * Get some more eviction candidate pages. * * !!! Take care terminating this loop. * * Don't make an extra call to __wt_tree_walk after we hit the end of a * tree: that will leave a page pinned, which may prevent any work from * being done. * * Once we hit the page limit, do one more step through the walk in * case we are appending and only the last page in the file is live. */ for (evict = start, pages_queued = pages_seen = refs_walked = 0; evict < end && (ret == 0 || ret == WT_NOTFOUND); ret = __wt_tree_walk_count( session, &btree->evict_ref, &refs_walked, walk_flags)) { /* * Check whether we're finding a good ratio of candidates vs * pages seen. Some workloads create "deserts" in trees where * no good eviction candidates can be found. Abandon the walk * if we get into that situation. */ give_up = !__wt_cache_aggressive(session) && pages_seen > min_pages && (pages_queued == 0 || (pages_seen / pages_queued) > (min_pages / target_pages)); if (give_up) break; if ((ref = btree->evict_ref) == NULL) { if (++restarts == 2) break; WT_STAT_CONN_INCR( session, cache_eviction_walks_started); continue; } ++pages_seen; /* Ignore root pages entirely. */ if (__wt_ref_is_root(ref)) continue; page = ref->page; modified = __wt_page_is_modified(page); page->evict_pass_gen = cache->evict_pass_gen; /* * Use the EVICT_LRU flag to avoid putting pages onto the list * multiple times. 
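 * (the flag is set atomically by __evict_push_candidate and cleared only
 * when the entry is removed from a queue)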
*/ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) continue; /* * It's possible (but unlikely) to visit a page without a read * generation, if we race with the read instantiating the page. * Set the page's read generation here to ensure a bug doesn't * somehow leave a page without a read generation. */ if (page->read_gen == WT_READGEN_NOTSET) __wt_cache_read_gen_new(session, page); /* Pages we no longer need (clean or dirty), are found money. */ if (page->read_gen == WT_READGEN_OLDEST || page->memory_footprint >= btree->splitmempage) { WT_STAT_CONN_INCR( session, cache_eviction_pages_queued_oldest); if (__wt_page_evict_urgent(session, ref)) urgent_queued = true; continue; } /* Pages that are empty or from dead trees are also good. */ if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) goto fast; /* Skip clean pages if appropriate. */ if (!modified && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) continue; /* Skip dirty pages if appropriate. */ if (modified && !F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) continue; /* Limit internal pages to 50% of the total. */ if (WT_PAGE_IS_INTERNAL(page) && internal_pages > (int)(evict - start) / 2) continue; /* If eviction gets aggressive, anything else is fair game. */ if (__wt_cache_aggressive(session)) goto fast; /* * If the oldest transaction hasn't changed since the last time * this page was written, it's unlikely we can make progress. * Similarly, if the most recent update on the page is not yet * globally visible, eviction will fail. These heuristics * attempt to avoid repeated attempts to evict the same page. */ mod = page->modify; if (modified && txn_global->current != txn_global->oldest_id && (mod->last_eviction_id == __wt_txn_oldest_id(session) || !__wt_txn_visible_all(session, mod->update_txn))) continue; fast: /* If the page can't be evicted, give up. */ if (!__wt_page_can_evict(session, ref, NULL)) continue; WT_ASSERT(session, evict->ref == NULL); if (!__evict_push_candidate(session, queue, evict, ref)) continue; ++evict; ++pages_queued; if (WT_PAGE_IS_INTERNAL(page)) ++internal_pages; __wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" WT_SIZET_FMT, (void *)page, page->memory_footprint); } WT_RET_NOTFOUND_OK(ret); *slotp += (u_int)(evict - start); WT_STAT_CONN_INCRV( session, cache_eviction_pages_queued, (u_int)(evict - start)); /* * If we didn't find any candidates in the file, reverse the direction * of the walk and skip it next time. */ if (give_up) btree->evict_walk_reverse = !btree->evict_walk_reverse; if (pages_queued == 0 && !urgent_queued) btree->evict_walk_period = WT_MIN( WT_MAX(1, 2 * btree->evict_walk_period), 100); else if (pages_queued == target_pages) btree->evict_walk_period = 0; else if (btree->evict_walk_period > 0) btree->evict_walk_period /= 2; /* * If we happen to end up on the root page or a page requiring urgent * eviction, clear it. We have to track hazard pointers, and the root * page complicates that calculation. * * Likewise if we found no new candidates during the walk: there is no * point keeping a page pinned, since it may be the only candidate in * an idle tree. * * If we land on a page requiring forced eviction, move on to the next * page: we want this page evicted as quickly as possible. */ if ((ref = btree->evict_ref) != NULL) { /* Give up the walk occasionally. 
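 * Passing restarts == 0 counts the clear as an abandoned walk in the
 * statistics; a walk that already cycled back to the start of the tree is
 * not counted.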
 */
        if (__wt_ref_is_root(ref) || evict == start || give_up ||
            ref->page->memory_footprint >= btree->splitmempage)
            WT_RET(__evict_clear_walk(session, restarts == 0));
        else if (ref->page->read_gen == WT_READGEN_OLDEST)
            WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
                session, &btree->evict_ref, &refs_walked, walk_flags));
    }

    WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
    WT_STAT_CONN_INCRV(session, cache_eviction_pages_seen, pages_seen);

    return (0);
}

/*
 * __evict_get_ref --
 *     Get a page for eviction.
 */
static int
__evict_get_ref(
    WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_REF **refp)
{
    WT_CACHE *cache;
    WT_EVICT_ENTRY *evict;
    WT_EVICT_QUEUE *queue, *other_queue, *urgent_queue;
    uint32_t candidates;
    bool is_app, server_only, urgent_ok;

    cache = S2C(session)->cache;
    is_app = !F_ISSET(session, WT_SESSION_INTERNAL);
    server_only = is_server && !WT_EVICT_HAS_WORKERS(session);
    urgent_ok = (!is_app && !is_server) ||
        !WT_EVICT_HAS_WORKERS(session) ||
        (is_app && __wt_cache_aggressive(session));
    urgent_queue = cache->evict_urgent_queue;

    *btreep = NULL;
    *refp = NULL;

    WT_STAT_CONN_INCR(session, cache_eviction_get_ref);

    /* Avoid the LRU lock if no pages are available. */
    if (__evict_queue_empty(cache->evict_current_queue, is_server) &&
        __evict_queue_empty(cache->evict_other_queue, is_server) &&
        (!urgent_ok || __evict_queue_empty(urgent_queue, false))) {
        WT_STAT_CONN_INCR(session, cache_eviction_get_ref_empty);
        return (WT_NOTFOUND);
    }

    /*
     * The server repopulates whenever the other queue is not full, as long
     * as at least one page has been evicted out of the current queue.
     *
     * Note that there are pathological cases where there are only enough
     * eviction candidates in the cache to fill one queue. In that case,
     * we will continually evict one page and attempt to refill the queues.
     * Such cases are extremely rare in real applications.
     */
    if (is_server &&
        (!urgent_ok || __evict_queue_empty(urgent_queue, false)) &&
        !__evict_queue_full(cache->evict_current_queue) &&
        !__evict_queue_full(cache->evict_fill_queue) &&
        (cache->evict_empty_score > WT_EVICT_SCORE_CUTOFF ||
        __evict_queue_empty(cache->evict_fill_queue, false)))
        return (WT_NOTFOUND);

    __wt_spin_lock(session, &cache->evict_queue_lock);

    /* Check the urgent queue first. */
    if (urgent_ok && !__evict_queue_empty(urgent_queue, false))
        queue = urgent_queue;
    else {
        /*
         * Check if the current queue needs to change.
         *
         * The server will only evict half of the pages before looking
         * for more, but should only switch queues if there are no
         * other eviction workers.
         */
        queue = cache->evict_current_queue;
        other_queue = cache->evict_other_queue;
        if (__evict_queue_empty(queue, server_only) &&
            !__evict_queue_empty(other_queue, server_only)) {
            cache->evict_current_queue = other_queue;
            cache->evict_other_queue = queue;
        }
    }

    __wt_spin_unlock(session, &cache->evict_queue_lock);

    /*
     * We got the queue lock, which should be fast, and chose a queue.
     * Now we want to get the lock on the individual queue.
     */
    for (;;) {
        /* Verify there are still pages available. */
        if (__evict_queue_empty(
            queue, is_server && queue != urgent_queue)) {
            WT_STAT_CONN_INCR(
                session, cache_eviction_get_ref_empty2);
            return (WT_NOTFOUND);
        }
        if (!is_server)
            __wt_spin_lock(session, &queue->evict_lock);
        else if (__wt_spin_trylock(session, &queue->evict_lock) != 0)
            continue;
        break;
    }

    /*
     * Only evict half of the pages before looking for more. The remainder
     * are left to eviction workers (if configured), or application
     * threads if necessary.
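 * (is_server is true only for the thread running the eviction server
 * loop; worker and application threads take candidates from the remaining
 * half.)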
*/ candidates = queue->evict_candidates; if (is_server && queue != urgent_queue && candidates > 1) candidates /= 2; /* Get the next page queued for eviction. */ for (evict = queue->evict_current; evict >= queue->evict_queue && evict < queue->evict_queue + candidates; ++evict) { if (evict->ref == NULL) continue; WT_ASSERT(session, evict->btree != NULL); /* * Evicting a dirty page in the server thread could stall * during a write and prevent eviction from finding new work. * * However, we can't skip entries in the urgent queue or they * may never be found again. * * Don't force application threads to evict dirty pages if they * aren't stalled by the amount of dirty data in cache. */ if (!urgent_ok && (is_server || !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) && __wt_page_is_modified(evict->ref->page)) { --evict; break; } /* * Lock the page while holding the eviction mutex to prevent * multiple attempts to evict it. For pages that are already * being evicted, this operation will fail and we will move on. */ if (!__wt_atomic_casv32( &evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) { __evict_list_clear(session, evict); continue; } /* * Increment the busy count in the btree handle to prevent it * from being closed under us. */ (void)__wt_atomic_addv32(&evict->btree->evict_busy, 1); *btreep = evict->btree; *refp = evict->ref; /* * Remove the entry so we never try to reconcile the same page * on reconciliation error. */ __evict_list_clear(session, evict); break; } /* Move to the next item. */ if (evict != NULL && evict + 1 < queue->evict_queue + queue->evict_candidates) queue->evict_current = evict + 1; else /* Clear the current pointer if there are no more candidates. */ queue->evict_current = NULL; __wt_spin_unlock(session, &queue->evict_lock); return (*refp == NULL ? WT_NOTFOUND : 0); } /* * __evict_page -- * Called by both eviction and application threads to evict a page. */ static int __evict_page(WT_SESSION_IMPL *session, bool is_server) { struct timespec enter, leave; WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_REF *ref; bool app_timer; WT_RET(__evict_get_ref(session, is_server, &btree, &ref)); WT_ASSERT(session, ref->state == WT_REF_LOCKED); app_timer = false; cache = S2C(session)->cache; /* * An internal session flags either the server itself or an eviction * worker thread. */ if (is_server) { WT_STAT_CONN_INCR(session, cache_eviction_server_evicting); cache->server_evicts++; } else if (F_ISSET(session, WT_SESSION_INTERNAL)) { WT_STAT_CONN_INCR(session, cache_eviction_worker_evicting); cache->worker_evicts++; } else { if (__wt_page_is_modified(ref->page)) WT_STAT_CONN_INCR(session, cache_eviction_app_dirty); WT_STAT_CONN_INCR(session, cache_eviction_app); cache->app_evicts++; if (WT_STAT_ENABLED(session)) { app_timer = true; __wt_epoch(session, &enter); } } /* * In case something goes wrong, don't pick the same set of pages every * time. * * We used to bump the page's read generation only if eviction failed, * but that isn't safe: at that point, eviction has already unlocked * the page and some other thread may have evicted it by the time we * look at it. 
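 * Bumping the generation up front is harmless: if eviction succeeds the
 * page is gone, and if it fails the page simply looks recently used and is
 * less likely to be selected again immediately.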
*/ __wt_cache_read_gen_bump(session, ref->page); WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false)); (void)__wt_atomic_subv32(&btree->evict_busy, 1); if (app_timer) { __wt_epoch(session, &leave); WT_STAT_CONN_INCRV(session, application_evict_time, WT_TIMEDIFF_US(leave, enter)); } return (ret); } /* * __wt_cache_eviction_worker -- * Worker function for __wt_cache_eviction_check: evict pages if the cache * crosses its boundaries. */ int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) { struct timespec enter, leave; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; uint64_t init_evict_count, max_pages_evicted; conn = S2C(session); cache = conn->cache; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); /* * It is not safe to proceed if the eviction server threads aren't * setup yet. */ if (!conn->evict_server_running) return (0); if (busy && pct_full < 100) return (0); /* Wake the eviction server if we need to do work. */ __wt_evict_server_wake(session); /* Track how long application threads spend doing eviction. */ if (WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL)) __wt_epoch(session, &enter); for (init_evict_count = cache->pages_evict;; ret = 0) { /* * A pathological case: if we're the oldest transaction in the * system and the eviction server is stuck trying to find space, * abort the transaction to give up all hazard pointers before * trying again. */ if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session)) { --cache->evict_aggressive_score; WT_STAT_CONN_INCR(session, txn_fail_cache); WT_ERR(WT_ROLLBACK); } /* * Check if we have become busy. * * If we're busy (because of the transaction check we just did * or because our caller is waiting on a longer-than-usual event * such as a page read), and the cache level drops below 100%, * limit the work to 5 evictions and return. If that's not the * case, we can do more. */ if (!busy && txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) busy = true; max_pages_evicted = busy ? 5 : 20; /* See if eviction is still needed. */ if (!__wt_eviction_needed(session, busy, &pct_full) || (pct_full < 100 && cache->pages_evict > init_evict_count + max_pages_evicted)) break; /* * Don't make application threads participate in scrubbing for * checkpoints. Just throttle updates instead. */ if (busy && WT_EVICT_HAS_WORKERS(session) && cache->eviction_scrub_limit > 0.0 && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) { __wt_yield(); continue; } /* Evict a page. */ switch (ret = __evict_page(session, false)) { case 0: if (busy) goto err; /* FALLTHROUGH */ case EBUSY: break; case WT_NOTFOUND: /* Allow the queue to re-populate before retrying. */ __wt_cond_wait( session, conn->evict_threads.wait_cond, 10000); cache->app_waits++; break; default: goto err; } } err: if (WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL)) { __wt_epoch(session, &leave); WT_STAT_CONN_INCRV(session, application_cache_time, WT_TIMEDIFF_US(leave, enter)); } return (ret); /* NOTREACHED */ } /* * __wt_page_evict_urgent -- * Set a page to be evicted as soon as possible. */ bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) { WT_CACHE *cache; WT_EVICT_ENTRY *evict; WT_EVICT_QUEUE *urgent_queue; WT_PAGE *page; bool queued; /* Root pages should never be evicted via LRU. 
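 * (a root page is only discarded when its tree is closed, so queuing it
 * here would never accomplish anything)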
*/ WT_ASSERT(session, !__wt_ref_is_root(ref)); page = ref->page; if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) || F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) return (false); /* Append to the urgent queue if we can. */ cache = S2C(session)->cache; urgent_queue = &cache->evict_queues[WT_EVICT_URGENT_QUEUE]; queued = false; __wt_spin_lock(session, &cache->evict_queue_lock); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) || F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) goto done; __wt_spin_lock(session, &urgent_queue->evict_lock); if (__evict_queue_empty(urgent_queue, false)) { urgent_queue->evict_current = urgent_queue->evict_queue; urgent_queue->evict_candidates = 0; } evict = urgent_queue->evict_queue + urgent_queue->evict_candidates; if (evict < urgent_queue->evict_queue + cache->evict_slots && __evict_push_candidate(session, urgent_queue, evict, ref)) { ++urgent_queue->evict_candidates; queued = true; } __wt_spin_unlock(session, &urgent_queue->evict_lock); done: __wt_spin_unlock(session, &cache->evict_queue_lock); if (queued) { WT_STAT_CONN_INCR(session, cache_eviction_pages_queued_urgent); if (WT_EVICT_HAS_WORKERS(session)) __wt_cond_signal(session, S2C(session)->evict_threads.wait_cond); else __wt_evict_server_wake(session); } return (queued); } /* * __wt_evict_priority_set -- * Set a tree's eviction priority. */ void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) { S2BT(session)->evict_priority = v; } /* * __wt_evict_priority_clear -- * Clear a tree's eviction priority. */ void __wt_evict_priority_clear(WT_SESSION_IMPL *session) { S2BT(session)->evict_priority = 0; } #ifdef HAVE_DIAGNOSTIC /* * __wt_cache_dump -- * Dump debugging information to a file (default stderr) about the size of * the files in the cache. */ int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) { FILE *fp; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle, *saved_dhandle; WT_PAGE *page; WT_REF *next_walk; uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages; uint64_t total_bytes, total_dirty_bytes; size_t size; conn = S2C(session); total_bytes = total_dirty_bytes = 0; if (ofile == NULL) fp = stderr; else if ((fp = fopen(ofile, "w")) == NULL) return (EIO); /* Note: odd string concatenation avoids spelling errors. 
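 * (adjacent string literals below are concatenated by the compiler into a
 * single format string)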
*/ (void)fprintf(fp, "==========\n" "cache dump\n"); saved_dhandle = session->dhandle; TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; next_walk = NULL; session->dhandle = dhandle; while (__wt_tree_walk(session, &next_walk, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && next_walk != NULL) { page = next_walk->page; size = page->memory_footprint; if (WT_PAGE_IS_INTERNAL(page)) { ++intl_pages; intl_bytes += size; intl_bytes_max = WT_MAX(intl_bytes_max, size); if (__wt_page_is_modified(page)) { ++intl_dirty_pages; intl_dirty_bytes += size; intl_dirty_bytes_max = WT_MAX(intl_dirty_bytes_max, size); } } else { ++leaf_pages; leaf_bytes += size; leaf_bytes_max = WT_MAX(leaf_bytes_max, size); if (__wt_page_is_modified(page)) { ++leaf_dirty_pages; leaf_dirty_bytes += size; leaf_dirty_bytes_max = WT_MAX(leaf_dirty_bytes_max, size); } } } session->dhandle = NULL; if (dhandle->checkpoint == NULL) (void)fprintf(fp, "%s(): \n", dhandle->name); else (void)fprintf(fp, "%s(checkpoint=%s): \n", dhandle->name, dhandle->checkpoint); if (intl_pages != 0) (void)fprintf(fp, "\t" "internal: " "%" PRIu64 " pages, " "%" PRIu64 "MB, " "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " "%" PRIu64 "MB max page, " "%" PRIu64 "MB max dirty page\n", intl_pages, intl_bytes >> 20, intl_pages - intl_dirty_pages, intl_dirty_pages, (intl_bytes - intl_dirty_bytes) >> 20, intl_dirty_bytes >> 20, intl_bytes_max >> 20, intl_dirty_bytes_max >> 20); if (leaf_pages != 0) (void)fprintf(fp, "\t" "leaf: " "%" PRIu64 " pages, " "%" PRIu64 "MB, " "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " "%" PRIu64 "MB max page, " "%" PRIu64 "MB max dirty page\n", leaf_pages, leaf_bytes >> 20, leaf_pages - leaf_dirty_pages, leaf_dirty_pages, (leaf_bytes - leaf_dirty_bytes) >> 20, leaf_dirty_bytes >> 20, leaf_bytes_max >> 20, leaf_dirty_bytes_max >> 20); total_bytes += intl_bytes + leaf_bytes; total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes; } session->dhandle = saved_dhandle; /* * Apply the overhead percentage so our total bytes are comparable with * the tracked value. */ total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes); (void)fprintf(fp, "cache dump: " "total found = %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n" "total dirty bytes = %" PRIu64 "MB\n", total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20, total_dirty_bytes >> 20); (void)fprintf(fp, "==========\n"); if (ofile != NULL && fclose(fp) != 0) return (EIO); return (0); } #endif