/*- * Copyright (c) 2014-present MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #include "wt_internal.h" static int __evict_clear_all_walks(WT_SESSION_IMPL *); static void __evict_list_clear_page_locked(WT_SESSION_IMPL *, WT_REF *, bool); static int WT_CDECL __evict_lru_cmp(const void *, const void *); static int __evict_lru_pages(WT_SESSION_IMPL *, bool); static int __evict_lru_walk(WT_SESSION_IMPL *); static int __evict_page(WT_SESSION_IMPL *, bool); static int __evict_pass(WT_SESSION_IMPL *); static int __evict_server(WT_SESSION_IMPL *, bool *); static void __evict_tune_workers(WT_SESSION_IMPL *session); static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *); static int __evict_walk_tree(WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *); #define WT_EVICT_HAS_WORKERS(s) (S2C(s)->evict_threads.current_threads > 1) /* * __evict_lock_handle_list -- * Try to get the handle list lock, with yield and sleep back off. Keep timing statistics * overall. */ static int __evict_lock_handle_list(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_RWLOCK *dh_lock; u_int spins; conn = S2C(session); cache = conn->cache; dh_lock = &conn->dhandle_lock; /* * Use a custom lock acquisition back off loop so the eviction server notices any interrupt * quickly. */ for (spins = 0; (ret = __wt_try_readlock(session, dh_lock)) == EBUSY && cache->pass_intr == 0; spins++) { if (spins < WT_THOUSAND) __wt_yield(); else __wt_sleep(0, WT_THOUSAND); } return (ret); } /* * __evict_entry_priority -- * Get the adjusted read generation for an eviction entry. */ static inline uint64_t __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_PAGE *page; uint64_t read_gen; btree = S2BT(session); page = ref->page; /* Any page set to the oldest generation should be discarded. */ if (WT_READGEN_EVICT_SOON(page->read_gen)) return (WT_READGEN_OLDEST); /* Any page from a dead tree is a great choice. */ if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) return (WT_READGEN_OLDEST); /* Any empty page (leaf or internal), is a good choice. */ if (__wt_page_is_empty(page)) return (WT_READGEN_OLDEST); /* Any large page in memory is likewise a good choice. */ if (page->memory_footprint > btree->splitmempage) return (WT_READGEN_OLDEST); /* * The base read-generation is skewed by the eviction priority. Internal pages are also * adjusted, we prefer to evict leaf pages. */ if (page->modify != NULL && F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY) && !F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_CLEAN)) read_gen = page->modify->update_txn; else read_gen = page->read_gen; read_gen += btree->evict_priority; #define WT_EVICT_INTL_SKEW WT_THOUSAND if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) read_gen += WT_EVICT_INTL_SKEW; return (read_gen); } /* * __evict_lru_cmp_debug -- * Qsort function: sort the eviction array. Version for eviction debug mode. */ static int WT_CDECL __evict_lru_cmp_debug(const void *a_arg, const void *b_arg) { const WT_EVICT_ENTRY *a, *b; uint64_t a_score, b_score; a = a_arg; b = b_arg; a_score = (a->ref == NULL ? UINT64_MAX : 0); b_score = (b->ref == NULL ? UINT64_MAX : 0); return ((a_score < b_score) ? -1 : (a_score == b_score) ? 0 : 1); } /* * __evict_lru_cmp -- * Qsort function: sort the eviction array. 
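 *     The sort key is each entry's pre-computed score (assigned by __evict_entry_priority when
 *     the page was queued): lower scores are better eviction candidates and sort first, and
 *     empty slots (a NULL ref) are treated as UINT64_MAX so they collect at the end of the
 *     queue.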
*/ static int WT_CDECL __evict_lru_cmp(const void *a_arg, const void *b_arg) { const WT_EVICT_ENTRY *a, *b; uint64_t a_score, b_score; a = a_arg; b = b_arg; a_score = (a->ref == NULL ? UINT64_MAX : a->score); b_score = (b->ref == NULL ? UINT64_MAX : b->score); return ((a_score < b_score) ? -1 : (a_score == b_score) ? 0 : 1); } /* * __evict_list_clear -- * Clear an entry in the LRU eviction list. */ static inline void __evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e) { if (e->ref != NULL) { WT_ASSERT(session, F_ISSET_ATOMIC_16(e->ref->page, WT_PAGE_EVICT_LRU)); F_CLR_ATOMIC_16(e->ref->page, WT_PAGE_EVICT_LRU | WT_PAGE_EVICT_LRU_URGENT); } e->ref = NULL; e->btree = WT_DEBUG_POINT; } /* * __evict_list_clear_page_locked -- * This function searches for the page in all the eviction queues (skipping the urgent queue if * requested) and clears it if found. It does not take the eviction queue lock, so the caller * should hold the appropriate locks before calling this function. */ static void __evict_list_clear_page_locked(WT_SESSION_IMPL *session, WT_REF *ref, bool exclude_urgent) { WT_CACHE *cache; WT_EVICT_ENTRY *evict; uint32_t elem, i, q, last_queue_idx; bool found; last_queue_idx = exclude_urgent ? WT_EVICT_URGENT_QUEUE : WT_EVICT_QUEUE_MAX; cache = S2C(session)->cache; found = false; for (q = 0; q < last_queue_idx && !found; q++) { __wt_spin_lock(session, &cache->evict_queues[q].evict_lock); elem = cache->evict_queues[q].evict_max; for (i = 0, evict = cache->evict_queues[q].evict_queue; i < elem; i++, evict++) if (evict->ref == ref) { found = true; __evict_list_clear(session, evict); break; } __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } WT_ASSERT(session, !F_ISSET_ATOMIC_16(ref->page, WT_PAGE_EVICT_LRU)); } /* * __wt_evict_list_clear_page -- * Check whether a page is present in the LRU eviction list. If the page is found in the list, * remove it. This is called from the page eviction code to make sure there is no attempt to * evict a child page multiple times. */ void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref) { WT_CACHE *cache; WT_ASSERT(session, __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED); /* Fast path: if the page isn't in the queue, don't bother searching. */ if (!F_ISSET_ATOMIC_16(ref->page, WT_PAGE_EVICT_LRU)) return; cache = S2C(session)->cache; __wt_spin_lock(session, &cache->evict_queue_lock); /* Remove the reference from the eviction queues. */ __evict_list_clear_page_locked(session, ref, false); __wt_spin_unlock(session, &cache->evict_queue_lock); } /* * __evict_queue_empty -- * Is the queue empty? Note that the eviction server is pessimistic and treats a half full queue * as empty. */ static inline bool __evict_queue_empty(WT_EVICT_QUEUE *queue, bool server_check) { uint32_t candidates, used; if (queue->evict_current == NULL) return (true); /* The eviction server only considers half of the candidates. */ candidates = queue->evict_candidates; if (server_check && candidates > 1) candidates /= 2; used = (uint32_t)(queue->evict_current - queue->evict_queue); return (used >= candidates); } /* * __evict_queue_full -- * Is the queue full (i.e., it has been populated with candidates and none of them have been * evicted yet)? */ static inline bool __evict_queue_full(WT_EVICT_QUEUE *queue) { return (queue->evict_current == queue->evict_queue && queue->evict_candidates != 0); } /* * __wt_evict_server_wake -- * Wake the eviction server thread. 
*/ void __wt_evict_server_wake(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; conn = S2C(session); cache = conn->cache; if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_EVICTSERVER, WT_VERBOSE_DEBUG_2)) { uint64_t bytes_inuse, bytes_max; bytes_inuse = __wt_cache_bytes_inuse(cache); bytes_max = conn->cache_size; __wt_verbose_debug2(session, WT_VERB_EVICTSERVER, "waking, bytes inuse %s max (%" PRIu64 "MB %s %" PRIu64 "MB)", bytes_inuse <= bytes_max ? "<=" : ">", bytes_inuse / WT_MEGABYTE, bytes_inuse <= bytes_max ? "<=" : ">", bytes_max / WT_MEGABYTE); } __wt_cond_signal(session, cache->evict_cond); } /* * __wt_evict_thread_chk -- * Check to decide if the eviction thread should continue running. */ bool __wt_evict_thread_chk(WT_SESSION_IMPL *session) { return (F_ISSET(S2C(session), WT_CONN_EVICTION_RUN)); } /* * __wt_evict_thread_run -- * Entry function for an eviction thread. This is called repeatedly from the thread group code * so it does not need to loop itself. */ int __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; bool did_work, was_intr; conn = S2C(session); cache = conn->cache; /* Mark the session as an eviction thread session. */ F_SET(session, WT_SESSION_EVICTION); /* * Cache a history store cursor to avoid deadlock: if an eviction thread marks a file busy and * then opens a different file (in this case, the HS file), it can deadlock with a thread * waiting for the first file to drain from the eviction queue. See WT-5946 for details. */ WT_ERR(__wt_curhs_cache(session)); if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) { /* * Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We * set the flag on both sessions because we may call clear_walk when we are walking with the * walk session, locked. */ FLD_SET(session->lock_flags, WT_SESSION_LOCKED_PASS); FLD_SET(cache->walk_session->lock_flags, WT_SESSION_LOCKED_PASS); ret = __evict_server(session, &did_work); FLD_CLR(cache->walk_session->lock_flags, WT_SESSION_LOCKED_PASS); FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS); was_intr = cache->pass_intr != 0; __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); /* * If the eviction server was interrupted, wait until requests have been processed: the * system may otherwise be busy so don't go to sleep. */ if (was_intr) while (cache->pass_intr != 0 && F_ISSET(conn, WT_CONN_EVICTION_RUN) && F_ISSET(thread, WT_THREAD_RUN)) __wt_yield(); else { __wt_verbose_debug2(session, WT_VERB_EVICTSERVER, "%s", "sleeping"); /* Don't rely on signals: check periodically. */ __wt_cond_auto_wait(session, cache->evict_cond, did_work, NULL); __wt_verbose_debug2(session, WT_VERB_EVICTSERVER, "%s", "waking"); } } else WT_ERR(__evict_lru_pages(session, false)); if (0) { err: WT_RET_PANIC(session, ret, "cache eviction thread error"); } return (ret); } /* * __wt_evict_thread_stop -- * Shutdown function for an eviction thread. */ int __wt_evict_thread_stop(WT_SESSION_IMPL *session, WT_THREAD *thread) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; if (thread->id != 0) return (0); conn = S2C(session); cache = conn->cache; /* * The only time the first eviction thread is stopped is on shutdown: in case any trees are * still open, clear all walks now so that they can be closed. 
*/ WT_WITH_PASS_LOCK(session, ret = __evict_clear_all_walks(session)); WT_ERR(ret); /* * The only cases when the eviction server is expected to stop are when recovery is finished, * when the connection is closing or when an error has occurred and connection panic flag is * set. */ WT_ASSERT(session, F_ISSET(conn, WT_CONN_CLOSING | WT_CONN_PANIC | WT_CONN_RECOVERING)); /* Clear the eviction thread session flag. */ F_CLR(session, WT_SESSION_EVICTION); __wt_verbose(session, WT_VERB_EVICTSERVER, "%s", "cache eviction thread exiting"); if (0) { err: WT_RET_PANIC(session, ret, "cache eviction thread error"); } return (ret); } /* * __evict_server -- * Thread to evict pages from the cache. */ static int __evict_server(WT_SESSION_IMPL *session, bool *did_work) { struct timespec now; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; uint64_t time_diff_ms; /* Assume there has been no progress. */ *did_work = false; conn = S2C(session); cache = conn->cache; /* Evict pages from the cache as needed. */ WT_RET(__evict_pass(session)); if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) || cache->pass_intr != 0) return (0); if (!__wt_cache_stuck(session)) { /* * Try to get the handle list lock: if we give up, that indicates a session is waiting for * us to clear walks. Do that as part of a normal pass (without the handle list lock) to * avoid deadlock. */ if ((ret = __evict_lock_handle_list(session)) == EBUSY) return (0); WT_RET(ret); /* * Clear the walks so we don't pin pages while asleep, otherwise we can block applications * evicting large pages. */ ret = __evict_clear_all_walks(session); __wt_readunlock(session, &conn->dhandle_lock); WT_RET(ret); /* Make sure we'll notice next time we're stuck. */ cache->last_eviction_progress = 0; return (0); } /* Track if work was done. */ *did_work = cache->eviction_progress != cache->last_eviction_progress; cache->last_eviction_progress = cache->eviction_progress; /* Eviction is stuck, check if we have made progress. */ if (*did_work) { #if !defined(HAVE_DIAGNOSTIC) /* Need verbose check only if not in diagnostic build */ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) #endif __wt_epoch(session, &cache->stuck_time); return (0); } #if !defined(HAVE_DIAGNOSTIC) /* Need verbose check only if not in diagnostic build */ if (!WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) return (0); #endif /* * If we're stuck for 5 minutes in diagnostic mode, or the verbose evict_stuck flag is * configured, log the cache and transaction state. * * If we're stuck for 5 minutes in diagnostic mode, give up. * * We don't do this check for in-memory workloads because application threads are not blocked by * the cache being full. If the cache becomes full of clean pages, we can be servicing reads * while the cache appears stuck to eviction. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) return (0); __wt_epoch(session, &now); /* The checks below should only be executed when a cache timeout has been set. */ if (cache->cache_stuck_timeout_ms > 0) { time_diff_ms = WT_TIMEDIFF_MS(now, cache->stuck_time); #ifdef HAVE_DIAGNOSTIC /* Enable extra logs 20ms before timing out. 
*/ if (cache->cache_stuck_timeout_ms < 20 || (time_diff_ms > cache->cache_stuck_timeout_ms - 20)) { WT_SET_VERBOSE_LEVEL(session, WT_VERB_EVICT, WT_VERBOSE_DEBUG_1); WT_SET_VERBOSE_LEVEL(session, WT_VERB_EVICTSERVER, WT_VERBOSE_DEBUG_1); WT_SET_VERBOSE_LEVEL(session, WT_VERB_EVICT_STUCK, WT_VERBOSE_DEBUG_1); } #endif if (time_diff_ms >= cache->cache_stuck_timeout_ms) { #ifdef HAVE_DIAGNOSTIC __wt_err(session, ETIMEDOUT, "Cache stuck for too long, giving up"); WT_RET(__wt_verbose_dump_txn(session)); WT_RET(__wt_verbose_dump_cache(session)); return (__wt_set_return(session, ETIMEDOUT)); #else if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) { WT_RET(__wt_verbose_dump_txn(session)); WT_RET(__wt_verbose_dump_cache(session)); /* Reset the timer. */ __wt_epoch(session, &cache->stuck_time); } #endif } } return (0); } /* * __wt_evict_create -- * Start the eviction server. */ int __wt_evict_create(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; uint32_t session_flags; conn = S2C(session); /* * In case recovery has allocated some transaction IDs, bump to the current state. This will * prevent eviction threads from pinning anything as they start up and read metadata in order to * open cursors. */ WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); WT_ASSERT(session, conn->evict_threads_min > 0); /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_EVICTION_RUN); /* * Create the eviction thread group. Set the group size to the maximum allowed sessions. */ session_flags = WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL; WT_RET(__wt_thread_group_create(session, &conn->evict_threads, "eviction-server", conn->evict_threads_min, conn->evict_threads_max, session_flags, __wt_evict_thread_chk, __wt_evict_thread_run, __wt_evict_thread_stop)); /* * Ensure the cache stuck timer is initialized when starting eviction. */ #if !defined(HAVE_DIAGNOSTIC) /* Need verbose check only if not in diagnostic build */ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) #endif __wt_epoch(session, &conn->cache->stuck_time); /* * Allow queues to be populated now that the eviction threads are running. */ conn->evict_server_running = true; return (0); } /* * __wt_evict_destroy -- * Destroy the eviction threads. */ int __wt_evict_destroy(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; conn = S2C(session); /* We are done if the eviction server didn't start successfully. */ if (!conn->evict_server_running) return (0); /* Wait for any eviction thread group changes to stabilize. */ __wt_writelock(session, &conn->evict_threads.lock); /* * Signal the threads to finish and stop populating the queue. */ F_CLR(conn, WT_CONN_EVICTION_RUN); conn->evict_server_running = false; __wt_evict_server_wake(session); __wt_verbose(session, WT_VERB_EVICTSERVER, "%s", "waiting for helper threads"); /* * We call the destroy function still holding the write lock. It assumes it is called locked. */ WT_RET(__wt_thread_group_destroy(session, &conn->evict_threads)); return (0); } /* * __evict_update_work -- * Configure eviction work state. 
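 *     Builds the WT_CACHE_EVICT_CLEAN, WT_CACHE_EVICT_DIRTY and WT_CACHE_EVICT_UPDATES flags
 *     (plus their _HARD variants) from the configured targets and triggers, chooses between
 *     scrubbing and not keeping evicted pages, and returns true only when there is eviction
 *     work to do. For illustration with hypothetical settings of target = 80 and trigger = 95,
 *     scrub mode remains eligible while bytes in use stay below (80 + 95) / 200 = 87.5% of the
 *     cache size, provided dirty and update bytes are similarly below their own midpoints.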
*/ static bool __evict_update_work(WT_SESSION_IMPL *session) { WT_BTREE *hs_tree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; double dirty_target, dirty_trigger, target, trigger, updates_target, updates_trigger; uint64_t bytes_dirty, bytes_inuse, bytes_max, bytes_updates; uint32_t flags; conn = S2C(session); cache = conn->cache; dirty_target = __wt_eviction_dirty_target(cache); dirty_trigger = cache->eviction_dirty_trigger; target = cache->eviction_target; trigger = cache->eviction_trigger; updates_target = cache->eviction_updates_target; updates_trigger = cache->eviction_updates_trigger; /* Build up the new state. */ flags = 0; if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) { cache->flags = 0; return (false); } if (!__evict_queue_empty(cache->evict_urgent_queue, false)) LF_SET(WT_CACHE_EVICT_URGENT); /* * TODO: We are caching the cache usage values associated with the history store because the * history store dhandle isn't always available to eviction. Keeping potentially out-of-date * values could lead to surprising bugs in the future. */ if (F_ISSET(conn, WT_CONN_HS_OPEN) && __wt_hs_get_btree(session, &hs_tree) == 0) { cache->bytes_hs = hs_tree->bytes_inmem; cache->bytes_hs_dirty = hs_tree->bytes_dirty_intl + hs_tree->bytes_dirty_leaf; } /* * If we need space in the cache, try to find clean pages to evict. * * Avoid division by zero if the cache size has not yet been set in a shared cache. */ bytes_max = conn->cache_size + 1; bytes_inuse = __wt_cache_bytes_inuse(cache); if (__wt_eviction_clean_needed(session, NULL)) LF_SET(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); else if (bytes_inuse > (target * bytes_max) / 100) LF_SET(WT_CACHE_EVICT_CLEAN); bytes_dirty = __wt_cache_dirty_leaf_inuse(cache); if (__wt_eviction_dirty_needed(session, NULL)) LF_SET(WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); else if (bytes_dirty > (uint64_t)(dirty_target * bytes_max) / 100) LF_SET(WT_CACHE_EVICT_DIRTY); bytes_updates = __wt_cache_bytes_updates(cache); if (__wt_eviction_updates_needed(session, NULL)) LF_SET(WT_CACHE_EVICT_UPDATES | WT_CACHE_EVICT_UPDATES_HARD); else if (bytes_updates > (uint64_t)(updates_target * bytes_max) / 100) LF_SET(WT_CACHE_EVICT_UPDATES); /* * If application threads are blocked by the total volume of data in cache, try dirty pages as * well. */ if (__wt_cache_aggressive(session) && LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD)) LF_SET(WT_CACHE_EVICT_DIRTY); /* * Scrub dirty pages and keep them in cache if we are less than half way to the clean, dirty or * updates triggers. */ if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200) { if (bytes_dirty < (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200 && bytes_updates < (uint64_t)((updates_target + updates_trigger) * bytes_max) / 200) LF_SET(WT_CACHE_EVICT_SCRUB); } else LF_SET(WT_CACHE_EVICT_NOKEEP); if (FLD_ISSET(conn->debug_flags, WT_CONN_DEBUG_UPDATE_RESTORE_EVICT)) { LF_SET(WT_CACHE_EVICT_SCRUB); LF_CLR(WT_CACHE_EVICT_NOKEEP); } /* * With an in-memory cache, we only do dirty eviction in order to scrub pages. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) { if (LF_ISSET(WT_CACHE_EVICT_CLEAN)) LF_SET(WT_CACHE_EVICT_DIRTY); if (LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD)) LF_SET(WT_CACHE_EVICT_DIRTY_HARD); LF_CLR(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); } /* Update the global eviction state. */ cache->flags = flags; return (F_ISSET(cache, WT_CACHE_EVICT_ALL | WT_CACHE_EVICT_URGENT)); } /* * __evict_pass -- * Evict pages from memory. 
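 *     Each iteration of the pass bumps the shared read generation, updates the oldest
 *     transaction ID, recomputes the eviction work state, refills the LRU queues and, when
 *     running without worker threads, evicts pages directly. A lack of progress raises
 *     evict_aggressive_score until the cache is eventually treated as stuck.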
*/ static int __evict_pass(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_TXN_GLOBAL *txn_global; uint64_t eviction_progress, oldest_id, prev_oldest_id; uint64_t time_now, time_prev; u_int loop; conn = S2C(session); cache = conn->cache; txn_global = &conn->txn_global; time_prev = 0; /* [-Wconditional-uninitialized] */ /* Track whether pages are being evicted and progress is made. */ eviction_progress = cache->eviction_progress; prev_oldest_id = txn_global->oldest_id; /* Evict pages from the cache. */ for (loop = 0; cache->pass_intr == 0; loop++) { time_now = __wt_clock(session); if (loop == 0) time_prev = time_now; __evict_tune_workers(session); /* * Increment the shared read generation. Do this occasionally even if eviction is not * currently required, so that pages have some relative read generation when the eviction * server does need to do some work. */ __wt_cache_read_gen_incr(session); ++cache->evict_pass_gen; /* * Update the oldest ID: we use it to decide whether pages are candidates for eviction. * Without this, if all threads are blocked after a long-running transaction (such as a * checkpoint) completes, we may never start evicting again. * * Do this every time the eviction server wakes up, regardless of whether the cache is full, * to prevent the oldest ID falling too far behind. Don't wait to lock the table: with * highly threaded workloads, that creates a bottleneck. */ WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT)); if (!__evict_update_work(session)) break; __wt_verbose_debug2(session, WT_VERB_EVICTSERVER, "Eviction pass with: Max: %" PRIu64 " In use: %" PRIu64 " Dirty: %" PRIu64, conn->cache_size, cache->bytes_inmem, cache->bytes_dirty_intl + cache->bytes_dirty_leaf); if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) WT_RET(__evict_lru_walk(session)); /* * If the queue has been empty recently, keep queuing more pages to evict. If the rate of * queuing pages is high enough, this score will go to zero, in which case the eviction * server might as well help out with eviction. * * Also, if there is a single eviction server thread with no workers, it must service the * urgent queue in case all application threads are busy. */ if (!WT_EVICT_HAS_WORKERS(session) && (cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF || !__evict_queue_empty(cache->evict_urgent_queue, false))) WT_RET(__evict_lru_pages(session, true)); if (cache->pass_intr != 0) break; /* * If we're making progress, keep going; if we're not making any progress at all, mark the * cache "stuck" and go back to sleep, it's not something we can fix. * * We check for progress every 20ms, the idea being that the aggressive score will reach 10 * after 200ms if we aren't making progress and eviction will start considering more pages. * If there is still no progress after 2s, we will treat the cache as stuck and start * rolling back transactions and writing updates to the history store table. */ if (eviction_progress == cache->eviction_progress) { if (WT_CLOCKDIFF_MS(time_now, time_prev) >= 20 && F_ISSET(cache, WT_CACHE_EVICT_HARD)) { if (cache->evict_aggressive_score < 100) ++cache->evict_aggressive_score; oldest_id = txn_global->oldest_id; if (prev_oldest_id == oldest_id && txn_global->current != oldest_id && cache->evict_aggressive_score < 100) ++cache->evict_aggressive_score; time_prev = time_now; prev_oldest_id = oldest_id; } /* * Keep trying for long enough that we should be able to evict a page if the server * isn't interfering. 
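 * Concretely, the branch below keeps retrying while either the loop count or the aggressive
 * score is still below 100, sleeping on the eviction condition variable for WT_THOUSAND
 * microseconds (roughly a millisecond) between attempts; only once both limits are reached do
 * we give up and report that the eviction goal could not be reached.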
*/ if (loop < 100 || cache->evict_aggressive_score < 100) { /* * Back off if we aren't making progress: walks hold the handle list lock, blocking * other operations that can free space in cache, such as LSM discarding handles. * * Allow this wait to be interrupted (e.g. if a checkpoint completes): make sure we * wait for a non-zero number of microseconds). */ WT_STAT_CONN_INCR(session, cache_eviction_server_slept); __wt_cond_wait(session, cache->evict_cond, WT_THOUSAND, NULL); continue; } WT_STAT_CONN_INCR(session, cache_eviction_slow); __wt_verbose(session, WT_VERB_EVICTSERVER, "%s", "unable to reach eviction goal"); break; } if (cache->evict_aggressive_score > 0) --cache->evict_aggressive_score; loop = 0; eviction_progress = cache->eviction_progress; } return (0); } /* * __evict_clear_walk -- * Clear a single walk point. */ static int __evict_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_REF *ref; btree = S2BT(session); cache = S2C(session)->cache; WT_ASSERT(session, FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_PASS)); if (session->dhandle == cache->walk_tree) cache->walk_tree = NULL; if ((ref = btree->evict_ref) == NULL) return (0); WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); WT_STAT_DATA_INCR(session, cache_eviction_walks_abandoned); /* * Clear evict_ref before releasing it in case that forces eviction (we assert that we never try * to evict the current eviction walk point). */ btree->evict_ref = NULL; WT_WITH_DHANDLE(cache->walk_session, session->dhandle, (ret = __wt_page_release(cache->walk_session, ref, WT_READ_NO_EVICT))); return (ret); } /* * __evict_clear_all_walks -- * Clear the eviction walk points for all files a session is waiting on. */ static int __evict_clear_all_walks(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; conn = S2C(session); TAILQ_FOREACH (dhandle, &conn->dhqh, q) if (WT_DHANDLE_BTREE(dhandle)) WT_WITH_DHANDLE(session, dhandle, WT_TRET(__evict_clear_walk(session))); return (ret); } /* * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's blocks queued for * eviction. */ int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_EVICT_ENTRY *evict; u_int i, elem, q; btree = S2BT(session); cache = S2C(session)->cache; /* Hold the walk lock to turn off eviction. */ __wt_spin_lock(session, &cache->evict_walk_lock); if (++btree->evict_disabled > 1) { __wt_spin_unlock(session, &cache->evict_walk_lock); return (0); } /* * Ensure no new pages from the file will be queued for eviction after this point, then clear * any existing LRU eviction walk for the file. */ (void)__wt_atomic_addv32(&cache->pass_intr, 1); WT_WITH_PASS_LOCK(session, ret = __evict_clear_walk(session)); (void)__wt_atomic_subv32(&cache->pass_intr, 1); WT_ERR(ret); /* * The eviction candidate list might reference pages from the file, clear it. Hold the evict * lock to remove queued pages from a file. 
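 * The overall sequence is: bump pass_intr so the eviction server abandons its current pass,
 * clear this tree's walk point under the pass lock (done above), sweep every eviction queue
 * below for entries that reference this tree, and finally spin until evict_busy shows no
 * concurrent eviction of the file's pages is still in flight.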
*/ __wt_spin_lock(session, &cache->evict_queue_lock); for (q = 0; q < WT_EVICT_QUEUE_MAX; q++) { __wt_spin_lock(session, &cache->evict_queues[q].evict_lock); elem = cache->evict_queues[q].evict_max; for (i = 0, evict = cache->evict_queues[q].evict_queue; i < elem; i++, evict++) if (evict->btree == btree) __evict_list_clear(session, evict); __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } __wt_spin_unlock(session, &cache->evict_queue_lock); /* * We have disabled further eviction: wait for concurrent LRU eviction activity to drain. */ while (btree->evict_busy > 0) __wt_yield(); if (0) { err: --btree->evict_disabled; } __wt_spin_unlock(session, &cache->evict_walk_lock); return (ret); } /* * __wt_evict_file_exclusive_off -- * Release exclusive eviction access to a file. */ void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) { WT_BTREE *btree; btree = S2BT(session); /* * We have seen subtle bugs with multiple threads racing to turn eviction on/off. Make races * more likely in diagnostic builds. */ WT_DIAGNOSTIC_YIELD; /* * Atomically decrement the evict-disabled count, without acquiring the eviction walk-lock. We can't * acquire that lock here because there's a potential deadlock. When acquiring exclusive eviction * access, we acquire the eviction walk-lock and then the cache's pass-intr lock. The current * eviction implementation can hold the pass-intr lock and call into this function (see WT-3303 for * the details), which might deadlock with another thread trying to get exclusive eviction access. */ #if defined(HAVE_DIAGNOSTIC) { int32_t v; WT_ASSERT(session, btree->evict_ref == NULL); v = __wt_atomic_subi32(&btree->evict_disabled, 1); WT_ASSERT(session, v >= 0); } #else (void)__wt_atomic_subi32(&btree->evict_disabled, 1); #endif } #define EVICT_TUNE_BATCH 1 /* Max workers to add each period */ /* * Data points needed before deciding if we should keep adding workers or * settle on an earlier value. */ #define EVICT_TUNE_DATAPT_MIN 8 #define EVICT_TUNE_PERIOD 60 /* Tune period in milliseconds */ /* * We will do a fresh re-tune every that many milliseconds to adjust to significant phase changes. */ #define EVICT_FORCE_RETUNE (25 * WT_THOUSAND) /* * __evict_tune_workers -- * Find the right number of eviction workers. Gradually ramp up the number of workers increasing * the number in batches indicated by the setting above. Store the number of workers that gave * us the best throughput so far and the number of data points we have tried. Every once in a * while when we have the minimum number of data points we check whether the eviction throughput * achieved with the current number of workers is the best we have seen so far. If so, we will * keep increasing the number of workers. If not, we are past the infliction point on the * eviction throughput curve. In that case, we will set the number of workers to the best * observed so far and settle into a stable state. */ static void __evict_tune_workers(WT_SESSION_IMPL *session) { struct timespec current_time; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; uint64_t delta_msec, delta_pages; uint64_t eviction_progress, eviction_progress_rate, time_diff; int32_t cur_threads, i, target_threads, thread_surplus; conn = S2C(session); cache = conn->cache; /* * If we have a fixed number of eviction threads, there is no value in calculating if we should * do any tuning. 
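 *     In terms of the constants above: progress is sampled at most once per EVICT_TUNE_PERIOD
 *     (60ms), at least EVICT_TUNE_DATAPT_MIN (8) samples are gathered before judging a worker
 *     count, at most EVICT_TUNE_BATCH (1) worker is added per adjustment, and once a stable
 *     state is reached the experiment is repeated after EVICT_FORCE_RETUNE (25 seconds) to
 *     adapt to workload phase changes.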
*/ if (conn->evict_threads_max == conn->evict_threads_min) return; __wt_epoch(session, &current_time); time_diff = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time); /* * If we have reached the stable state and have not run long enough to surpass the forced * re-tuning threshold, return. */ if (cache->evict_tune_stable) { if (time_diff < EVICT_FORCE_RETUNE) return; /* * Stable state was reached a long time ago. Let's re-tune. Reset all the state. */ cache->evict_tune_stable = false; cache->evict_tune_last_action_time.tv_sec = 0; cache->evict_tune_progress_last = 0; cache->evict_tune_num_points = 0; cache->evict_tune_progress_rate_max = 0; /* Reduce the number of eviction workers by one */ thread_surplus = (int32_t)conn->evict_threads.current_threads - (int32_t)conn->evict_threads_min; if (thread_surplus > 0) { __wt_thread_group_stop_one(session, &conn->evict_threads); WT_STAT_CONN_INCR(session, cache_eviction_worker_removed); } WT_STAT_CONN_INCR(session, cache_eviction_force_retune); } else if (time_diff < EVICT_TUNE_PERIOD) /* * If we have not reached stable state, don't do anything unless enough time has passed * since the last time we have taken any action in this function. */ return; /* * Measure the evicted progress so far. Eviction rate correlates to performance, so this is our * metric of success. */ eviction_progress = cache->eviction_progress; /* * If we have recorded the number of pages evicted at the end of the previous measurement * interval, we can compute the eviction rate in evicted pages per second achieved during the * current measurement interval. Otherwise, we just record the number of evicted pages and * return. */ if (cache->evict_tune_progress_last == 0) goto done; delta_msec = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time); delta_pages = eviction_progress - cache->evict_tune_progress_last; eviction_progress_rate = (delta_pages * WT_THOUSAND) / delta_msec; cache->evict_tune_num_points++; /* * Keep track of the maximum eviction throughput seen and the number of workers corresponding to * that throughput. */ if (eviction_progress_rate > cache->evict_tune_progress_rate_max) { cache->evict_tune_progress_rate_max = eviction_progress_rate; cache->evict_tune_workers_best = conn->evict_threads.current_threads; } /* * Compare the current number of data points with the number needed variable. If they are equal, * we will check whether we are still going up on the performance curve, in which case we will * increase the number of needed data points, to provide opportunity for further increasing the * number of workers. Or we are past the inflection point on the curve, in which case we will go * back to the best observed number of workers and settle into a stable state. */ if (cache->evict_tune_num_points >= cache->evict_tune_datapts_needed) { if (cache->evict_tune_workers_best == conn->evict_threads.current_threads && conn->evict_threads.current_threads < conn->evict_threads_max) { /* * Keep adding workers. We will check again at the next check point. */ cache->evict_tune_datapts_needed += WT_MIN(EVICT_TUNE_DATAPT_MIN, (conn->evict_threads_max - conn->evict_threads.current_threads) / EVICT_TUNE_BATCH); } else { /* * We are past the inflection point. Choose the best number of eviction workers observed * and settle into a stable state.
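 * For illustration with hypothetical numbers: if six workers are currently running but the best
 * throughput was observed with four, the loop below stops two of them.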
*/ thread_surplus = (int32_t)conn->evict_threads.current_threads - (int32_t)cache->evict_tune_workers_best; for (i = 0; i < thread_surplus; i++) { __wt_thread_group_stop_one(session, &conn->evict_threads); WT_STAT_CONN_INCR(session, cache_eviction_worker_removed); } cache->evict_tune_stable = true; goto done; } } /* * If we have not added any worker threads in the past, we set the number of data points needed * equal to the number of data points that we must accumulate before deciding if we should keep * adding workers or settle on a previously tried stable number of workers. */ if (cache->evict_tune_last_action_time.tv_sec == 0) cache->evict_tune_datapts_needed = EVICT_TUNE_DATAPT_MIN; if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) { cur_threads = (int32_t)conn->evict_threads.current_threads; target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH, (int32_t)conn->evict_threads_max); /* * Start the new threads. */ for (i = cur_threads; i < target_threads; ++i) { __wt_thread_group_start_one(session, &conn->evict_threads, false); WT_STAT_CONN_INCR(session, cache_eviction_worker_created); __wt_verbose(session, WT_VERB_EVICTSERVER, "%s", "added worker thread"); } cache->evict_tune_last_action_time = current_time; } done: cache->evict_tune_last_time = current_time; cache->evict_tune_progress_last = eviction_progress; } /* * __evict_lru_pages -- * Get pages from the LRU queue to evict. */ static int __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TRACK_OP_DECL; WT_TRACK_OP_INIT(session); conn = S2C(session); /* * Reconcile and discard some pages: EBUSY is returned if a page fails eviction because it's * unavailable, continue in that case. */ while (F_ISSET(conn, WT_CONN_EVICTION_RUN) && ret == 0) if ((ret = __evict_page(session, is_server)) == EBUSY) ret = 0; /* If any resources are pinned, release them now. */ WT_TRET(__wt_session_release_resources(session)); /* If a worker thread found the queue empty, pause. */ if (ret == WT_NOTFOUND && !is_server && F_ISSET(conn, WT_CONN_EVICTION_RUN)) __wt_cond_wait(session, conn->evict_threads.wait_cond, 10 * WT_THOUSAND, NULL); WT_TRACK_OP_END(session); return (ret == WT_NOTFOUND ? 0 : ret); } /* * __evict_lru_walk -- * Add pages to the LRU queue to be evicted from cache. */ static int __evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_QUEUE *queue, *other_queue; WT_TRACK_OP_DECL; uint64_t read_gen_oldest; uint32_t candidates, entries; WT_TRACK_OP_INIT(session); conn = S2C(session); cache = conn->cache; /* Age out the score of how much the queue has been empty recently. */ if (cache->evict_empty_score > 0) --cache->evict_empty_score; /* Fill the next queue (that isn't the urgent queue). */ queue = cache->evict_fill_queue; other_queue = cache->evict_queues + (1 - (queue - cache->evict_queues)); cache->evict_fill_queue = other_queue; /* If this queue is full, try the other one. */ if (__evict_queue_full(queue) && !__evict_queue_full(other_queue)) queue = other_queue; /* * If both queues are full and haven't been empty on recent refills, we're done. */ if (__evict_queue_full(queue) && cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF) goto err; /* * If the queue we are filling is empty, pages are being requested faster than they are being * queued. 
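 * When that happens under hard cache pressure, the empty score below is bumped toward
 * WT_EVICT_SCORE_MAX; it ages out by one at the start of each walk. A persistently high score
 * therefore means the queues keep running dry, and __evict_pass only diverts a single-threaded
 * eviction server into evicting pages itself once the score drops below WT_EVICT_SCORE_CUTOFF
 * or the urgent queue has work.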
*/ if (__evict_queue_empty(queue, false)) { if (F_ISSET(cache, WT_CACHE_EVICT_HARD)) cache->evict_empty_score = WT_MIN(cache->evict_empty_score + WT_EVICT_SCORE_BUMP, WT_EVICT_SCORE_MAX); WT_STAT_CONN_INCR(session, cache_eviction_queue_empty); } else WT_STAT_CONN_INCR(session, cache_eviction_queue_not_empty); /* * Get some more pages to consider for eviction. * * If the walk is interrupted, we still need to sort the queue: the next walk assumes there are * no entries beyond WT_EVICT_WALK_BASE. */ if ((ret = __evict_walk(cache->walk_session, queue)) == EBUSY) ret = 0; WT_ERR_NOTFOUND_OK(ret, false); /* Sort the list into LRU order and restart. */ __wt_spin_lock(session, &queue->evict_lock); /* * We have locked the queue: in the (unusual) case where we are filling the current queue, mark * it empty so that subsequent requests switch to the other queue. */ if (queue == cache->evict_current_queue) queue->evict_current = NULL; entries = queue->evict_entries; /* * Style note: __wt_qsort is a macro that can leave a dangling else. Full curly braces are * needed here for the compiler. */ if (FLD_ISSET(conn->debug_flags, WT_CONN_DEBUG_EVICT_AGGRESSIVE_MODE)) { __wt_qsort(queue->evict_queue, entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp_debug); } else { __wt_qsort(queue->evict_queue, entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp); } /* Trim empty entries from the end. */ while (entries > 0 && queue->evict_queue[entries - 1].ref == NULL) --entries; /* * If we have more entries than the maximum tracked between walks, clear them. Do this before * figuring out how many of the entries are candidates so we never end up with more candidates * than entries. */ while (entries > WT_EVICT_WALK_BASE) __evict_list_clear(session, &queue->evict_queue[--entries]); queue->evict_entries = entries; if (entries == 0) { /* * If there are no entries, there cannot be any candidates. Make sure application threads * don't read past the end of the candidate list, or they may race with the next walk. */ queue->evict_candidates = 0; queue->evict_current = NULL; __wt_spin_unlock(session, &queue->evict_lock); goto err; } /* Decide how many of the candidates we're going to try and evict. */ if (__wt_cache_aggressive(session)) queue->evict_candidates = entries; else { /* * Find the oldest read generation apart that we have in the queue, used to set the initial * value for pages read into the system. The queue is sorted, find the first "normal" * generation. */ read_gen_oldest = WT_READGEN_START_VALUE; for (candidates = 0; candidates < entries; ++candidates) { read_gen_oldest = queue->evict_queue[candidates].score; if (!WT_READGEN_EVICT_SOON(read_gen_oldest)) break; } /* * Take all candidates if we only gathered pages with an oldest * read generation set. * * We normally never take more than 50% of the entries but if * 50% of the entries were at the oldest read generation, take * all of them. */ if (WT_READGEN_EVICT_SOON(read_gen_oldest)) queue->evict_candidates = entries; else if (candidates > entries / 2) queue->evict_candidates = candidates; else { /* * Take all of the urgent pages plus a third of ordinary candidates (which could be * expressed as WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the steady state, we want * to get as many candidates as the eviction walk adds to the queue. * * That said, if there is only one entry, which is normal when populating an empty file, * don't exclude it. 
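 * For illustration with hypothetical numbers: given entries == 100 of which candidates == 10
 * carry an "evict soon" read generation, the assignment below keeps 1 + 10 + (100 - 10 - 1) / 3
 * == 40 candidates; with a single ordinary entry it keeps exactly that one entry.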
*/ queue->evict_candidates = 1 + candidates + ((entries - candidates) - 1) / 3; cache->read_gen_oldest = read_gen_oldest; } } WT_STAT_CONN_INCRV(session, cache_eviction_pages_queued_post_lru, queue->evict_candidates); queue->evict_current = queue->evict_queue; __wt_spin_unlock(session, &queue->evict_lock); /* * Signal any application or helper threads that may be waiting to help with eviction. */ __wt_cond_signal(session, conn->evict_threads.wait_cond); err: WT_TRACK_OP_END(session); return (ret); } /* * __evict_walk_choose_dhandle -- * Randomly select a dhandle for the next eviction walk */ static void __evict_walk_choose_dhandle(WT_SESSION_IMPL *session, WT_DATA_HANDLE **dhandle_p) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; u_int dh_bucket_count, rnd_bucket, rnd_dh; conn = S2C(session); WT_ASSERT(session, __wt_rwlock_islocked(session, &conn->dhandle_lock)); #undef RANDOM_DH_SELECTION_ENABLED #ifdef RANDOM_DH_SELECTION_ENABLED *dhandle_p = NULL; /* * If we don't have many dhandles, most hash buckets will be empty. Just pick a random dhandle * from the list in that case. */ if (conn->dhandle_count < conn->dh_hash_size / 4) { rnd_dh = __wt_random(&session->rnd) % conn->dhandle_count; dhandle = TAILQ_FIRST(&conn->dhqh); for (; rnd_dh > 0; rnd_dh--) dhandle = TAILQ_NEXT(dhandle, q); *dhandle_p = dhandle; return; } /* * Keep picking up a random bucket until we find one that is not empty. */ do { rnd_bucket = __wt_random(&session->rnd) & (conn->dh_hash_size - 1); } while ((dh_bucket_count = conn->dh_bucket_count[rnd_bucket]) == 0); /* We can't pick up an empty bucket with a non zero bucket count. */ WT_ASSERT(session, !TAILQ_EMPTY(&conn->dhhash[rnd_bucket])); /* Pick a random dhandle in the chosen bucket. */ rnd_dh = __wt_random(&session->rnd) % dh_bucket_count; dhandle = TAILQ_FIRST(&conn->dhhash[rnd_bucket]); for (; rnd_dh > 0; rnd_dh--) dhandle = TAILQ_NEXT(dhandle, hashq); #else /* Just step through dhandles. */ dhandle = *dhandle_p; if (dhandle != NULL) dhandle = TAILQ_NEXT(dhandle, q); if (dhandle == NULL) dhandle = TAILQ_FIRST(&conn->dhqh); WT_UNUSED(dh_bucket_count); WT_UNUSED(rnd_bucket); WT_UNUSED(rnd_dh); #endif *dhandle_p = dhandle; } /* * __evict_walk -- * Fill in the array by walking the next set of pages. */ static int __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) { WT_BTREE *btree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_TRACK_OP_DECL; u_int loop_count, max_entries, retries, slot, start_slot; u_int total_candidates; bool dhandle_locked, incr; WT_TRACK_OP_INIT(session); conn = S2C(session); cache = conn->cache; btree = NULL; dhandle = NULL; dhandle_locked = incr = false; retries = 0; /* * Set the starting slot in the queue and the maximum pages added per walk. */ start_slot = slot = queue->evict_entries; max_entries = WT_MIN(slot + WT_EVICT_WALK_INCR, cache->evict_slots); /* * Another pathological case: if there are only a tiny number of candidate pages in cache, don't * put all of them on one queue. */ total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_UPDATES) ? __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf); max_entries = WT_MIN(max_entries, 1 + total_candidates / 2); retry: loop_count = 0; while (slot < max_entries && loop_count++ < conn->dhandle_count) { /* We're done if shutting down or reconfiguring. 
*/ if (F_ISSET(conn, WT_CONN_CLOSING) || F_ISSET(conn, WT_CONN_RECONFIGURING)) break; /* * If another thread is waiting on the eviction server to clear the walk point in a tree, * give up. */ if (cache->pass_intr != 0) WT_ERR(EBUSY); /* * Lock the dhandle list to find the next handle and bump its reference count to keep it * alive while we sweep. */ if (!dhandle_locked) { WT_ERR(__evict_lock_handle_list(session)); dhandle_locked = true; } if (dhandle == NULL) { /* * On entry, continue from wherever we got to in the scan last time through. If we don't * have a saved handle, pick one randomly from the list. */ if ((dhandle = cache->walk_tree) != NULL) cache->walk_tree = NULL; else __evict_walk_choose_dhandle(session, &dhandle); } else { if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32(&dhandle->session_inuse, 1); incr = false; cache->walk_tree = NULL; } __evict_walk_choose_dhandle(session, &dhandle); } /* If we couldn't find any dhandle, we're done. */ if (dhandle == NULL) break; /* Ignore non-btree handles, or handles that aren't open. */ if (!WT_DHANDLE_BTREE(dhandle) || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; /* Skip files that don't allow eviction. */ btree = dhandle->handle; if (btree->evict_disabled > 0) continue; /* * Skip files that are checkpointing if we are only looking for dirty pages. */ if (WT_BTREE_SYNCING(btree) && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_UPDATES)) continue; /* * Skip files that are configured to stick in cache until we become aggressive. * * If the file is contributing heavily to our cache usage then ignore the "stickiness" of * its pages. */ if (btree->evict_priority != 0 && !__wt_cache_aggressive(session) && !__wt_btree_dominating_cache(session, btree)) continue; /* * Skip files if we have too many active walks. * * This used to be limited by the configured maximum number of hazard pointers per session. * Even though that ceiling has been removed, we need to test eviction with huge numbers of * active trees before allowing larger numbers of hazard pointers in the walk session. */ if (btree->evict_ref == NULL && session->nhazard > WT_EVICT_MAX_TREES) continue; /* * If we are filling the queue, skip files that haven't been useful in the past. */ if (btree->evict_walk_period != 0 && btree->evict_walk_skips++ < btree->evict_walk_period) continue; btree->evict_walk_skips = 0; (void)__wt_atomic_addi32(&dhandle->session_inuse, 1); incr = true; __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; /* * Re-check the "no eviction" flag, used to enforce exclusive access when a handle is being * closed. * * Only try to acquire the lock and simply continue if we fail; the lock is held while the * thread turning off eviction clears the tree's current eviction point, and part of the * process is waiting on this thread to acknowledge that action. * * If a handle is being discarded, it will still be marked open, but won't have a root page. */ if (btree->evict_disabled == 0 && !__wt_spin_trylock(session, &cache->evict_walk_lock)) { if (btree->evict_disabled == 0 && btree->root.page != NULL) { /* * Remember the file to visit first, next loop. 
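 * Saving the dhandle in cache->walk_tree means the next call to __evict_walk resumes with this
 * tree instead of restarting the handle scan, so a tree with many eviction candidates can be
 * revisited first on the next pass.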
*/ cache->walk_tree = dhandle; WT_WITH_DHANDLE( session, dhandle, ret = __evict_walk_tree(session, queue, max_entries, &slot)); WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0); } __wt_spin_unlock(session, &cache->evict_walk_lock); WT_ERR(ret); /* * If there is a checkpoint thread gathering handles, which means it is holding the * schema lock, then there is often contention on the evict walk lock with that thread. * If eviction is not in aggressive mode, sleep a bit to give the checkpoint thread a * chance to gather its handles. */ if (F_ISSET(conn, WT_CONN_CKPT_GATHER) && !__wt_cache_aggressive(session)) { __wt_sleep(0, 10); WT_STAT_CONN_INCR(session, cache_eviction_walk_sleeps); } } } if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32(&dhandle->session_inuse, 1); incr = false; } /* * Repeat the walks a few times if we don't find enough pages. Give up when we have some * candidates and we aren't finding more. */ if (slot < max_entries && (retries < 2 || (retries < WT_RETRY_MAX && (slot == queue->evict_entries || slot > start_slot)))) { start_slot = slot; ++retries; goto retry; } err: if (dhandle_locked) __wt_readunlock(session, &conn->dhandle_lock); if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32(&dhandle->session_inuse, 1); } /* * If we didn't find any entries on a walk when we weren't interrupted, let our caller know. */ if (queue->evict_entries == slot && cache->pass_intr == 0) ret = WT_NOTFOUND; queue->evict_entries = slot; WT_TRACK_OP_END(session); return (ret); } /* * __evict_push_candidate -- * Initialize a WT_EVICT_ENTRY structure with a given page. */ static bool __evict_push_candidate( WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, WT_EVICT_ENTRY *evict, WT_REF *ref) { uint16_t orig_flags, new_flags; u_int slot; /* * Threads can race to queue a page (e.g., an ordinary LRU walk can race with a page being * queued for urgent eviction). */ orig_flags = new_flags = ref->page->flags_atomic; FLD_SET(new_flags, WT_PAGE_EVICT_LRU); if (orig_flags == new_flags || !__wt_atomic_cas16(&ref->page->flags_atomic, orig_flags, new_flags)) return (false); /* Keep track of the maximum slot we are using. */ slot = (u_int)(evict - queue->evict_queue); if (slot >= queue->evict_max) queue->evict_max = slot + 1; if (evict->ref != NULL) __evict_list_clear(session, evict); evict->btree = S2BT(session); evict->ref = ref; evict->score = __evict_entry_priority(session, ref); /* Adjust for size when doing dirty eviction. */ if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY) && evict->score != WT_READGEN_OLDEST && evict->score != UINT64_MAX && !__wt_page_is_modified(ref->page)) evict->score += WT_MEGABYTE - WT_MIN(WT_MEGABYTE, ref->page->memory_footprint); return (true); } /* * __evict_walk_target -- * Calculate how many pages to queue for a given tree. */ static uint32_t __evict_walk_target(WT_SESSION_IMPL *session) { WT_CACHE *cache; uint64_t btree_inuse, bytes_per_slot, cache_inuse; uint32_t target_pages, target_pages_clean, target_pages_dirty, target_pages_updates; cache = S2C(session)->cache; target_pages_clean = target_pages_dirty = target_pages_updates = 0; /* * The minimum number of pages we should consider per tree. */ #define MIN_PAGES_PER_TREE 10 /* * The target number of pages for this tree is proportional to the space it is taking up in * cache. Round to the nearest number of slots so we assign all of the slots to a tree filling * 99+% of the cache (and only have to walk it once). 
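 * For illustration with hypothetical numbers: with 1,000 eviction slots and a tree holding 10%
 * of the evictable bytes in cache, bytes_per_slot is roughly cache_inuse / 1000 and the rounded
 * division below assigns the tree about 100 slots, while a tree occupying nearly the whole
 * cache is assigned nearly all of the slots and so is covered in a single walk.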
*/ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { btree_inuse = __wt_btree_bytes_evictable(session); cache_inuse = __wt_cache_bytes_inuse(cache); bytes_per_slot = 1 + cache_inuse / cache->evict_slots; target_pages_clean = (uint32_t)((btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) { btree_inuse = __wt_btree_dirty_leaf_inuse(session); cache_inuse = __wt_cache_dirty_leaf_inuse(cache); bytes_per_slot = 1 + cache_inuse / cache->evict_slots; target_pages_dirty = (uint32_t)((btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } if (F_ISSET(cache, WT_CACHE_EVICT_UPDATES)) { btree_inuse = __wt_btree_bytes_updates(session); cache_inuse = __wt_cache_bytes_updates(cache); bytes_per_slot = 1 + cache_inuse / cache->evict_slots; target_pages_updates = (uint32_t)((btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } target_pages = WT_MAX(target_pages_clean, target_pages_dirty); target_pages = WT_MAX(target_pages, target_pages_updates); /* * Walk trees with a small fraction of the cache in case there are so many trees that none of * them use enough of the cache to be allocated slots. Only skip a tree if it has no bytes of * interest. */ if (target_pages == 0) { btree_inuse = F_ISSET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_UPDATES) ? __wt_btree_bytes_evictable(session) : __wt_btree_dirty_leaf_inuse(session); if (btree_inuse == 0) return (0); } /* * There is some cost associated with walking a tree. If we're going to visit this tree, always * look for a minimum number of pages. */ if (target_pages < MIN_PAGES_PER_TREE) target_pages = MIN_PAGES_PER_TREE; /* If the tree is dead, take a lot of pages. */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) target_pages *= 10; return (target_pages); } /* * __evict_walk_tree -- * Get a few page eviction candidates from a single underlying file. */ static int __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_PAGE *last_parent, *page; WT_REF *ref; uint64_t internal_pages_already_queued, internal_pages_queued, internal_pages_seen; uint64_t min_pages, pages_already_queued, pages_seen, pages_queued, refs_walked; uint32_t read_flags, remaining_slots, target_pages, walk_flags; int restarts; bool give_up, modified, urgent_queued, want_page; conn = S2C(session); btree = S2BT(session); cache = conn->cache; last_parent = NULL; restarts = 0; give_up = urgent_queued = false; /* * Figure out how many slots to fill from this tree. Note that some care is taken in the * calculation to avoid overflow. */ start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; if (btree->evict_walk_progress >= btree->evict_walk_target) { btree->evict_walk_target = __evict_walk_target(session); btree->evict_walk_progress = 0; } target_pages = btree->evict_walk_target - btree->evict_walk_progress; if (target_pages > remaining_slots) target_pages = remaining_slots; /* * Reduce the number of pages to be selected from btrees other than the history store (HS) if * the cache pressure is high and HS content dominates the cache. Evicting unclean non-HS pages * can generate even more HS content and will not help with the cache pressure, and will * probably just amplify it further. */ if (!WT_IS_HS(btree->dhandle) && __wt_cache_hs_dirty(session)) { /* If target pages are less than 10, keep it like that. 
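 * That is, a target of 50 pages drops to 5 below, while a target already in the single digits
 * is left untouched so small trees still get visited.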
*/ if (target_pages >= 10) { target_pages = target_pages / 10; WT_STAT_CONN_DATA_INCR(session, cache_eviction_target_page_reduced); } } /* If we don't want any pages from this tree, move on. */ if (target_pages == 0) return (0); /* * These statistics generate a histogram of the number of pages targeted for eviction each * round. The range of values here start at MIN_PAGES_PER_TREE as this is the smallest number of * pages we can target, unless there are fewer slots available. The aim is to cover the likely * ranges of target pages in as few statistics as possible to reduce the overall overhead. */ if (target_pages < MIN_PAGES_PER_TREE) { WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt10); WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt10); } else if (target_pages < 32) { WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt32); WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt32); } else if (target_pages < 64) { WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt64); WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt64); } else if (target_pages < 128) { WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt128); WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt128); } else { WT_STAT_CONN_INCR(session, cache_eviction_target_page_ge128); WT_STAT_DATA_INCR(session, cache_eviction_target_page_ge128); } end = start + target_pages; /* * Examine at least a reasonable number of pages before deciding whether to give up. When we are * only looking for dirty pages, search the tree for longer. */ min_pages = 10 * (uint64_t)target_pages; if (!F_ISSET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_UPDATES)) WT_STAT_CONN_INCR(session, cache_eviction_target_strategy_clean); else if (!F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { min_pages *= 10; WT_STAT_CONN_INCR(session, cache_eviction_target_strategy_dirty); } else WT_STAT_CONN_INCR(session, cache_eviction_target_strategy_both_clean_and_dirty); if (btree->evict_ref == NULL) { WT_STAT_CONN_INCR(session, cache_eviction_walk_from_root); WT_STAT_DATA_INCR(session, cache_eviction_walk_from_root); } else { WT_STAT_CONN_INCR(session, cache_eviction_walk_saved_pos); WT_STAT_DATA_INCR(session, cache_eviction_walk_saved_pos); } walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; if (!F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)) walk_flags |= WT_READ_VISIBLE_ALL; /* * Choose a random point in the tree if looking for candidates in a tree with no starting point * set. This is mostly aimed at ensuring eviction fairly visits all pages in trees with a lot of * in-cache content. */ switch (btree->evict_start_type) { case WT_EVICT_WALK_NEXT: break; case WT_EVICT_WALK_PREV: FLD_SET(walk_flags, WT_READ_PREV); break; case WT_EVICT_WALK_RAND_PREV: FLD_SET(walk_flags, WT_READ_PREV); /* FALLTHROUGH */ case WT_EVICT_WALK_RAND_NEXT: read_flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK; if (btree->evict_ref == NULL) { for (;;) { /* Ensure internal pages indexes remain valid */ WT_WITH_PAGE_INDEX( session, ret = __wt_random_descent(session, &btree->evict_ref, read_flags)); if (ret != WT_RESTART) break; WT_STAT_CONN_INCR(session, cache_eviction_walk_restart); WT_STAT_DATA_INCR(session, cache_eviction_walk_restart); } WT_RET_NOTFOUND_OK(ret); } break; } /* * Get some more eviction candidate pages, starting at the last saved point. 
Clear the saved * point immediately, we assert when discarding pages we're not discarding an eviction point, so * this clear must be complete before the page is released. */ ref = btree->evict_ref; btree->evict_ref = NULL; /* * !!! Take care terminating this loop. * * Don't make an extra call to __wt_tree_walk after we hit the end of a * tree: that will leave a page pinned, which may prevent any work from * being done. * * Once we hit the page limit, do one more step through the walk in * case we are appending and only the last page in the file is live. */ internal_pages_already_queued = internal_pages_queued = internal_pages_seen = 0; for (evict = start, pages_already_queued = pages_queued = pages_seen = refs_walked = 0; evict < end && (ret == 0 || ret == WT_NOTFOUND); last_parent = ref == NULL ? NULL : ref->home, ret = __wt_tree_walk_count(session, &ref, &refs_walked, walk_flags)) { /* * Check whether we're finding a good ratio of candidates vs pages seen. Some workloads * create "deserts" in trees where no good eviction candidates can be found. Abandon the * walk if we get into that situation. */ give_up = !__wt_cache_aggressive(session) && !WT_IS_HS(btree->dhandle) && pages_seen > min_pages && (pages_queued == 0 || (pages_seen / pages_queued) > (min_pages / target_pages)); if (give_up) { /* * Try a different walk start point next time if a walk gave up. */ switch (btree->evict_start_type) { case WT_EVICT_WALK_NEXT: btree->evict_start_type = WT_EVICT_WALK_PREV; break; case WT_EVICT_WALK_PREV: btree->evict_start_type = WT_EVICT_WALK_RAND_PREV; break; case WT_EVICT_WALK_RAND_PREV: btree->evict_start_type = WT_EVICT_WALK_RAND_NEXT; break; case WT_EVICT_WALK_RAND_NEXT: btree->evict_start_type = WT_EVICT_WALK_NEXT; break; } /* * We differentiate the reasons we gave up on this walk and increment the stats * accordingly. */ if (pages_queued == 0) { WT_STAT_CONN_INCR(session, cache_eviction_walks_gave_up_no_targets); WT_STAT_DATA_INCR(session, cache_eviction_walks_gave_up_no_targets); } else { WT_STAT_CONN_INCR(session, cache_eviction_walks_gave_up_ratio); WT_STAT_DATA_INCR(session, cache_eviction_walks_gave_up_ratio); } break; } if (ref == NULL) { WT_STAT_CONN_INCR(session, cache_eviction_walks_ended); WT_STAT_DATA_INCR(session, cache_eviction_walks_ended); if (++restarts == 2) { WT_STAT_CONN_INCR(session, cache_eviction_walks_stopped); WT_STAT_DATA_INCR(session, cache_eviction_walks_stopped); break; } WT_STAT_CONN_INCR(session, cache_eviction_walks_started); continue; } ++pages_seen; /* Ignore root pages entirely. */ if (__wt_ref_is_root(ref)) continue; page = ref->page; modified = __wt_page_is_modified(page); page->evict_pass_gen = cache->evict_pass_gen; /* Count internal pages seen. */ if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) internal_pages_seen++; /* Use the EVICT_LRU flag to avoid putting pages onto the list multiple times. */ if (F_ISSET_ATOMIC_16(page, WT_PAGE_EVICT_LRU)) { pages_already_queued++; if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) internal_pages_already_queued++; continue; } /* Don't queue dirty pages in trees during checkpoints. */ if (modified && WT_BTREE_SYNCING(btree)) continue; /* * It's possible (but unlikely) to visit a page without a read generation, if we race with * the read instantiating the page. Set the page's read generation here to ensure a bug * doesn't somehow leave a page without a read generation. */ if (page->read_gen == WT_READGEN_NOTSET) __wt_cache_read_gen_new(session, page); /* Pages being forcibly evicted go on the urgent queue. 
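Here that means a dirty page whose read generation is already marked oldest, or whose in-memory footprint has reached the tree's split threshold.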
*/ if (modified && (page->read_gen == WT_READGEN_OLDEST || page->memory_footprint >= btree->splitmempage)) { WT_STAT_CONN_INCR(session, cache_eviction_pages_queued_oldest); if (__wt_page_evict_urgent(session, ref)) urgent_queued = true; continue; } /* * If history store dirty content is dominating the cache, we want to prioritize evicting * history store pages over other btree pages. This helps in keeping cache contents below * the configured cache size during checkpoints where reconciling non-HS pages can generate * significant amount of HS dirty content very quickly. */ if (WT_IS_HS(btree->dhandle) && __wt_cache_hs_dirty(session)) { WT_STAT_CONN_INCR(session, cache_eviction_pages_queued_urgent_hs_dirty); if (__wt_page_evict_urgent(session, ref)) urgent_queued = true; continue; } /* Pages that are empty or from dead trees are fast-tracked. */ if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) goto fast; /* * Do not evict a clean metadata page that contains historical data needed to satisfy a * reader. Since there is no history store for metadata, we won't be able to serve an older * reader if we evict this page. */ if (WT_IS_METADATA(session->dhandle) && F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD) && F_ISSET(ref, WT_REF_FLAG_LEAF) && !modified && page->modify != NULL && !__wt_txn_visible_all( session, page->modify->rec_max_txn, page->modify->rec_max_timestamp)) continue; /* Skip pages we don't want. */ want_page = (F_ISSET(cache, WT_CACHE_EVICT_CLEAN) && !modified) || (F_ISSET(cache, WT_CACHE_EVICT_DIRTY) && modified) || (F_ISSET(cache, WT_CACHE_EVICT_UPDATES) && page->modify != NULL); if (!want_page) continue; /* * Don't attempt eviction of internal pages with children in cache (indicated by seeing an * internal page that is the parent of the last page we saw). * * Also skip internal page unless we get aggressive, the tree is idle (indicated by the tree * being skipped for walks), or we are in eviction debug mode. The goal here is that if * trees become completely idle, we eventually push them out of cache completely. */ if (!FLD_ISSET(conn->debug_flags, WT_CONN_DEBUG_EVICT_AGGRESSIVE_MODE) && F_ISSET(ref, WT_REF_FLAG_INTERNAL)) { if (page == last_parent) continue; if (btree->evict_walk_period == 0 && !__wt_cache_aggressive(session)) continue; } /* If eviction gets aggressive, anything else is fair game. */ if (__wt_cache_aggressive(session)) goto fast; /* * If the global transaction state hasn't changed since the last time we tried eviction, * it's unlikely we can make progress. Similarly, if the most recent update on the page is * not yet globally visible, eviction will fail. This heuristic avoids repeated attempts to * evict the same page. */ if (!__wt_page_evict_retry(session, page) || (modified && page->modify->update_txn >= conn->txn_global.last_running)) continue; fast: /* If the page can't be evicted, give up. */ if (!__wt_page_can_evict(session, ref, NULL)) continue; WT_ASSERT(session, evict->ref == NULL); if (!__evict_push_candidate(session, queue, evict, ref)) continue; ++evict; ++pages_queued; ++btree->evict_walk_progress; /* Count internal pages queued. 
*/ if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) internal_pages_queued++; __wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" WT_SIZET_FMT, (void *)page, page->memory_footprint); } WT_RET_NOTFOUND_OK(ret); *slotp += (u_int)(evict - start); WT_STAT_CONN_INCRV(session, cache_eviction_pages_queued, (u_int)(evict - start)); __wt_verbose_debug2(session, WT_VERB_EVICTSERVER, "%s walk: seen %" PRIu64 ", queued %" PRIu64, session->dhandle->name, pages_seen, pages_queued); /* * If we couldn't find the number of pages we were looking for, skip the tree next time. */ if (pages_queued < target_pages / 2 && !urgent_queued) btree->evict_walk_period = WT_MIN(WT_MAX(1, 2 * btree->evict_walk_period), 100); else if (pages_queued == target_pages) { btree->evict_walk_period = 0; /* * If there's a chance the Btree was fully evicted, update the evicted flag in the handle. */ if (__wt_btree_bytes_evictable(session) == 0) F_SET(session->dhandle, WT_DHANDLE_EVICTED); } else if (btree->evict_walk_period > 0) btree->evict_walk_period /= 2; /* * Give up the walk occasionally. * * If we happen to end up on the root page or a page requiring urgent eviction, clear it. We * have to track hazard pointers, and the root page complicates that calculation. * * Likewise if we found no new candidates during the walk: there is no point keeping a page * pinned, since it may be the only candidate in an idle tree. * * If we land on a page requiring forced eviction, or that isn't an ordinary in-memory page, * move until we find an ordinary page: we should not prevent exclusive access to the page until * the next walk. */ if (ref != NULL) { if (__wt_ref_is_root(ref) || evict == start || give_up || ref->page->memory_footprint >= btree->splitmempage) { if (restarts == 0) WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); WT_RET(__wt_page_release(cache->walk_session, ref, walk_flags)); ref = NULL; } else while (ref != NULL && (ref->state != WT_REF_MEM || WT_READGEN_EVICT_SOON(ref->page->read_gen))) WT_RET_NOTFOUND_OK(__wt_tree_walk_count(session, &ref, &refs_walked, walk_flags)); btree->evict_ref = ref; } WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked); WT_STAT_CONN_DATA_INCRV(session, cache_eviction_pages_seen, pages_seen); WT_STAT_CONN_INCRV(session, cache_eviction_pages_already_queued, pages_already_queued); WT_STAT_CONN_INCRV(session, cache_eviction_internal_pages_seen, internal_pages_seen); WT_STAT_CONN_INCRV( session, cache_eviction_internal_pages_already_queued, internal_pages_already_queued); WT_STAT_CONN_INCRV(session, cache_eviction_internal_pages_queued, internal_pages_queued); WT_STAT_CONN_DATA_INCR(session, cache_eviction_walk_passes); return (0); } /* * __evict_get_ref -- * Get a page for eviction. */ static int __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_REF **refp, uint8_t *previous_statep) { WT_CACHE *cache; WT_EVICT_ENTRY *evict; WT_EVICT_QUEUE *queue, *other_queue, *urgent_queue; uint32_t candidates; uint8_t previous_state; bool is_app, server_only, urgent_ok; *btreep = NULL; /* * It is polite to initialize output variables, but it isn't safe for callers to use the * previous state if we don't return a locked ref. 
*/ *previous_statep = WT_REF_MEM; *refp = NULL; cache = S2C(session)->cache; is_app = !F_ISSET(session, WT_SESSION_INTERNAL); server_only = is_server && !WT_EVICT_HAS_WORKERS(session); /* Application threads do eviction when cache is full of dirty data */ urgent_ok = (!is_app && !is_server) || !WT_EVICT_HAS_WORKERS(session) || (is_app && F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)); urgent_queue = cache->evict_urgent_queue; WT_STAT_CONN_INCR(session, cache_eviction_get_ref); /* Avoid the LRU lock if no pages are available. */ if (__evict_queue_empty(cache->evict_current_queue, is_server) && __evict_queue_empty(cache->evict_other_queue, is_server) && (!urgent_ok || __evict_queue_empty(urgent_queue, false))) { WT_STAT_CONN_INCR(session, cache_eviction_get_ref_empty); return (WT_NOTFOUND); } /* * The server repopulates whenever the other queue is not full, as long as at least one page has * been evicted out of the current queue. * * Note that there are pathological cases where there are only enough eviction candidates in the * cache to fill one queue. In that case, we will continually evict one page and attempt to * refill the queues. Such cases are extremely rare in real applications. */ if (is_server && (!urgent_ok || __evict_queue_empty(urgent_queue, false)) && !__evict_queue_full(cache->evict_current_queue) && !__evict_queue_full(cache->evict_fill_queue) && (cache->evict_empty_score > WT_EVICT_SCORE_CUTOFF || __evict_queue_empty(cache->evict_fill_queue, false))) return (WT_NOTFOUND); __wt_spin_lock(session, &cache->evict_queue_lock); /* Check the urgent queue first. */ if (urgent_ok && !__evict_queue_empty(urgent_queue, false)) queue = urgent_queue; else { /* * Check if the current queue needs to change. * * The server will only evict half of the pages before looking for more, but should only * switch queues if there are no other eviction workers. */ queue = cache->evict_current_queue; other_queue = cache->evict_other_queue; if (__evict_queue_empty(queue, server_only) && !__evict_queue_empty(other_queue, server_only)) { cache->evict_current_queue = other_queue; cache->evict_other_queue = queue; } } __wt_spin_unlock(session, &cache->evict_queue_lock); /* * We got the queue lock, which should be fast, and chose a queue. Now we want to get the lock * on the individual queue. */ for (;;) { /* Verify there are still pages available. */ if (__evict_queue_empty(queue, is_server && queue != urgent_queue)) { WT_STAT_CONN_INCR(session, cache_eviction_get_ref_empty2); return (WT_NOTFOUND); } if (!is_server) __wt_spin_lock(session, &queue->evict_lock); else if (__wt_spin_trylock(session, &queue->evict_lock) != 0) continue; break; } /* * Only evict half of the pages before looking for more. The remainder are left to eviction * workers (if configured), or application thread if necessary. */ candidates = queue->evict_candidates; if (is_server && queue != urgent_queue && candidates > 1) candidates /= 2; /* Get the next page queued for eviction. */ for (evict = queue->evict_current; evict >= queue->evict_queue && evict < queue->evict_queue + candidates; ++evict) { if (evict->ref == NULL) continue; WT_ASSERT(session, evict->btree != NULL); /* * Evicting a dirty page in the server thread could stall during a write and prevent * eviction from finding new work. * * However, we can't skip entries in the urgent queue or they may never be found again. * * Don't force application threads to evict dirty pages if they aren't stalled by the amount * of dirty data in cache. 
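* * When we skip a dirty page for these reasons, back up one entry before leaving the loop so the * queue's current pointer stays on the skipped page for another thread to find.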
*/ if (!urgent_ok && (is_server || !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD | WT_CACHE_EVICT_UPDATES_HARD)) && __wt_page_is_modified(evict->ref->page)) { --evict; break; } /* * Lock the page while holding the eviction mutex to prevent multiple attempts to evict it. * For pages that are already being evicted, this operation will fail and we will move on. */ if ((previous_state = evict->ref->state) != WT_REF_MEM || !WT_REF_CAS_STATE(session, evict->ref, previous_state, WT_REF_LOCKED)) { __evict_list_clear(session, evict); continue; } /* * Increment the busy count in the btree handle to prevent it from being closed under us. */ (void)__wt_atomic_addv32(&evict->btree->evict_busy, 1); *btreep = evict->btree; *refp = evict->ref; *previous_statep = previous_state; /* * Remove the entry so we never try to reconcile the same page on reconciliation error. */ __evict_list_clear(session, evict); break; } /* Move to the next item. */ if (evict != NULL && evict + 1 < queue->evict_queue + queue->evict_candidates) queue->evict_current = evict + 1; else /* Clear the current pointer if there are no more candidates. */ queue->evict_current = NULL; __wt_spin_unlock(session, &queue->evict_lock); return (*refp == NULL ? WT_NOTFOUND : 0); } /* * __evict_page -- * Called by both eviction and application threads to evict a page. */ static int __evict_page(WT_SESSION_IMPL *session, bool is_server) { WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_REF *ref; WT_TRACK_OP_DECL; uint64_t time_start, time_stop; uint32_t flags; uint8_t previous_state; WT_TRACK_OP_INIT(session); WT_RET_TRACK(__evict_get_ref(session, is_server, &btree, &ref, &previous_state)); WT_ASSERT(session, ref->state == WT_REF_LOCKED); cache = S2C(session)->cache; time_start = 0; flags = 0; /* * An internal session flags either the server itself or an eviction worker thread. */ if (is_server) WT_STAT_CONN_INCR(session, cache_eviction_server_evicting); else if (F_ISSET(session, WT_SESSION_INTERNAL)) WT_STAT_CONN_INCR(session, cache_eviction_worker_evicting); else { if (__wt_page_is_modified(ref->page)) WT_STAT_CONN_INCR(session, cache_eviction_app_dirty); WT_STAT_CONN_INCR(session, cache_eviction_app); cache->app_evicts++; time_start = WT_STAT_ENABLED(session) ? __wt_clock(session) : 0; } /* * In case something goes wrong, don't pick the same set of pages every time. * * We used to bump the page's read generation only if eviction failed, but that isn't safe: at * that point, eviction has already unlocked the page and some other thread may have evicted it * by the time we look at it. */ __wt_cache_read_gen_bump(session, ref->page); WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, previous_state, flags)); (void)__wt_atomic_subv32(&btree->evict_busy, 1); if (time_start != 0) { time_stop = __wt_clock(session); WT_STAT_CONN_INCRV(session, application_evict_time, WT_CLOCKDIFF_US(time_stop, time_start)); } WT_TRACK_OP_END(session); return (ret); } /* * __wt_cache_eviction_worker -- * Worker function for __wt_cache_eviction_check: evict pages if the cache crosses its * boundaries. 
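* * The calling thread loops here evicting pages until eviction is no longer needed, enough progress * has been made, or the operation times out. A minimal sketch of a call site (the named caller, * __wt_cache_eviction_check, lives outside this file, so the exact shape below is an assumption): * * if (__wt_eviction_needed(session, busy, readonly, &pct_full)) * WT_RET(__wt_cache_eviction_worker(session, busy, readonly, pct_full));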
*/ int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, double pct_full) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TRACK_OP_DECL; WT_TXN_GLOBAL *txn_global; WT_TXN_SHARED *txn_shared; uint64_t cache_max_wait_us, initial_progress, max_progress; uint64_t elapsed, time_start, time_stop; bool app_thread; WT_TRACK_OP_INIT(session); conn = S2C(session); cache = conn->cache; time_start = 0; txn_global = &conn->txn_global; txn_shared = WT_SESSION_TXN_SHARED(session); if (session->cache_max_wait_us != 0) cache_max_wait_us = session->cache_max_wait_us; else cache_max_wait_us = cache->cache_max_wait_us; /* * Before we enter the eviction generation, make sure this session has a cached history store * cursor; otherwise we can deadlock with a session wanting exclusive access to a handle: that * session will have a handle list write lock and will be waiting on eviction to drain, while we'll be * inside eviction waiting on a handle list read lock to open a history store cursor. */ WT_ERR(__wt_curhs_cache(session)); /* * It is not safe to proceed if the eviction server threads aren't set up yet. */ if (!conn->evict_server_running || (busy && pct_full < 100.0)) goto done; /* Wake the eviction server if we need to do work. */ __wt_evict_server_wake(session); /* Track how long application threads spend doing eviction. */ app_thread = !F_ISSET(session, WT_SESSION_INTERNAL); if (app_thread) time_start = __wt_clock(session); for (initial_progress = cache->eviction_progress;; ret = 0) { /* * If eviction is stuck, check if this thread is likely causing problems and should be * rolled back. Ignore if in recovery; those transactions can't be rolled back. */ if (!F_ISSET(conn, WT_CONN_RECOVERING) && __wt_cache_stuck(session)) { ret = __wt_txn_is_blocking(session); if (ret == WT_ROLLBACK) { --cache->evict_aggressive_score; WT_STAT_CONN_INCR(session, txn_rollback_oldest_pinned); __wt_verbose_debug1( session, WT_VERB_TRANSACTION, "%s", session->txn->rollback_reason); } WT_ERR(ret); } /* * Check if we've exceeded our operation timeout. This check would also run as part of the * __wt_txn_is_blocking call above; however, that path won't pick up transactions that have * been committed or rolled back, as their mod count is 0, and it requires the transaction to * be the oldest. * * Additionally, checking here means we don't return rollback, which could confuse the caller. */ if (__wt_op_timer_fired(session)) break; /* * Check if we have become busy. * * If we're busy (because of the transaction check we just did or because our caller is * waiting on a longer-than-usual event such as a page read), and the cache level drops * below 100%, limit the work to 5 evictions and return. If that's not the case, we can do * more. */ if (!busy && txn_shared->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) busy = true; max_progress = busy ? 5 : 20; /* See if eviction is still needed. */ if (!__wt_eviction_needed(session, busy, readonly, &pct_full) || (pct_full < 100.0 && (cache->eviction_progress > initial_progress + max_progress))) break; /* Evict a page. */ switch (ret = __evict_page(session, false)) { case 0: if (busy) goto err; /* FALLTHROUGH */ case EBUSY: break; case WT_NOTFOUND: /* Allow the queue to re-populate before retrying. */ __wt_cond_wait(session, conn->evict_threads.wait_cond, 10 * WT_THOUSAND, NULL); cache->app_waits++; break; default: goto err; } /* Stop if we've exceeded the timeout.
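The budget is the session's cache maximum wait time when one is configured, otherwise the connection-wide maximum.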
*/ if (time_start != 0 && cache_max_wait_us != 0) { time_stop = __wt_clock(session); if (session->cache_wait_us + WT_CLOCKDIFF_US(time_stop, time_start) > cache_max_wait_us) goto err; } } err: if (time_start != 0) { time_stop = __wt_clock(session); elapsed = WT_CLOCKDIFF_US(time_stop, time_start); WT_STAT_CONN_INCRV(session, application_cache_time, elapsed); WT_STAT_SESSION_INCRV(session, cache_time, elapsed); session->cache_wait_us += elapsed; if (cache_max_wait_us != 0 && session->cache_wait_us > cache_max_wait_us) { WT_TRET(__wt_txn_rollback_required(session, WT_TXN_ROLLBACK_REASON_CACHE_OVERFLOW)); --cache->evict_aggressive_score; WT_STAT_CONN_INCR(session, cache_timed_out_ops); __wt_verbose_notice(session, WT_VERB_TRANSACTION, "%s", session->txn->rollback_reason); } } done: WT_TRACK_OP_END(session); return (ret); } /* * __wt_page_evict_urgent -- * Set a page to be evicted as soon as possible. */ bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) { WT_CACHE *cache; WT_EVICT_ENTRY *evict; WT_EVICT_QUEUE *urgent_queue; WT_PAGE *page; bool queued; /* Root pages should never be evicted via LRU. */ WT_ASSERT(session, !__wt_ref_is_root(ref)); page = ref->page; if (S2BT(session)->evict_disabled > 0 || F_ISSET_ATOMIC_16(page, WT_PAGE_EVICT_LRU_URGENT)) return (false); cache = S2C(session)->cache; if (F_ISSET_ATOMIC_16(page, WT_PAGE_EVICT_LRU) && F_ISSET(cache, WT_CACHE_EVICT_ALL)) return (false); /* Append to the urgent queue if we can. */ urgent_queue = &cache->evict_queues[WT_EVICT_URGENT_QUEUE]; queued = false; __wt_spin_lock(session, &cache->evict_queue_lock); /* Check again, in case we raced with another thread. */ if (S2BT(session)->evict_disabled > 0 || F_ISSET_ATOMIC_16(page, WT_PAGE_EVICT_LRU_URGENT)) goto done; /* * If the page is already in the LRU eviction list, clear it from the list if eviction server is * not running. */ if (F_ISSET_ATOMIC_16(page, WT_PAGE_EVICT_LRU)) { if (!F_ISSET(cache, WT_CACHE_EVICT_ALL)) { __evict_list_clear_page_locked(session, ref, true); WT_STAT_CONN_INCR(session, cache_eviction_clear_ordinary); } else goto done; } __wt_spin_lock(session, &urgent_queue->evict_lock); if (__evict_queue_empty(urgent_queue, false)) { urgent_queue->evict_current = urgent_queue->evict_queue; urgent_queue->evict_candidates = 0; } evict = urgent_queue->evict_queue + urgent_queue->evict_candidates; if (evict < urgent_queue->evict_queue + cache->evict_slots && __evict_push_candidate(session, urgent_queue, evict, ref)) { ++urgent_queue->evict_candidates; queued = true; FLD_SET(page->flags_atomic, WT_PAGE_EVICT_LRU_URGENT); } __wt_spin_unlock(session, &urgent_queue->evict_lock); done: __wt_spin_unlock(session, &cache->evict_queue_lock); if (queued) { WT_STAT_CONN_INCR(session, cache_eviction_pages_queued_urgent); if (WT_EVICT_HAS_WORKERS(session)) __wt_cond_signal(session, S2C(session)->evict_threads.wait_cond); else __wt_evict_server_wake(session); } return (queued); } /* * __wt_evict_priority_set -- * Set a tree's eviction priority. */ void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) { S2BT(session)->evict_priority = v; } /* * __wt_evict_priority_clear -- * Clear a tree's eviction priority. */ void __wt_evict_priority_clear(WT_SESSION_IMPL *session) { S2BT(session)->evict_priority = 0; } /* * __verbose_dump_cache_single -- * Output diagnostic information about a single file in the cache. 
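* The tree walk below accumulates page counts, byte totals and maximum page sizes for internal and * leaf pages, split into clean and dirty, along with the bytes of updates on leaf pages, and reports * the totals in kilobytes.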
*/ static int __verbose_dump_cache_single(WT_SESSION_IMPL *session, uint64_t *total_bytesp, uint64_t *total_dirty_bytesp, uint64_t *total_updates_bytesp) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_PAGE *page; WT_REF *next_walk; size_t size; uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages, updates_bytes; intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; updates_bytes = 0; dhandle = session->dhandle; btree = dhandle->handle; WT_RET(__wt_msg(session, "%s(%s%s)%s%s:", dhandle->name, WT_DHANDLE_IS_CHECKPOINT(dhandle) ? "checkpoint=" : "", WT_DHANDLE_IS_CHECKPOINT(dhandle) ? dhandle->checkpoint : "", btree->evict_disabled != 0 ? " eviction disabled" : "", btree->evict_disabled_open ? " at open" : "")); /* * We cannot walk the tree of a dhandle held exclusively because the owning thread could be * manipulating it in a way that causes us to dump core. So print out that we visited and * skipped it. */ if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE)) return (__wt_msg(session, " handle opened exclusively, cannot walk tree, skipping")); next_walk = NULL; while (__wt_tree_walk(session, &next_walk, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT | WT_READ_VISIBLE_ALL) == 0 && next_walk != NULL) { page = next_walk->page; size = page->memory_footprint; if (F_ISSET(next_walk, WT_REF_FLAG_INTERNAL)) { ++intl_pages; intl_bytes += size; intl_bytes_max = WT_MAX(intl_bytes_max, size); if (__wt_page_is_modified(page)) { ++intl_dirty_pages; intl_dirty_bytes += size; intl_dirty_bytes_max = WT_MAX(intl_dirty_bytes_max, size); } } else { ++leaf_pages; leaf_bytes += size; leaf_bytes_max = WT_MAX(leaf_bytes_max, size); if (__wt_page_is_modified(page)) { ++leaf_dirty_pages; leaf_dirty_bytes += size; leaf_dirty_bytes_max = WT_MAX(leaf_dirty_bytes_max, size); } if (page->modify != NULL) updates_bytes += page->modify->bytes_updates; } } if (intl_pages == 0) WT_RET(__wt_msg(session, "internal: 0 pages")); else WT_RET( __wt_msg(session, "internal: " "%" PRIu64 " pages, %.2f KB, " "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " "%.2f/%.2f clean / dirty KB, " "%.2f KB max page, " "%.2f KB max dirty page ", intl_pages, (double)intl_bytes / WT_KILOBYTE, intl_pages - intl_dirty_pages, intl_dirty_pages, (double)(intl_bytes - intl_dirty_bytes) / WT_KILOBYTE, (double)intl_dirty_bytes / WT_KILOBYTE, (double)intl_bytes_max / WT_KILOBYTE, (double)intl_dirty_bytes_max / WT_KILOBYTE)); if (leaf_pages == 0) WT_RET(__wt_msg(session, "leaf: 0 pages")); else WT_RET( __wt_msg(session, "leaf: " "%" PRIu64 " pages, %.2f KB, " "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " "%.2f /%.2f /%.2f clean/dirty/updates KB, " "%.2f KB max page, " "%.2f KB max dirty page", leaf_pages, (double)leaf_bytes / WT_KILOBYTE, leaf_pages - leaf_dirty_pages, leaf_dirty_pages, (double)(leaf_bytes - leaf_dirty_bytes) / WT_KILOBYTE, (double)leaf_dirty_bytes / WT_KILOBYTE, (double)updates_bytes / WT_KILOBYTE, (double)leaf_bytes_max / WT_KILOBYTE, (double)leaf_dirty_bytes_max / WT_KILOBYTE)); *total_bytesp += intl_bytes + leaf_bytes; *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes; *total_updates_bytesp += updates_bytes; return (0); } /* * __verbose_dump_cache_apply -- * Apply dumping cache for all the dhandles. 
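* That is, walk the connection's data handle list and dump each open btree handle in turn, skipping * handles another thread has marked for discard.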
*/ static int __verbose_dump_cache_apply(WT_SESSION_IMPL *session, uint64_t *total_bytesp, uint64_t *total_dirty_bytesp, uint64_t *total_updates_bytesp) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; conn = S2C(session); for (dhandle = NULL;;) { WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q); if (dhandle == NULL) break; /* Skip if the tree is marked discarded by another thread. */ if (!WT_DHANDLE_BTREE(dhandle) || !F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DISCARD)) continue; WT_WITH_DHANDLE(session, dhandle, ret = __verbose_dump_cache_single( session, total_bytesp, total_dirty_bytesp, total_updates_bytesp)); if (ret != 0) WT_RET(ret); } return (0); } /* * __wt_verbose_dump_cache -- * Output diagnostic information about the cache. */ int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; double pct; uint64_t total_bytes, total_dirty_bytes, total_updates_bytes, cache_bytes_updates; bool needed; conn = S2C(session); cache = conn->cache; total_bytes = total_dirty_bytes = total_updates_bytes = cache_bytes_updates = 0; pct = 0.0; /* [-Werror=uninitialized] */ WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); WT_RET(__wt_msg(session, "cache dump")); WT_RET(__wt_msg(session, "cache full: %s", __wt_cache_full(session) ? "yes" : "no")); needed = __wt_eviction_clean_needed(session, &pct); WT_RET(__wt_msg(session, "cache clean check: %s (%2.3f%%)", needed ? "yes" : "no", pct)); needed = __wt_eviction_dirty_needed(session, &pct); WT_RET(__wt_msg(session, "cache dirty check: %s (%2.3f%%)", needed ? "yes" : "no", pct)); needed = __wt_eviction_updates_needed(session, &pct); WT_RET(__wt_msg(session, "cache updates check: %s (%2.3f%%)", needed ? "yes" : "no", pct)); WT_WITH_HANDLE_LIST_READ_LOCK(session, ret = __verbose_dump_cache_apply( session, &total_bytes, &total_dirty_bytes, &total_updates_bytes)); WT_RET(ret); /* * Apply the overhead percentage so our total bytes are comparable with the tracked value. */ total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes); cache_bytes_updates = __wt_cache_bytes_updates(cache); WT_RET(__wt_msg(session, "cache dump: total found: %.2f MB vs tracked inuse %.2f MB", (double)total_bytes / WT_MEGABYTE, (double)cache->bytes_inmem / WT_MEGABYTE)); WT_RET(__wt_msg(session, "total dirty bytes: %.2f MB vs tracked dirty %.2f MB", (double)total_dirty_bytes / WT_MEGABYTE, (double)(cache->bytes_dirty_intl + cache->bytes_dirty_leaf) / WT_MEGABYTE)); WT_RET(__wt_msg(session, "total updates bytes: %.2f MB vs tracked updates %.2f MB", (double)total_updates_bytes / WT_MEGABYTE, (double)cache_bytes_updates / WT_MEGABYTE)); return (0); }