1 files changed, 95 insertions, 123 deletions
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index c224a3b7b11..2f9f3220106 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -277,10 +277,12 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
     conn = S2C(session);
     cache = conn->cache;
 
-    /*
-     * The thread group code calls us repeatedly. So each call is one pass through eviction.
-     */
-    WT_TRACK_TIME(session);
+/*
+ * The thread group code calls us repeatedly. So each call is one pass through eviction.
+ */
+#ifdef HAVE_DIAGNOSTIC
+    __wt_seconds32(session, &session->op_5043_seconds);
+#endif
     if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) {
         /*
          * Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We
@@ -426,15 +428,14 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
         return (0);
 #endif
     /*
-     * If we're stuck for 5 minutes in diagnostic mode, or the verbose
-     * evict_stuck flag is configured, log the cache and transaction state.
+     * If we're stuck for 5 minutes in diagnostic mode, or the verbose evict_stuck flag is
+     * configured, log the cache and transaction state.
      *
      * If we're stuck for 5 minutes in diagnostic mode, give up.
      *
-     * We don't do this check for in-memory workloads because application
-     * threads are not blocked by the cache being full. If the cache becomes
-     * full of clean pages, we can be servicing reads while the cache
-     * appears stuck to eviction.
+     * We don't do this check for in-memory workloads because application threads are not blocked by
+     * the cache being full. If the cache becomes full of clean pages, we can be servicing reads
+     * while the cache appears stuck to eviction.
      */
     if (F_ISSET(conn, WT_CONN_IN_MEMORY))
         return (0);
@@ -578,8 +579,7 @@ __evict_update_work(WT_SESSION_IMPL *session)
     /*
      * If we need space in the cache, try to find clean pages to evict.
      *
-     * Avoid division by zero if the cache size has not yet been set in a
-     * shared cache.
+     * Avoid division by zero if the cache size has not yet been set in a shared cache.
      */
     bytes_max = conn->cache_size + 1;
     bytes_inuse = __wt_cache_bytes_inuse(cache);
@@ -679,14 +679,12 @@ __evict_pass(WT_SESSION_IMPL *session)
         ++cache->evict_pass_gen;
 
         /*
-         * Update the oldest ID: we use it to decide whether pages are
-         * candidates for eviction.  Without this, if all threads are
-         * blocked after a long-running transaction (such as a
+         * Update the oldest ID: we use it to decide whether pages are candidates for eviction.
+         * Without this, if all threads are blocked after a long-running transaction (such as a
          * checkpoint) completes, we may never start evicting again.
          *
-         * Do this every time the eviction server wakes up, regardless
-         * of whether the cache is full, to prevent the oldest ID
-         * falling too far behind.  Don't wait to lock the table: with
+         * Do this every time the eviction server wakes up, regardless of whether the cache is full,
+         * to prevent the oldest ID falling too far behind. Don't wait to lock the table: with
          * highly threaded workloads, that creates a bottleneck.
          */
         WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT));
@@ -702,14 +700,12 @@ __evict_pass(WT_SESSION_IMPL *session)
             WT_RET(__evict_lru_walk(session));
 
         /*
-         * If the queue has been empty recently, keep queuing more
-         * pages to evict.  If the rate of queuing pages is high
-         * enough, this score will go to zero, in which case the
-         * eviction server might as well help out with eviction.
+         * If the queue has been empty recently, keep queuing more pages to evict. If the rate of
+         * queuing pages is high enough, this score will go to zero, in which case the eviction
+         * server might as well help out with eviction.
          *
-         * Also, if there is a single eviction server thread with no
-         * workers, it must service the urgent queue in case all
-         * application threads are busy.
+         * Also, if there is a single eviction server thread with no workers, it must service the
+         * urgent queue in case all application threads are busy.
          */
         if (!WT_EVICT_HAS_WORKERS(session) &&
           (cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF ||
@@ -720,16 +716,13 @@ __evict_pass(WT_SESSION_IMPL *session)
             break;
 
         /*
-         * If we're making progress, keep going; if we're not making
-         * any progress at all, mark the cache "stuck" and go back to
-         * sleep, it's not something we can fix.
+         * If we're making progress, keep going; if we're not making any progress at all, mark the
+         * cache "stuck" and go back to sleep, it's not something we can fix.
          *
-         * We check for progress every 20ms, the idea being that the
-         * aggressive score will reach 10 after 200ms if we aren't
-         * making progress and eviction will start considering more
-         * pages.  If there is still no progress after 2s, we will
-         * treat the cache as stuck and start rolling back
-         * transactions and writing updates to the lookaside table.
+         * We check for progress every 20ms, the idea being that the aggressive score will reach 10
+         * after 200ms if we aren't making progress and eviction will start considering more pages.
+         * If there is still no progress after 2s, we will treat the cache as stuck and start
+         * rolling back transactions and writing updates to the lookaside table.
          */
         if (eviction_progress == cache->eviction_progress) {
             if (WT_CLOCKDIFF_MS(time_now, time_prev) >= 20 &&
@@ -750,14 +743,11 @@ __evict_pass(WT_SESSION_IMPL *session)
              */
             if (loop < 100 || cache->evict_aggressive_score < 100) {
                 /*
-                 * Back off if we aren't making progress: walks
-                 * hold the handle list lock, blocking other
-                 * operations that can free space in cache,
-                 * such as LSM discarding handles.
+                 * Back off if we aren't making progress: walks hold the handle list lock, blocking
+                 * other operations that can free space in cache, such as LSM discarding handles.
                  *
-                 * Allow this wait to be interrupted (e.g. if a
-                 * checkpoint completes): make sure we wait for
-                 * a non-zero number of microseconds).
+                 * Allow this wait to be interrupted (e.g. if a checkpoint completes): make sure we
+                 * wait for a non-zero number of microseconds.
                  */
                 WT_STAT_CONN_INCR(session, cache_eviction_server_slept);
                 __wt_cond_wait(session, cache->evict_cond, WT_THOUSAND, NULL);
@@ -1181,8 +1171,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
     /*
      * Get some more pages to consider for eviction.
      *
-     * If the walk is interrupted, we still need to sort the queue: the
-     * next walk assumes there are no entries beyond WT_EVICT_WALK_BASE.
+     * If the walk is interrupted, we still need to sort the queue: the next walk assumes there are
+     * no entries beyond WT_EVICT_WALK_BASE.
      */
     if ((ret = __evict_walk(cache->walk_session, queue)) == EBUSY)
         ret = 0;
@@ -1264,15 +1254,12 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
             queue->evict_candidates = candidates;
         else {
             /*
-             * Take all of the urgent pages plus a third of
-             * ordinary candidates (which could be expressed as
-             * WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE).  In the
-             * steady state, we want to get as many candidates as
-             * the eviction walk adds to the queue.
+             * Take all of the urgent pages plus a third of ordinary candidates (which could be
+             * expressed as WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the steady state, we want
+             * to get as many candidates as the eviction walk adds to the queue.
              *
-             * That said, if there is only one entry, which is
-             * normal when populating an empty file, don't exclude
-             * it.
+             * That said, if there is only one entry, which is normal when populating an empty file,
+             * don't exclude it.
              */
             queue->evict_candidates = 1 + candidates + ((entries - candidates) - 1) / 3;
             cache->read_gen_oldest = read_gen_oldest;
@@ -1468,11 +1455,9 @@ retry:
         /*
          * Skip files if we have too many active walks.
          *
-         * This used to be limited by the configured maximum number of
-         * hazard pointers per session.  Even though that ceiling has
-         * been removed, we need to test eviction with huge numbers of
-         * active trees before allowing larger numbers of hazard
-         * pointers in the walk session.
+         * This used to be limited by the configured maximum number of hazard pointers per session.
+         * Even though that ceiling has been removed, we need to test eviction with huge numbers of
+         * active trees before allowing larger numbers of hazard pointers in the walk session.
          */
         if (btree->evict_ref == NULL && session->nhazard > WT_EVICT_MAX_TREES)
             continue;
@@ -1490,16 +1475,14 @@ retry:
         dhandle_locked = false;
 
         /*
-         * Re-check the "no eviction" flag, used to enforce exclusive
-         * access when a handle is being closed.
+         * Re-check the "no eviction" flag, used to enforce exclusive access when a handle is being
+         * closed.
          *
-         * Only try to acquire the lock and simply continue if we fail;
-         * the lock is held while the thread turning off eviction clears
-         * the tree's current eviction point, and part of the process is
-         * waiting on this thread to acknowledge that action.
+         * Only try to acquire the lock and simply continue if we fail; the lock is held while the
+         * thread turning off eviction clears the tree's current eviction point, and part of the
+         * process is waiting on this thread to acknowledge that action.
          *
-         * If a handle is being discarded, it will still be marked open,
-         * but won't have a root page.
+         * If a handle is being discarded, it will still be marked open, but won't have a root page.
          */
         if (btree->evict_disabled == 0 && !__wt_spin_trylock(session, &cache->evict_walk_lock)) {
             if (btree->evict_disabled == 0 && btree->root.page != NULL) {
@@ -1888,9 +1871,8 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_ent
         /*
          * Pages that are empty or from dead trees are fast-tracked.
          *
-         * Also evict lookaside table pages without further filtering:
-         * the cache is under pressure by definition and we want to
-         * free space.
+         * Also evict lookaside table pages without further filtering: the cache is under pressure
+         * by definition and we want to free space.
          */
         if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
           F_ISSET(btree, WT_BTREE_LOOKASIDE))
@@ -1920,15 +1902,12 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_ent
             continue;
 
         /*
-         * Don't attempt eviction of internal pages with children in
-         * cache (indicated by seeing an internal page that is the
-         * parent of the last page we saw).
+         * Don't attempt eviction of internal pages with children in cache (indicated by seeing an
+         * internal page that is the parent of the last page we saw).
          *
-         * Also skip internal page unless we get aggressive, the tree
-         * is idle (indicated by the tree being skipped for walks),
-         * or we are in eviction debug mode.
-         * The goal here is that if trees become completely idle, we
-         * eventually push them out of cache completely.
+         * Also skip internal pages unless we get aggressive, the tree is idle (indicated by the
+         * tree being skipped for walks), or we are in eviction debug mode. The goal here is that if
+         * trees become completely idle, we eventually push them out of cache completely.
          */
         if (!F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) && WT_PAGE_IS_INTERNAL(page)) {
             if (page == last_parent)
@@ -1987,18 +1966,15 @@ fast:
     /*
      * Give up the walk occasionally.
      *
-     * If we happen to end up on the root page or a page requiring urgent
-     * eviction, clear it.  We have to track hazard pointers, and the root
-     * page complicates that calculation.
+     * If we happen to end up on the root page or a page requiring urgent eviction, clear it. We
+     * have to track hazard pointers, and the root page complicates that calculation.
      *
-     * Likewise if we found no new candidates during the walk: there is no
-     * point keeping a page pinned, since it may be the only candidate in
-     * an idle tree.
+     * Likewise if we found no new candidates during the walk: there is no point keeping a page
+     * pinned, since it may be the only candidate in an idle tree.
      *
-     * If we land on a page requiring forced eviction, or that isn't an
-     * ordinary in-memory page (e.g., WT_REF_LIMBO), move until we find an
-     * ordinary page: we should not prevent exclusive access to the page
-     * until the next walk.
+     * If we land on a page requiring forced eviction, or that isn't an ordinary in-memory page
+     * (e.g., WT_REF_LIMBO), move until we find an ordinary page: we should not prevent exclusive
+     * access to the page until the next walk.
      */
     if (ref != NULL) {
         if (__wt_ref_is_root(ref) || evict == start || give_up ||
@@ -2064,13 +2040,12 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_
     }
 
     /*
-     * The server repopulates whenever the other queue is not full, as long
-     * as at least one page has been evicted out of the current queue.
+     * The server repopulates whenever the other queue is not full, as long as at least one page has
+     * been evicted out of the current queue.
      *
-     * Note that there are pathological cases where there are only enough
-     * eviction candidates in the cache to fill one queue.  In that case,
-     * we will continually evict one page and attempt to refill the queues.
-     * Such cases are extremely rare in real applications.
+     * Note that there are pathological cases where there are only enough eviction candidates in the
+     * cache to fill one queue. In that case, we will continually evict one page and attempt to
+     * refill the queues. Such cases are extremely rare in real applications.
      */
     if (is_server && (!urgent_ok || __evict_queue_empty(urgent_queue, false)) &&
       !__evict_queue_full(cache->evict_current_queue) &&
@@ -2088,9 +2063,8 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_
         /*
          * Check if the current queue needs to change.
          *
-         * The server will only evict half of the pages before looking
-         * for more, but should only switch queues if there are no
-         * other eviction workers.
+         * The server will only evict half of the pages before looking for more, but should only
+         * switch queues if there are no other eviction workers.
          */
         queue = cache->evict_current_queue;
         other_queue = cache->evict_other_queue;
@@ -2136,14 +2110,13 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_
         WT_ASSERT(session, evict->btree != NULL);
 
         /*
-         * Evicting a dirty page in the server thread could stall
-         * during a write and prevent eviction from finding new work.
+         * Evicting a dirty page in the server thread could stall during a write and prevent
+         * eviction from finding new work.
          *
-         * However, we can't skip entries in the urgent queue or they
-         * may never be found again.
+         * However, we can't skip entries in the urgent queue or they may never be found again.
          *
-         * Don't force application threads to evict dirty pages if they
-         * aren't stalled by the amount of dirty data in cache.
+         * Don't force application threads to evict dirty pages if they aren't stalled by the amount
+         * of dirty data in cache.
          */
         if (!urgent_ok && (is_server || !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) &&
           __wt_page_is_modified(evict->ref->page)) {
@@ -2233,13 +2206,11 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
     }
 
     /*
-     * In case something goes wrong, don't pick the same set of pages every
-     * time.
+     * In case something goes wrong, don't pick the same set of pages every time.
      *
-     * We used to bump the page's read generation only if eviction failed,
-     * but that isn't safe: at that point, eviction has already unlocked
-     * the page and some other thread may have evicted it by the time we
-     * look at it.
+     * We used to bump the page's read generation only if eviction failed, but that isn't safe: at
+     * that point, eviction has already unlocked the page and some other thread may have evicted it
+     * by the time we look at it.
      */
     __wt_cache_read_gen_bump(session, ref->page);
 
@@ -2295,31 +2266,32 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
     if (timer)
         time_start = __wt_clock(session);
 
-    WT_TRACK_TIME(session);
+#ifdef HAVE_DIAGNOSTIC
+    __wt_seconds32(session, &session->op_5043_seconds);
+#endif
     for (initial_progress = cache->eviction_progress;; ret = 0) {
         /*
-         * A pathological case: if we're the oldest transaction in the
-         * system and the eviction server is stuck trying to find space
-         * (and we're not in recovery, because those transactions can't
-         * be rolled back), abort the transaction to give up all hazard
-         * pointers before trying again.
+         * If eviction is stuck, check if this thread is likely causing problems and should be
+         * rolled back. Ignore if in recovery, those transactions can't be rolled back.
          */
-        if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session) &&
-          !F_ISSET(conn, WT_CONN_RECOVERING)) {
-            --cache->evict_aggressive_score;
-            WT_STAT_CONN_INCR(session, txn_fail_cache);
-            WT_ERR(
-              __wt_txn_rollback_required(session, "oldest transaction rolled back for eviction"));
+        if (!F_ISSET(conn, WT_CONN_RECOVERING) && __wt_cache_stuck(session)) {
+            ret = __wt_txn_is_blocking_old(session);
+            if (ret == 0)
+                ret = __wt_txn_is_blocking_pin(session);
+            if (ret == WT_ROLLBACK) {
+                --cache->evict_aggressive_score;
+                WT_STAT_CONN_INCR(session, txn_fail_cache);
+            }
+            WT_ERR(ret);
         }
 
         /*
          * Check if we have become busy.
          *
-         * If we're busy (because of the transaction check we just did
-         * or because our caller is waiting on a longer-than-usual event
-         * such as a page read), and the cache level drops below 100%,
-         * limit the work to 5 evictions and return. If that's not the
-         * case, we can do more.
+         * If we're busy (because of the transaction check we just did or because our caller is
+         * waiting on a longer-than-usual event such as a page read), and the cache level drops
+         * below 100%, limit the work to 5 evictions and return. If that's not the case, we can do
+         * more.
          */
         if (!busy && txn_state->pinned_id != WT_TXN_NONE &&
           txn_global->current != txn_global->oldest_id)