/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/* Forward declarations for static helpers that __wt_evict calls before their definitions. */
static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, uint32_t);
static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, uint32_t);
static int __evict_reconcile(WT_SESSION_IMPL *, WT_REF *, uint32_t);
static int __evict_review(WT_SESSION_IMPL *, WT_REF *, uint32_t, bool *);

/*
 * __evict_exclusive_clear --
 *     Release exclusive access to a page by switching the reference from WT_REF_LOCKED back to the
 *     state supplied in previous_state. The reference must still have its page attached.
 */
static inline void
__evict_exclusive_clear(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state)
{
    WT_ASSERT(session, ref->state == WT_REF_LOCKED && ref->page != NULL);

    WT_REF_SET_STATE(ref, previous_state);
}

/*
 * __evict_exclusive --
 *     Confirm exclusive access to a page whose reference is already locked. Succeeds with 0 when
 *     no hazard pointer references the page; otherwise the blocked attempt is counted and the call
 *     fails with EBUSY (set through __wt_set_return).
 */
static inline int
__evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_ASSERT(session, ref->state == WT_REF_LOCKED);

    /* A hazard pointer on the page means another thread is using it: the page can't be evicted. */
    if (__wt_hazard_check(session, ref, NULL) != NULL) {
        WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_hazard);
        return (__wt_set_return(session, EBUSY));
    }

    return (0);
}

/*
 * __wt_page_release_evict --
 *     Release a reference to a page, and attempt to immediately evict it. The caller's hazard
 *     pointer on the page is always released. Returns EBUSY if the reference could not be locked
 *     for eviction, otherwise the result of clearing the hazard pointer, caching the history store
 *     cursor or evicting the page. The only flag examined here is WT_READ_NO_SPLIT, which is
 *     translated into WT_EVICT_CALL_NO_SPLIT.
 */
int
__wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    uint32_t evict_flags;
    uint8_t previous_state;
    bool locked;

    btree = S2BT(session);

    /*
     * This function always releases the hazard pointer - ensure that's done regardless of whether
     * we can get exclusive access. Take some care with order of operations: if we release the
     * hazard pointer without first locking the page, it could be evicted in between.
     */
    previous_state = ref->state;
    locked =
      previous_state == WT_REF_MEM && WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED);
    if ((ret = __wt_hazard_clear(session, ref)) != 0 || !locked) {
        if (locked)
            WT_REF_SET_STATE(ref, previous_state);
        return (ret == 0 ? EBUSY : ret);
    }

    evict_flags = LF_ISSET(WT_READ_NO_SPLIT) ? WT_EVICT_CALL_NO_SPLIT : 0;
    FLD_SET(evict_flags, WT_EVICT_CALL_URGENT);

    /*
     * There is no need to cache a history store cursor if evicting a readonly page. That includes
     * pages from a checkpoint. Note that opening a history store cursor on a checkpoint page from
     * here will explode because the identity of the matching history store checkpoint isn't
     * available.
     */
    if (ref->page != NULL && !__wt_page_evict_clean(ref->page)) {
        WT_ASSERT(session, !WT_READING_CHECKPOINT(session));
        if ((ret = __wt_curhs_cache(session)) != 0) {
            /*
             * We hold the reference locked and have already dropped our hazard pointer. Nothing
             * else will unlock the reference for us, so put it back the way we found it before
             * reporting the error.
             */
            WT_REF_SET_STATE(ref, previous_state);
            return (ret);
        }
    }

    /* Advertise an eviction in progress in this tree for the duration of the call. */
    (void)__wt_atomic_addv32(&btree->evict_busy, 1);
    ret = __wt_evict(session, ref, previous_state, evict_flags);
    (void)__wt_atomic_subv32(&btree->evict_busy, 1);

    return (ret);
}

/*
 * Outcome flags that __wt_evict collects and hands to __evict_stats_update. These are a separate
 * bit set from the WT_EVICT_CALL_* flags passed into __wt_evict and must not be mixed with them.
 *
 * CLEAN: the page satisfied __wt_page_evict_clean when it was discarded.
 * FORCE_HS: urgent eviction in a history store tree while the session had a history store cursor
 * open.
 * SUCCESS: the eviction (or the in-memory split done in its place) returned 0.
 * URGENT: the eviction was requested with WT_EVICT_CALL_URGENT.
 */
#define WT_EVICT_STATS_CLEAN 0x01
#define WT_EVICT_STATS_FORCE_HS 0x02
#define WT_EVICT_STATS_SUCCESS 0x04
#define WT_EVICT_STATS_URGENT 0x08

/*
 * __evict_stats_update --
 *     Update the eviction statistics and timing information once an eviction attempt has finished.
 *     The flags argument is a combination of the WT_EVICT_STATS_* bits describing the outcome; it
 *     is not the WT_EVICT_CALL_* flag set used by __wt_evict.
 */
static void
__evict_stats_update(WT_SESSION_IMPL *session, uint8_t flags)
{
    WT_CONNECTION_IMPL *conn;
    uint64_t eviction_time, eviction_time_milliseconds;

    conn = S2C(session);

    /*
     * Stamp the finish time and compute the elapsed microseconds. A reentrant history store
     * eviction is timed with its own start/finish pair so the outer eviction's timing is kept.
     */
    if (session->evict_timeline.reentry_hs_eviction) {
        session->evict_timeline.reentry_hs_evict_finish = __wt_clock(session);
        eviction_time = WT_CLOCKDIFF_US(session->evict_timeline.reentry_hs_evict_finish,
          session->evict_timeline.reentry_hs_evict_start);
    } else {
        session->evict_timeline.evict_finish = __wt_clock(session);
        eviction_time = WT_CLOCKDIFF_US(
          session->evict_timeline.evict_finish, session->evict_timeline.evict_start);
    }
    if (LF_ISSET(WT_EVICT_STATS_SUCCESS)) {
        if (LF_ISSET(WT_EVICT_STATS_URGENT)) {
            if (LF_ISSET(WT_EVICT_STATS_FORCE_HS))
                WT_STAT_CONN_INCR(session, cache_eviction_force_hs_success);
            if (LF_ISSET(WT_EVICT_STATS_CLEAN)) {
                WT_STAT_CONN_INCR(session, cache_eviction_force_clean);
                WT_STAT_CONN_INCRV(session, cache_eviction_force_clean_time, eviction_time);
            } else {
                WT_STAT_CONN_INCR(session, cache_eviction_force_dirty);
                WT_STAT_CONN_INCRV(session, cache_eviction_force_dirty_time, eviction_time);
            }
        }

        if (LF_ISSET(WT_EVICT_STATS_CLEAN))
            WT_STAT_CONN_DATA_INCR(session, cache_eviction_clean);
        else
            WT_STAT_CONN_DATA_INCR(session, cache_eviction_dirty);

        /* Count page evictions in parallel with checkpoint. */
        if (conn->txn_global.checkpoint_running)
            WT_STAT_CONN_INCR(session, cache_eviction_pages_in_parallel_with_checkpoint);
    } else {
        /*
         * Test the statistics flag here, not WT_EVICT_CALL_URGENT: the latter belongs to the flag
         * set passed to __wt_evict and its bit has no meaning in this function's flags.
         */
        if (LF_ISSET(WT_EVICT_STATS_URGENT)) {
            if (LF_ISSET(WT_EVICT_STATS_FORCE_HS))
                WT_STAT_CONN_INCR(session, cache_eviction_force_hs_fail);
            WT_STAT_CONN_INCR(session, cache_eviction_force_fail);
            WT_STAT_CONN_INCRV(session, cache_eviction_force_fail_time, eviction_time);
        }

        WT_STAT_CONN_DATA_INCR(session, cache_eviction_fail);
    }
    if (!session->evict_timeline.reentry_hs_eviction) {
        /* Track the longest eviction seen and warn about evictions that ran over a minute. */
        eviction_time_milliseconds = eviction_time / WT_THOUSAND;
        if (eviction_time_milliseconds > conn->cache->evict_max_ms)
            conn->cache->evict_max_ms = eviction_time_milliseconds;
        if (eviction_time_milliseconds > WT_MINUTE * WT_THOUSAND)
            __wt_verbose_warning(session, WT_VERB_EVICT,
              "Eviction took more than 1 minute (%" PRIu64 "). Building disk image took %" PRIu64
              "us. History store wrapup took %" PRIu64 "us.",
              eviction_time,
              WT_CLOCKDIFF_US(session->reconcile_timeline.image_build_finish,
                session->reconcile_timeline.image_build_start),
              WT_CLOCKDIFF_US(session->reconcile_timeline.hs_wrapup_finish,
                session->reconcile_timeline.hs_wrapup_start));
    } else {
        /*
         * We are in the reentrant history store eviction inside a data store reconciliation. Add to
         * the total time taken to do the reentrant history store eviction.
         */
        session->reconcile_timeline.total_reentry_hs_eviction_time +=
          WT_CLOCKDIFF_MS(session->evict_timeline.reentry_hs_evict_finish,
            session->evict_timeline.reentry_hs_evict_start);
        session->evict_timeline.reentry_hs_eviction = false;
    }
}

/*
 * __wt_evict --
 *     Evict a page. Unless called with WT_EVICT_CALL_CLOSING, the reference is expected to already
 *     be in the WT_REF_LOCKED state, and on failure it is returned to previous_state. The flags
 *     argument is a set of WT_EVICT_CALL_* values. An in-memory split done in place of the eviction
 *     is reported as success.
 */
int
__wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32_t flags)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_PAGE *page;
    uint8_t stats_flags;
    bool clean_page, closing, inmem_split, tree_dead;

    conn = S2C(session);
    page = ref->page;
    closing = LF_ISSET(WT_EVICT_CALL_CLOSING);
    stats_flags = 0;
    clean_page = false;

    __wt_verbose(
      session, WT_VERB_EVICT, "page %p (%s)", (void *)page, __wt_page_type_string(page->type));

    /* Pages in a dead tree are discarded without being written, so they must never be split. */
    tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD);
    if (tree_dead)
        LF_SET(WT_EVICT_CALL_NO_SPLIT);

    /*
     * As re-entry into eviction is possible, only clear the statistics on the first entry (that is,
     * when the session's eviction generation is still zero). Otherwise record the start of a
     * reentrant eviction.
     */
    if (__wt_session_gen((session), (WT_GEN_EVICT)) == 0) {
        WT_CLEAR(session->reconcile_timeline);
        WT_CLEAR(session->evict_timeline);
        session->evict_timeline.evict_start = __wt_clock(session);
    } else {
        session->evict_timeline.reentry_hs_eviction = true;
        session->evict_timeline.reentry_hs_evict_start = __wt_clock(session);
    }

    /*
     * Enter the eviction and split generations. If we re-enter eviction, leave the previous
     * eviction or split generation (which must be as low as the current generation) untouched.
     */
    WT_ENTER_GENERATION(session, WT_GEN_EVICT);
    WT_ENTER_GENERATION(session, WT_GEN_SPLIT);

    /*
     * Immediately increment the forcible eviction counter, we might do an in-memory split and not
     * an eviction, which skips the other statistics.
     */
    if (LF_ISSET(WT_EVICT_CALL_URGENT)) {
        FLD_SET(stats_flags, WT_EVICT_STATS_URGENT);
        WT_STAT_CONN_INCR(session, cache_eviction_force);

        /*
         * Track history store pages being force evicted while holding a history store cursor open.
         */
        if (session->hs_cursor_counter > 0 && WT_IS_HS(session->dhandle)) {
            FLD_SET(stats_flags, WT_EVICT_STATS_FORCE_HS);
            WT_STAT_CONN_INCR(session, cache_eviction_force_hs);
        }
    }

    /*
     * Get exclusive access to the page if our caller doesn't have the tree locked down.
     */
    if (!closing) {
        WT_ERR(__evict_exclusive(session, ref));

        /*
         * Now the page is locked, remove it from the LRU eviction queue. We have to do this before
         * freeing the page memory or otherwise touching the reference because eviction paths assume
         * a non-NULL reference on the queue is pointing at valid memory.
         */
        __wt_evict_list_clear_page(session, ref);
    }

    /*
     * Review the page for conditions that would block its eviction. If the check fails (for
     * example, we find a page with active children), quit. Make this check for clean pages, too:
     * while unlikely eviction would choose an internal page with children, it's not disallowed.
     */
    WT_ERR(__evict_review(session, ref, flags, &inmem_split));

    /*
     * If we decided to do an in-memory split, do it now. If an in-memory split completes, the page
     * stays in memory and the tree is left in the desired state: avoid the usual cleanup.
     */
    if (inmem_split) {
        WT_ERR(__wt_split_insert(session, ref));
        goto done;
    }

    /* No need to reconcile the page if it is from a dead tree or it is clean. */
    if (!tree_dead && __wt_page_is_modified(page))
        WT_ERR(__evict_reconcile(session, ref, flags));

    /*
     * Fail 0.1% of the time after we have done reconciliation. We should always evict the page of a
     * dead tree.
     */
    if (!closing && !tree_dead &&
      __wt_failpoint(session, WT_TIMING_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION, 10)) {
        ret = EBUSY;
        goto err;
    }

    /* Check we are not evicting an accessible internal page with an active split generation. */
    WT_ASSERT(session,
      closing || !F_ISSET(ref, WT_REF_FLAG_INTERNAL) ||
        F_ISSET(session->dhandle, WT_DHANDLE_DEAD | WT_DHANDLE_EXCLUSIVE) ||
        !__wt_gen_active(session, WT_GEN_SPLIT, page->pg_intl_split_gen));

    /* Count evictions of internal pages during normal operation. */
    if (!closing && F_ISSET(ref, WT_REF_FLAG_INTERNAL))
        WT_STAT_CONN_DATA_INCR(session, cache_eviction_internal);

    /*
     * Track the largest page size seen at eviction, it tells us something about our ability to
     * force pages out before they're larger than the cache. We don't care about races, it's just a
     * statistic.
     */
    if (page->memory_footprint > conn->cache->evict_max_page_size)
        conn->cache->evict_max_page_size = page->memory_footprint;

    /* Record whether the page can be evicted as clean, for the statistics and the update below. */
    if (__wt_page_evict_clean(page)) {
        clean_page = true;
        FLD_SET(stats_flags, WT_EVICT_STATS_CLEAN);
    }

    /* Update the reference and discard the page. */
    if (__wt_ref_is_root(ref))
        __wt_ref_out(session, ref);
    else if ((clean_page && !F_ISSET(conn, WT_CONN_IN_MEMORY)) || tree_dead)
        /*
         * Pages that belong to dead trees never write back to disk and can't support page splits.
         */
        WT_ERR(__evict_page_clean_update(session, ref, flags));
    else
        WT_ERR(__evict_page_dirty_update(session, ref, flags));

    /*
     * We have loaded the new disk image and updated the tree structure. We can no longer fail after
     * this point.
     */

    /* The error label lives inside a never-taken block so the success path skips the unlock. */
    if (0) {
err:
        if (!closing)
            __evict_exclusive_clear(session, ref, previous_state);
    }

done:
    /* Both the success and the failure paths fall through to here to record the outcome. */
    if (ret == 0)
        FLD_SET(stats_flags, WT_EVICT_STATS_SUCCESS);
    __evict_stats_update(session, stats_flags);

    /* Leave any local eviction generation. */
    WT_LEAVE_GENERATION(session, WT_GEN_SPLIT);
    WT_LEAVE_GENERATION(session, WT_GEN_EVICT);

    return (ret);
}

/*
 * __evict_delete_ref --
 *     Mark a page reference deleted, first giving the parent a chance to reverse split the
 *     reference away. The root reference is left alone. A busy reverse split is not an error: the
 *     reference simply ends up marked deleted.
 */
static int
__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
    WT_DECL_RET;
    WT_PAGE *home;
    WT_PAGE_INDEX *pidx;
    uint32_t deleted_count;

    if (__wt_ref_is_root(ref))
        return (0);

    /*
     * Reverse splits are skipped when the caller forbids splits and when closing the file: in the
     * latter case it is wasted work and some structures may have already been freed.
     */
    if (LF_ISSET(WT_EVICT_CALL_NO_SPLIT | WT_EVICT_CALL_CLOSING))
        goto mark_deleted;

    home = ref->home;
    WT_INTL_INDEX_GET(session, home, pidx);
    deleted_count = __wt_atomic_addv32(&pidx->deleted_entries, 1);

    /*
     * Only try a reverse split once more than 10% of the parent's references are deleted. Don't
     * bother if the parent has a single reference: the internal page is empty and we have to wait
     * for eviction to notice.
     */
    if (deleted_count <= pidx->entries / 10 || pidx->entries <= 1)
        goto mark_deleted;

    /*
     * Don't reverse split the leftmost child of a VLCS internal page. The reverse split removes
     * the page entirely, creating a namespace gap at the beginning of the internal page, and that
     * leaves search nowhere to go. The situation is handled safely if another child gets deleted,
     * or if eviction comes for a visit.
     */
    if (S2BT(session)->type == BTREE_COL_VAR && ref == pidx->index[0]) {
        WT_STAT_CONN_DATA_INCR(session, cache_reverse_splits_skipped_vlcs);
        goto mark_deleted;
    }

    /*
     * A successful reverse split consumes the deleted reference (and eventually frees it), so there
     * is nothing left to mark. If it couldn't get the access it needs because something is busy,
     * fall through so the page still ends up marked deleted; any other error is returned.
     */
    ret = __wt_split_reverse(session, ref);
    if (ret == 0) {
        WT_STAT_CONN_DATA_INCR(session, cache_reverse_splits);
        return (0);
    }
    WT_RET_BUSY_OK(ret);

    /* The child must be locked after a failed reverse split. */
    WT_ASSERT(session, ref->state == WT_REF_LOCKED);

mark_deleted:
    WT_REF_SET_STATE(ref, WT_REF_DELETED);
    return (0);
}

/*
 * __evict_page_clean_update --
 *     Discard a clean page and leave its reference in the matching state: on disk, deleted, or
 *     handed to __evict_delete_ref when the page has no disk address.
 */
static int
__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
    WT_DECL_RET;
    bool was_instantiated;

    /*
     * Instantiated deleted pages are not marked dirty by default, so one of them may be discarded
     * here. Remember that now: __wt_ref_out discards the modify structure we are reading.
     */
    was_instantiated = ref->page->modify != NULL && ref->page->modify->instantiated;
    if (!was_instantiated) {
        WT_ASSERT(session, ref->page_del == NULL);
    }

    __wt_ref_out(session, ref);

    /*
     * A page with a disk address is now on disk, unless it was deleted and instantiated and then
     * evicted unmodified, in which case it is still deleted and the state goes back to
     * WT_REF_DELETED.
     */
    if (ref->addr != NULL) {
        WT_REF_SET_STATE(ref, was_instantiated ? WT_REF_DELETED : WT_REF_DISK);
        return (0);
    }

    /*
     * A leaf page without a disk address is a deleted page that either was created empty and never
     * written out, or had its on-disk page discarded already after the deletion became globally
     * visible. It is not immediately clear if it's possible to get an internal page without a disk
     * address here, but if one appears it can be deleted. (Note that deleting an internal page
     * implicitly turns it into a leaf.)
     */
    WT_WITH_PAGE_INDEX(session, ret = __evict_delete_ref(session, ref, flags));
    WT_RET_BUSY_OK(ret);

    return (0);
}

/*
 * __evict_page_dirty_update --
 *     Update a dirty page's reference on eviction, based on the result reconciliation recorded in
 *     the page's modify structure: empty page, multiple blocks, or a single replacement block. Any
 *     other result is reported as an illegal value.
 */
static int
__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags)
{
    WT_ADDR *addr;
    WT_DECL_RET;
    WT_MULTI multi;
    WT_PAGE_MODIFY *mod;
    bool closing;
    void *tmp;

    mod = ref->page->modify;
    closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);

    WT_ASSERT(session, ref->addr == NULL);

    switch (mod->rec_result) {
    case WT_PM_REC_EMPTY:
        /*
         * Page is empty: Update the parent to reference a deleted page. Reconciliation left the
         * page "empty", so there's no older transaction in the system that might need to see an
         * earlier version of the page. There's no backing address, if we're forced to "read" into
         * that namespace, we instantiate a new page instead of trying to read from the backing
         * store.
         */
        __wt_ref_out(session, ref);
        WT_WITH_PAGE_INDEX(session, ret = __evict_delete_ref(session, ref, evict_flags));
        WT_RET_BUSY_OK(ret);
        break;
    case WT_PM_REC_MULTIBLOCK:
        /*
         * Multiple blocks: Either a split where we reconciled a page and it turned into a lot of
         * pages or an in-memory page that got too large, we forcibly evicted it, and there wasn't
         * anything to write.
         *
         * The latter is a special case of forced eviction. Imagine a thread updating a small set of
         * keys on a leaf page. The page is too large or has too many deleted items, so we try and
         * evict it, but after reconciliation there's only a small amount of live data (so it's a
         * single page we can't split), and if there's an older reader somewhere, there's data on
         * the page we can't write (so the page can't be evicted). In that case, we end up here with
         * a single block that we can't write. Take advantage of the fact we have exclusive access
         * to the page and rewrite it in memory.
         */
        if (mod->mod_multi_entries == 1) {
            WT_ASSERT(session, closing == false);
            WT_RET(__wt_split_rewrite(session, ref, &mod->mod_multi[0]));
        } else
            WT_RET(__wt_split_multi(session, ref, closing));
        break;
    case WT_PM_REC_REPLACE:
        /*
         * 1-for-1 page swap: Update the parent to reference the replacement page.
         *
         * Publish: a barrier to ensure the structure fields are set before the state change makes
         * the page available to readers.
         */
        WT_ASSERT(session, mod->mod_replace.addr != NULL);
        WT_RET(__wt_calloc_one(session, &addr));
        /* Move the replacement address into the newly allocated structure and clear the source. */
        *addr = mod->mod_replace;
        mod->mod_replace.addr = NULL;
        mod->mod_replace.size = 0;
        ref->addr = addr;

        /*
         * Eviction wants to keep this page if we have a disk image, re-instantiate the page in
         * memory, else discard the page.
         */
        if (mod->mod_disk_image == NULL) {
            __wt_page_modify_clear(session, ref->page);
            __wt_ref_out(session, ref);
            WT_REF_SET_STATE(ref, WT_REF_DISK);
        } else {
            /* The split code works with WT_MULTI structures, build one for the disk image. */
            memset(&multi, 0, sizeof(multi));
            multi.disk_image = mod->mod_disk_image;
            /*
             * Store the disk image to a temporary pointer in case we fail to rewrite the page and
             * we need to link the new disk image back to the old disk image.
             *
             * NOTE(review): on the failure path below only the disk image pointer is put back;
             * ref->addr has already been replaced and mod_replace cleared above. Confirm that is
             * sufficient for the caller's error handling.
             */
            tmp = mod->mod_disk_image;
            mod->mod_disk_image = NULL;
            ret = __wt_split_rewrite(session, ref, &multi);
            if (ret != 0) {
                mod->mod_disk_image = tmp;
                return (ret);
            }
        }

        break;
    default:
        return (__wt_illegal_value(session, mod->rec_result));
    }

    return (0);
}

/*
 * __evict_child_check --
 *     Review an internal page for active children. Returns 0 if every child reference is on disk
 *     or is a deleted reference whose deletion is visible under the rules described below;
 *     otherwise fails with EBUSY (set through __wt_set_return).
 */
static int
__evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
{
    WT_REF *child;
    bool visible;

    /*
     * There may be cursors in the tree walking the list of child pages. The parent is locked, so
     * all we care about is cursors already in the child pages, no thread can enter them. Any cursor
     * moving through the child pages must be hazard pointer coupling between pages, where the page
     * on which it currently has a hazard pointer must be in a state other than on-disk. Walk the
     * child list forward, then backward, to ensure we don't race with a cursor walking in the
     * opposite direction from our check.
     */
    WT_INTL_FOREACH_BEGIN (session, parent->page, child) {
        switch (child->state) {
        case WT_REF_DISK:    /* On-disk */
        case WT_REF_DELETED: /* On-disk, deleted */
            break;
        default:
            return (__wt_set_return(session, EBUSY));
        }
    }
    WT_INTL_FOREACH_END;
    WT_INTL_FOREACH_REVERSE_BEGIN (session, parent->page, child) {
        switch (child->state) {
        case WT_REF_DISK:    /* On-disk */
        case WT_REF_DELETED: /* On-disk, deleted */
            break;
        default:
            return (__wt_set_return(session, EBUSY));
        }
    }
    WT_INTL_FOREACH_END;

    /*
     * It is always OK to evict pages from checkpoint cursor trees if they don't have children, and
     * visibility checks for pages found to be deleted in the checkpoint aren't needed (or correct
     * when done in eviction threads).
     */
    if (WT_READING_CHECKPOINT(session))
        return (0);

    /*
     * The fast check is done and there are no cursors in the child pages. Make sure the child
     * WT_REF structures can be discarded.
     */
    WT_INTL_FOREACH_BEGIN (session, parent->page, child) {
        switch (child->state) {
        case WT_REF_DISK: /* On-disk */
            break;
        case WT_REF_DELETED: /* On-disk, deleted */
                             /*
                              * If the child page was part of a truncate, transaction rollback might
                              * switch this page into its previous state at any time, so the delete
                              * must be resolved before the parent can be evicted.
                              *
                              * We have the internal page locked, which prevents a search from
                              * descending into it. However, a walk from an adjacent leaf page could
                              * attempt to hazard couple into a child page and free the page_del
                              * structure as we are examining it. Flip the state to locked to make
                              * this check safe: if that fails, we have raced with a read and should
                              * give up on evicting the parent.
                              */
            if (!WT_REF_CAS_STATE(session, child, WT_REF_DELETED, WT_REF_LOCKED))
                return (__wt_set_return(session, EBUSY));
            /*
             * Insert a read/read barrier so we're guaranteed the page_del state we read below comes
             * after the locking operation on the ref state and therefore after the previous unlock
             * of the ref. Otherwise we might read an inconsistent view of the page deletion info,
             * and while many combinations are harmless and would just lead us to falsely refuse to
             * evict, some (e.g. reading committed as true and a stale durable timestamp from before
             * it was set by commit) are not.
             *
             * Note that while ordinarily a lock acquire should have an acquire (read/any) barrier
             * after it, because we are only reading the write part is irrelevant and a read/read
             * barrier is sufficient.
             *
             * FIXME-WT-9780: this and the CAS should be rolled into a WT_REF_TRYLOCK macro.
             */
            WT_READ_BARRIER();

            /*
             * We can evict any truncation that's committed. However, restrictions in reconciliation
             * mean that it needs to be visible to us when we get there. And unfortunately we are
             * upstream of the point where eviction threads get snapshots. Plus, application threads
             * doing eviction can see their own uncommitted truncations. So, use the following
             * logic:
             *     1. First check if the operation is committed. If not, it's not visible for these
             *        purposes.
             *     2. If we already have a snapshot, use it to check visibility.
             *     3. If we do not but we're an eviction thread, go ahead. We will get a snapshot
             *        shortly and any committed operation will be visible in it.
             *     4. Otherwise, check if the operation is globally visible.
             *
             * Even though we specifically can't evict prepared truncations, we don't need to deploy
             * the special-case logic for prepared transactions in __wt_page_del_visible; prepared
             * transactions aren't committed so they'll fail the first check.
             */
            if (!__wt_page_del_committed_set(child->page_del))
                visible = false;
            else if (F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT))
                visible = __wt_page_del_visible(session, child->page_del, false);
            else if (F_ISSET(session, WT_SESSION_EVICTION))
                visible = true;
            else
                visible = __wt_page_del_visible_all(session, child->page_del, false);
            /*
             * Unlock the child before acting on the result so it is never left locked.
             *
             * FIXME-WT-9780: is there a reason this doesn't use WT_REF_UNLOCK?
             */
            child->state = WT_REF_DELETED;
            if (!visible)
                return (__wt_set_return(session, EBUSY));
            break;
        default:
            return (__wt_set_return(session, EBUSY));
        }
    }
    WT_INTL_FOREACH_END;

    return (0);
}

/*
 * __evict_review --
 *     Review the page and its subtree for conditions that would block its eviction. Returns 0 if
 *     eviction may proceed and a non-zero error (EBUSY for the blocking conditions checked here)
 *     otherwise. On a zero return *inmem_splitp tells the caller whether an in-memory split should
 *     be done instead of an eviction; it is always initialized to false on entry.
 */
static int
__evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool *inmem_splitp)
{
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_PAGE *page;
    bool closing, modified;

    *inmem_splitp = false;

    btree = S2BT(session);
    conn = S2C(session);
    page = ref->page;
    closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);

    /*
     * Fail if an internal page has active children, the children must be evicted first. The test
     * is necessary but shouldn't fire much: the eviction code is biased for leaf pages, an internal
     * page shouldn't be selected for eviction until all children have been evicted.
     */
    if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) {
        WT_WITH_PAGE_INDEX(session, ret = __evict_child_check(session, ref));
        if (ret != 0)
            WT_STAT_CONN_INCR(session, cache_eviction_fail_active_children_on_an_internal_page);
        WT_RET(ret);
    }

    /* It is always OK to evict pages from dead trees if they don't have children. */
    if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
        return (0);

    /*
     * Retrieve the modified state of the page. This must happen after the check for evictable
     * internal pages otherwise there is a race where a page could be marked modified due to a child
     * being transitioned to WT_REF_DISK after the modified check and before we visited the ref
     * while walking the parent index.
     */
    modified = __wt_page_is_modified(page);

    /*
     * Clean pages can't be evicted when running in memory only. This should be uncommon - we don't
     * add clean pages to the queue.
     */
    if (F_ISSET(conn, WT_CONN_IN_MEMORY) && !modified && !closing)
        return (__wt_set_return(session, EBUSY));

    /* Check if the page can be evicted. */
    if (!closing) {
        /*
         * Update the oldest ID to avoid wasted effort should it have fallen behind current.
         */
        if (modified)
            WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT));

        if (!__wt_page_can_evict(session, ref, inmem_splitp))
            return (__wt_set_return(session, EBUSY));

        /* Check for an append-only workload needing an in-memory split. */
        if (*inmem_splitp)
            return (0);
    }

    /* If the page is clean, we're done and we can evict. */
    if (!modified)
        return (0);

    /*
     * If we are trying to evict a dirty page that does not belong to history store(HS) and
     * checkpoint is processing the HS file, avoid evicting the dirty non-HS page for now if the
     * cache is already dominated by dirty HS content.
     *
     * Evicting a non-HS dirty page can generate even more HS content. As we cannot evict HS pages
     * while checkpoint is operating on the HS file, we can end up in a situation where we exceed
     * the cache size limit.
     */
    if (conn->txn_global.checkpoint_running_hs && !WT_IS_HS(btree->dhandle) &&
      __wt_cache_hs_dirty(session) && __wt_cache_full(session)) {
        WT_STAT_CONN_INCR(session, cache_eviction_blocked_checkpoint_hs);
        return (__wt_set_return(session, EBUSY));
    }
    /*
     * If reconciliation is disabled for this thread (e.g., during an eviction that writes to the
     * history store), give up.
     */
    if (F_ISSET(session, WT_SESSION_NO_RECONCILE))
        return (__wt_set_return(session, EBUSY));

    return (0);
}

/*
 * __evict_reconcile --
 *     Reconcile the page for eviction. Builds the set of WT_REC_* flags from the kind of page,
 *     tree, caller and the WT_EVICT_CALL_* flags in evict_flags, arranges the snapshot to use, and
 *     calls __wt_reconcile. Returns the reconciliation result.
 */
static int
__evict_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    uint32_t flags;
    bool closing, is_eviction_thread, use_snapshot_for_app_thread;

    btree = S2BT(session);
    conn = S2C(session);
    /* The local flags variable holds the reconciliation flags; the LF_* macros operate on it. */
    flags = WT_REC_EVICT;
    closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);

    cache = conn->cache;

    /*
     * Urgent eviction and forced eviction want two different behaviors for inefficient update
     * restore evictions, pass this flag so that reconciliation knows which to use.
     */
    if (FLD_ISSET(evict_flags, WT_EVICT_CALL_URGENT))
        LF_SET(WT_REC_CALL_URGENT);

    /*
     * If we have an exclusive lock (we're discarding the tree), assert there are no updates we
     * cannot read.
     */
    if (closing)
        LF_SET(WT_REC_VISIBILITY_ERR);
    /*
     * Don't set any other flags for internal pages: there are no update lists to be saved and
     * restored, changes can't be written into the history store table, nor can we re-create
     * internal pages in memory.
     *
     * Don't set any other flags for history store table as all the content is evictable.
     */
    else if (F_ISSET(ref, WT_REF_FLAG_INTERNAL) || WT_IS_HS(btree->dhandle))
        ;
    /* Always do update restore for in-memory database. */
    else if (F_ISSET(conn, WT_CONN_IN_MEMORY))
        LF_SET(WT_REC_IN_MEMORY | WT_REC_SCRUB);
    /* For data store leaf pages, write the history to history store except for metadata. */
    else if (!WT_IS_METADATA(btree->dhandle)) {
        LF_SET(WT_REC_HS);

        /*
         * Scrub if we're supposed to, or toss it in sometimes if we are in debugging mode.
         *
         * Note that we don't scrub if checkpoint is running on the tree.
         */
        if (!WT_SESSION_BTREE_SYNC(session) &&
          (F_ISSET(cache, WT_CACHE_EVICT_SCRUB) ||
            (FLD_ISSET(conn->debug_flags, WT_CONN_DEBUG_EVICT_AGGRESSIVE_MODE) &&
              __wt_random(&session->rnd) % 3 == 0)))
            LF_SET(WT_REC_SCRUB);
    }

    /*
     * Acquire a snapshot if coming through the eviction thread route. Also, if we have entered
     * eviction through application threads and we have a transaction snapshot, we will use our
     * existing snapshot to evict pages that are not globally visible based on the last_running
     * transaction. Avoid using snapshots when application transactions are in the final stages of
     * commit or rollback as they have already released the snapshot. Otherwise, it becomes harder
     * in the later part of the code to detect updates that belonged to the last running application
     * transaction.
     */
    use_snapshot_for_app_thread = !F_ISSET(session, WT_SESSION_INTERNAL) &&
      !WT_IS_METADATA(session->dhandle) && WT_SESSION_TXN_SHARED(session)->id != WT_TXN_NONE &&
      F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT);
    is_eviction_thread = F_ISSET(session, WT_SESSION_EVICTION);

    /* Make sure that both conditions above are not true at the same time. */
    WT_ASSERT(session, !use_snapshot_for_app_thread || !is_eviction_thread);

    /*
     * If checkpoint is running concurrently, set the checkpoint running flag and we will abort the
     * eviction if we detect any updates without timestamps.
     */
    if (conn->txn_global.checkpoint_running)
        LF_SET(WT_REC_CHECKPOINT_RUNNING);

    /* Eviction thread doing eviction. */
    if (is_eviction_thread)
        /*
         * Eviction threads do not need to pin anything in the cache. We have an exclusive lock for
         * the page being evicted so we are sure that the page will always be there while it is
         * being processed. Therefore, we use snapshot API that doesn't publish shared IDs to the
         * outside world.
         */
        __wt_txn_bump_snapshot(session);
    else if (use_snapshot_for_app_thread)
        LF_SET(WT_REC_APP_EVICTION_SNAPSHOT);
    else if (!WT_SESSION_BTREE_SYNC(session))
        LF_SET(WT_REC_VISIBLE_ALL);

    WT_ASSERT(session, LF_ISSET(WT_REC_VISIBLE_ALL) || F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT));

    /* We should not be trying to evict using a checkpoint-cursor transaction. */
    WT_ASSERT(session, !F_ISSET(session->txn, WT_TXN_IS_CHECKPOINT));

    /*
     * Reconcile the page. Force read-committed isolation level if we are using snapshots for
     * eviction workers or application threads.
     */
    if (is_eviction_thread || use_snapshot_for_app_thread)
        WT_WITH_TXN_ISOLATION(
          session, WT_ISO_READ_COMMITTED, ret = __wt_reconcile(session, ref, NULL, flags));
    else
        ret = __wt_reconcile(session, ref, NULL, flags);

    if (ret != 0)
        WT_STAT_CONN_INCR(session, cache_eviction_fail_in_reconciliation);

    /* Release the snapshot taken above before returning, whether or not reconciliation failed. */
    if (is_eviction_thread)
        __wt_txn_release_snapshot(session);

    WT_RET(ret);

    /*
     * Success: assert that the page is clean or reconciliation was configured to save updates.
     */
    WT_ASSERT(session,
      !__wt_page_is_modified(ref->page) || LF_ISSET(WT_REC_HS | WT_REC_IN_MEMORY) ||
        WT_IS_METADATA(btree->dhandle));

    return (0);
}