/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *  All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static void __checkpoint_timing_stress(WT_SESSION_IMPL *, uint64_t, struct timespec *);
static int __checkpoint_lock_dirty_tree(WT_SESSION_IMPL *, bool, bool, bool, const char *[]);
static int __checkpoint_mark_skip(WT_SESSION_IMPL *, WT_CKPT *, bool);
static int __checkpoint_presync(WT_SESSION_IMPL *, const char *[]);
static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]);
static int __drop_list_execute(WT_SESSION_IMPL *session, WT_ITEM *drop_list);

/*
 * __checkpoint_name_ok --
 *     Complain if the checkpoint name isn't acceptable.
 */
static int
__checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len, bool allow_all)
{
    /* Check for characters we don't want to see in a metadata file. */
    WT_RET(__wt_name_check(session, name, len, true));

    /*
     * The internal checkpoint name is special, applications aren't allowed to use it. Be
     * aggressive and disallow any matching prefix, it makes things easier when checking in other
     * places.
     */
    if (len >= strlen(WT_CHECKPOINT) && WT_PREFIX_MATCH(name, WT_CHECKPOINT))
        WT_RET_MSG(session, EINVAL, "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT);

    /* The name "all" is also special. */
    if (!allow_all && WT_STRING_MATCH("all", name, len))
        WT_RET_MSG(session, EINVAL, "the checkpoint name \"all\" is reserved");

    return (0);
}

/*
 * __checkpoint_name_check --
 *     Check for an attempt to name a checkpoint that includes anything other than a file object.
 */
static int
__checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;
    const char *fail;

    cursor = NULL;
    fail = NULL;

    /*
     * This function exists as a place for this comment: named checkpoints are only supported on
     * file objects, and not on LSM trees. If a target list is configured for the checkpoint, this
     * function is called with each target list entry; check the entry to make sure it's backed by
     * a file. If no target list is configured, confirm the metadata file contains no non-file
     * objects. Skip any internal system objects. We don't want spurious error messages, other
     * code will skip over them and the user has no control over their existence.
     */
    if (uri == NULL) {
        WT_RET(__wt_metadata_cursor(session, &cursor));
        while ((ret = cursor->next(cursor)) == 0) {
            WT_ERR(cursor->get_key(cursor, &uri));
            if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") &&
              !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, WT_SYSTEM_PREFIX) &&
              !WT_PREFIX_MATCH(uri, "table:") && !WT_PREFIX_MATCH(uri, "tiered:")) {
                fail = uri;
                break;
            }
        }
        WT_ERR_NOTFOUND_OK(ret, false);
    } else if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") &&
      !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "table:") &&
      !WT_PREFIX_MATCH(uri, "tiered:"))
        fail = uri;

    if (fail != NULL)
        WT_ERR_MSG(session, EINVAL, "%s object does not support named checkpoints", fail);

err:
    WT_TRET(__wt_metadata_cursor_release(session, &cursor));
    return (ret);
}
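/*
 * Illustrative examples (not part of the original source): assuming a WT_SESSION *wt_session,
 * the name checks above accept or reject application checkpoint calls as follows:
 *
 *     wt_session->checkpoint(wt_session, "name=WiredTigerCheckpoint"); -- EINVAL, reserved prefix
 *     wt_session->checkpoint(wt_session, "name=all");                  -- EINVAL, reserved name
 *     wt_session->checkpoint(wt_session, "name=midnight");             -- accepted
 */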
/*
 * __checkpoint_update_generation --
 *     Update the checkpoint generation of the current tree. This indicates that the tree will not
 *     be visited again by the current checkpoint.
 */
static void
__checkpoint_update_generation(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;

    btree = S2BT(session);

    /*
     * Updates to the metadata are made by the checkpoint transaction, so the metadata tree's
     * checkpoint generation should never be updated.
     */
    if (WT_IS_METADATA(session->dhandle))
        return;

    WT_PUBLISH(btree->checkpoint_gen, __wt_gen(session, WT_GEN_CHECKPOINT));
    WT_STAT_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen);
}

/*
 * __checkpoint_apply_operation --
 *     Apply a preliminary operation to all files involved in a checkpoint.
 */
static int
__checkpoint_apply_operation(
  WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[]))
{
    WT_CONFIG targetconf;
    WT_CONFIG_ITEM cval, k, v;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    bool ckpt_closed, named, target_list;

    target_list = false;

    /* Flag if this is a named checkpoint, and check if the name is OK. */
    WT_RET(__wt_config_gets(session, cfg, "name", &cval));
    named = cval.len != 0;
    if (named)
        WT_RET(__checkpoint_name_ok(session, cval.str, cval.len, false));

    /* Step through the targets and optionally operate on each one. */
    WT_ERR(__wt_config_gets(session, cfg, "target", &cval));
    __wt_config_subinit(session, &targetconf, &cval);
    while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
        if (!target_list) {
            WT_ERR(__wt_scr_alloc(session, 512, &tmp));
            target_list = true;
        }

        if (v.len != 0)
            WT_ERR_MSG(session, EINVAL, "invalid checkpoint target %.*s: URIs may require quoting",
              (int)cval.len, (char *)cval.str);

        /* Some objects don't support named checkpoints. */
        if (named)
            WT_ERR(__checkpoint_name_check(session, k.str));

        if (op == NULL)
            continue;
        WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
        if ((ret = __wt_schema_worker(session, tmp->data, op, NULL, cfg, 0)) != 0)
            WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
    }
    WT_ERR_NOTFOUND_OK(ret, false);

    if (!target_list && named)
        /* Some objects don't support named checkpoints. */
        WT_ERR(__checkpoint_name_check(session, NULL));

    if (!target_list && op != NULL) {
        /*
         * If the checkpoint is named or we're dropping checkpoints, we checkpoint both open and
         * closed files; else, only checkpoint open files.
         *
         * XXX We don't optimize unnamed checkpoints of a list of targets, we open the targets and
         * checkpoint them even if they are quiescent and don't need a checkpoint, believing
         * applications unlikely to checkpoint a list of closed targets.
         */
        ckpt_closed = named;
        if (!ckpt_closed) {
            WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
            ckpt_closed = cval.len != 0;
        }
        WT_ERR(ckpt_closed ? __wt_meta_apply_all(session, op, NULL, cfg) :
                             __wt_conn_btree_apply(session, NULL, op, NULL, cfg));
    }

err:
    __wt_scr_free(session, &tmp);
    return (ret);
}

/*
 * __checkpoint_apply_to_dhandles --
 *     Apply an operation to all handles locked for a checkpoint.
 */
static int
__checkpoint_apply_to_dhandles(
  WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[]))
{
    WT_DECL_RET;
    u_int i;

    /* If we have already locked the handles, apply the operation. */
    for (i = 0; i < session->ckpt_handle_next; ++i) {
        if (session->ckpt_handle[i] == NULL)
            continue;
        WT_WITH_DHANDLE(session, session->ckpt_handle[i], ret = (*op)(session, cfg));
        WT_RET(ret);
    }

    return (0);
}
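/*
 * Illustrative example (not part of the original source): a checkpoint restricted to a target
 * list, which the target loop in __checkpoint_apply_operation above steps through one entry at a
 * time:
 *
 *     wt_session->checkpoint(wt_session, "target=(\"table:customers\",\"table:orders\")");
 *
 * If the checkpoint is also named, each target entry is vetted by __checkpoint_name_check.
 */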
/*
 * __checkpoint_data_source --
 *     Checkpoint all data sources.
 */
static int
__checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_DATA_SOURCE *dsrc;
    WT_NAMED_DATA_SOURCE *ndsrc;

    /*
     * A place-holder, to support data sources: we assume calling the underlying data-source
     * session checkpoint function is sufficient to checkpoint all objects in the data source,
     * open or closed, and we don't attempt to optimize the checkpoint of individual targets.
     * Those assumptions are not necessarily going to be true for all data sources.
     *
     * It's not difficult to support data-source checkpoints of individual targets
     * (__wt_schema_worker is the underlying function that will do the work, and it's already
     * written to support data sources, although we'd probably need to pass the URI of the object
     * to the data-source checkpoint function, which we don't currently do). However, doing a full
     * data checkpoint is trickier: currently, the connection code is written to ignore all
     * objects other than "file:", and that code will require significant changes to work with
     * data sources.
     */
    TAILQ_FOREACH (ndsrc, &S2C(session)->dsrcqh, q) {
        dsrc = ndsrc->dsrc;
        if (dsrc->checkpoint != NULL)
            WT_RET(dsrc->checkpoint(dsrc, (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg));
    }
    return (0);
}
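/*
 * Illustrative sketch (not part of the original source): a custom data source participates in
 * checkpoints by supplying the checkpoint callback invoked above. The MY_DATA_SOURCE type and
 * the my_dsrc_flush_all helper are hypothetical.
 *
 *     static int
 *     my_dsrc_checkpoint(WT_DATA_SOURCE *dsrc, WT_SESSION *session, WT_CONFIG_ARG *config)
 *     {
 *         (void)session;
 *         (void)config;
 *
 *         return (my_dsrc_flush_all((MY_DATA_SOURCE *)dsrc));
 *     }
 *
 * The callback is registered, along with the other WT_DATA_SOURCE methods, through
 * WT_CONNECTION::add_data_source.
 */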
/*
 * __wt_checkpoint_get_handles --
 *     Get a list of handles to flush.
 */
int
__wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BTREE *btree;
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    const char *name;
    bool force;

    /* Find out if we have to force a checkpoint. */
    WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
    force = cval.val != 0;
    if (!force) {
        WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval));
        force = cval.len != 0;
    }

    /* Should not be called with anything other than a live btree handle. */
    WT_ASSERT(session, WT_DHANDLE_BTREE(session->dhandle) && !WT_READING_CHECKPOINT(session));

    btree = S2BT(session);

    /*
     * Skip files that are never involved in a checkpoint. Skip the history store file as it is
     * checkpointed manually later.
     */
    if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) || WT_IS_HS(btree->dhandle))
        return (0);

    /*
     * We may have raced between starting the checkpoint transaction and some operation completing
     * on the handle that updated the metadata (e.g., closing a bulk load cursor). All such
     * operations either have exclusive access to the handle or hold the schema lock. We are now
     * holding the schema lock and have an open btree handle, so if we can't update the metadata,
     * then there has been some state change invisible to the checkpoint transaction.
     */
    if (!WT_IS_METADATA(session->dhandle)) {
        WT_CURSOR *meta_cursor;

        WT_ASSERT(session, !F_ISSET(session->txn, WT_TXN_ERROR));
        WT_RET(__wt_metadata_cursor(session, &meta_cursor));
        meta_cursor->set_key(meta_cursor, session->dhandle->name);
        ret = __wt_curfile_insert_check(meta_cursor);
        if (ret == WT_ROLLBACK) {
            /*
             * If a create, drop or other schema operation on a table happens within a user
             * transaction, the checkpoint can see the dhandle before the commit, which leads to
             * this rollback error. Ignore the dhandle for this checkpoint by returning here.
             */
            __wt_verbose_notice(session, WT_VERB_CHECKPOINT, "%s",
              "WT_ROLLBACK: checkpoint raced with transaction operating on dhandle");
            WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor));
            return (0);
        }
        WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor));
        WT_RET(ret);
    }

    /*
     * Decide whether the tree needs to be included in the checkpoint and if so, acquire the
     * necessary locks.
     */
    WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg));
    WT_RET(ret);
    if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) {
        __checkpoint_update_generation(session);
        return (0);
    }

    /*
     * Make sure there is space for the new entry: do this before getting the handle to avoid
     * cleanup if we can't allocate the memory.
     */
    WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
      session->ckpt_handle_next + 1, &session->ckpt_handle));

    /*
     * The current tree will be included: get it again because the handle we have is only valid
     * for the duration of this function.
     */
    name = session->dhandle->name;
    session->dhandle = NULL;

    if ((ret = __wt_session_get_dhandle(session, name, NULL, NULL, 0)) != 0)
        return (ret == EBUSY ? 0 : ret);

    /*
     * Save the current eviction walk setting: checkpoint can interfere with eviction and we don't
     * want to unfairly penalize (or promote) eviction in trees due to checkpoints.
     */
    btree->evict_walk_saved = btree->evict_walk_period;

    session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle;
    return (0);
}

/*
 * __checkpoint_wait_reduce_dirty_cache --
 *     Try to reduce the amount of dirty data in cache so there is less work to do during the
 *     critical section of the checkpoint.
 */
static void
__checkpoint_wait_reduce_dirty_cache(WT_SESSION_IMPL *session)
{
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    double current_dirty, prev_dirty;
    uint64_t bytes_written_start, bytes_written_total;
    uint64_t cache_size, max_write;
    uint64_t time_start, time_stop;
    uint64_t total_ms;

    conn = S2C(session);
    cache = conn->cache;

    /* Give up if scrubbing is disabled. */
    if (cache->eviction_checkpoint_target < DBL_EPSILON)
        return;

    time_start = __wt_clock(session);
    bytes_written_start = cache->bytes_written;

    /*
     * If the cache size is zero or very small, we're done. The cache size can briefly become zero
     * if we're transitioning to a shared cache via reconfigure. This avoids potential divide by
     * zero.
     */
    if ((cache_size = conn->cache_size) < 10 * WT_MEGABYTE)
        return;

    current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size;
    if (current_dirty <= cache->eviction_checkpoint_target)
        return;

    /* Stop if we write as much dirty data as is currently in cache. */
    max_write = __wt_cache_dirty_leaf_inuse(cache);

    /* Set the dirty trigger to the target value. */
    cache->eviction_scrub_target = cache->eviction_checkpoint_target;
    WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);

    /* Wait while the dirty level is going down. */
    for (;;) {
        __wt_sleep(0, 100 * WT_THOUSAND);

        prev_dirty = current_dirty;
        current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size;
        if (current_dirty <= cache->eviction_checkpoint_target || current_dirty >= prev_dirty)
            break;

        /*
         * We haven't reached the current target.
         *
         * Don't wait indefinitely: there might be dirty pages that can't be evicted. If we can't
         * meet the target, give up and start the checkpoint for real.
         */
        bytes_written_total = cache->bytes_written - bytes_written_start;
        if (bytes_written_total > max_write)
            break;
    }

    time_stop = __wt_clock(session);
    total_ms = WT_CLOCKDIFF_MS(time_stop, time_start);
    WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms);
}
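/*
 * Illustrative example (not part of the original source): the scrub target used above comes from
 * the "eviction_checkpoint_target" configuration, a percentage of the cache size, e.g.:
 *
 *     wiredtiger_open(home, NULL, "create,eviction_checkpoint_target=4", &conn);
 *
 * Setting the target to zero disables scrubbing, making this function return immediately.
 */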
/*
 * __wt_checkpoint_progress --
 *     Output a checkpoint progress message.
 */
void
__wt_checkpoint_progress(WT_SESSION_IMPL *session, bool closing)
{
    struct timespec cur_time;
    WT_CONNECTION_IMPL *conn;
    uint64_t time_diff;

    conn = S2C(session);
    __wt_epoch(session, &cur_time);

    /* Time since the full database checkpoint started. */
    time_diff = WT_TIMEDIFF_SEC(cur_time, conn->ckpt_timer_start);

    if (closing || (time_diff / WT_PROGRESS_MSG_PERIOD) > conn->ckpt_progress_msg_count) {
        __wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS,
          "Checkpoint %s for %" PRIu64 " seconds and wrote: %" PRIu64 " pages (%" PRIu64 " MB)",
          closing ? "ran" : "has been running", time_diff, conn->ckpt_write_pages,
          conn->ckpt_write_bytes / WT_MEGABYTE);
        conn->ckpt_progress_msg_count++;
    }
}

/*
 * __checkpoint_stats --
 *     Update checkpoint timer stats.
 */
static void
__checkpoint_stats(WT_SESSION_IMPL *session)
{
    struct timespec stop;
    WT_CONNECTION_IMPL *conn;
    uint64_t msec;

    conn = S2C(session);

    /* Output a verbose progress message for long running checkpoints. */
    if (conn->ckpt_progress_msg_count > 0)
        __wt_checkpoint_progress(session, true);

    /* Compute end-to-end timer statistics for checkpoint. */
    __wt_epoch(session, &stop);
    msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_scrub_end);

    if (msec > conn->ckpt_time_max)
        conn->ckpt_time_max = msec;
    if (msec < conn->ckpt_time_min)
        conn->ckpt_time_min = msec;
    conn->ckpt_time_recent = msec;
    conn->ckpt_time_total += msec;

    /* Compute timer statistics for the checkpoint prepare. */
    msec = WT_TIMEDIFF_MS(conn->ckpt_prep_end, conn->ckpt_prep_start);

    if (msec > conn->ckpt_prep_max)
        conn->ckpt_prep_max = msec;
    if (msec < conn->ckpt_prep_min)
        conn->ckpt_prep_min = msec;
    conn->ckpt_prep_recent = msec;
    conn->ckpt_prep_total += msec;
}

/*
 * __checkpoint_verbose_track --
 *     Output a verbose message with timing information.
 */
static void
__checkpoint_verbose_track(WT_SESSION_IMPL *session, const char *msg)
{
    struct timespec stop;
    WT_CONNECTION_IMPL *conn;
    uint64_t msec;

    if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
        return;

    conn = S2C(session);
    __wt_epoch(session, &stop);

    /* Get time diff in milliseconds. */
    msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_start);
    __wt_verbose(session, WT_VERB_CHECKPOINT,
      "time: %" PRIu64 " ms, gen: %" PRIu64 ": Full database checkpoint %s", msec,
      __wt_gen(session, WT_GEN_CHECKPOINT), msg);
}

/*
 * __checkpoint_fail_reset --
 *     Reset fields when a failure occurs.
 */
static void
__checkpoint_fail_reset(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;

    btree = S2BT(session);
    btree->modified = true;
    __wt_meta_ckptlist_free(session, &btree->ckpt);
}
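/*
 * Illustrative example (not part of the original source): the progress and timing messages above
 * are only emitted when the matching verbose categories are enabled, e.g.:
 *
 *     wiredtiger_open(home, NULL, "create,verbose=[checkpoint,checkpoint_progress]", &conn);
 */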
/*
 * __checkpoint_prepare --
 *     Start the transaction for a checkpoint and gather handles.
 */
static int
__checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[])
{
    struct timespec tsp;
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_SHARED *txn_shared;
    uint64_t original_snap_min;
    const char *txn_cfg[] = {
      WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL};
    bool use_timestamp;

    conn = S2C(session);
    txn = session->txn;
    txn_global = &conn->txn_global;
    txn_shared = WT_SESSION_TXN_SHARED(session);

    WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
    use_timestamp = (cval.val != 0);

    /*
     * Start a snapshot transaction for the checkpoint.
     *
     * Note: we don't go through the public API calls because they have side effects on cursors,
     * which applications can hold open across calls to checkpoint.
     */
    WT_STAT_CONN_SET(session, txn_checkpoint_prep_running, 1);
    __wt_epoch(session, &conn->ckpt_prep_start);

    WT_RET(__wt_txn_begin(session, txn_cfg));

    /* Wait 1000 microseconds to simulate slowdown in checkpoint prepare. */
    tsp.tv_sec = 0;
    tsp.tv_nsec = WT_MILLION;
    __checkpoint_timing_stress(session, WT_TIMING_STRESS_PREPARE_CHECKPOINT_DELAY, &tsp);

    original_snap_min = session->txn->snap_min;

    WT_DIAGNOSTIC_YIELD;

    /* Ensure a transaction ID is allocated prior to sharing it globally. */
    WT_RET(__wt_txn_id_check(session));

    /* Keep track of handles acquired for locking. */
    WT_RET(__wt_meta_track_on(session));
    *trackingp = true;

    /*
     * Mark the connection as clean. If some data gets modified after the checkpoint transaction
     * ID is generated, the connection will be reset to dirty when reconciliation marks the btree
     * dirty on encountering a dirty page.
     */
    conn->modified = false;

    /*
     * Save the checkpoint session ID.
     *
     * We never do checkpoints in the default session (with id zero).
     */
    WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
    txn_global->checkpoint_id = session->id;

    /*
     * Remove the checkpoint transaction from the global table.
     *
     * This allows ordinary visibility checks to move forward because checkpoints often take a
     * long time and only write to the metadata.
     */
    __wt_writelock(session, &txn_global->rwlock);
    txn_global->checkpoint_txn_shared = *txn_shared;
    txn_global->checkpoint_txn_shared.pinned_id = txn->snap_min;

    /*
     * Sanity check that the oldest ID hasn't moved on before we have cleared our entry.
     */
    WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_shared->id) &&
        WT_TXNID_LE(txn_global->oldest_id, txn_shared->pinned_id));

    /*
     * Clear our entry from the global transaction session table. Any operation that needs to know
     * about the ID for this checkpoint will consider the checkpoint ID in the global structure.
     * Most operations can safely ignore the checkpoint ID (see the visible all check for
     * details).
     */
    txn_shared->id = txn_shared->pinned_id = txn_shared->metadata_pinned = WT_TXN_NONE;

    /*
     * Set the checkpoint transaction's timestamp, if requested.
     *
     * We rely on having the global transaction data locked so the oldest timestamp can't move
     * past the stable timestamp.
     */
    WT_ASSERT(session,
      !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_SHARED_TS_DURABLE | WT_TXN_SHARED_TS_READ));

    if (use_timestamp) {
        /*
         * If the user wants timestamps then set the metadata checkpoint timestamp based on
         * whether or not a stable timestamp is actually in use. Only set it when we're not
         * running recovery because recovery doesn't set the recovery timestamp until its
         * checkpoint is complete.
         */
        if (txn_global->has_stable_timestamp) {
            txn_global->checkpoint_timestamp = txn_global->stable_timestamp;
            if (!F_ISSET(conn, WT_CONN_RECOVERING))
                txn_global->meta_ckpt_timestamp = txn_global->checkpoint_timestamp;
        } else if (!F_ISSET(conn, WT_CONN_RECOVERING))
            txn_global->meta_ckpt_timestamp = txn_global->recovery_timestamp;
    } else {
        if (!F_ISSET(conn, WT_CONN_RECOVERING))
            txn_global->meta_ckpt_timestamp = WT_TS_NONE;
        txn_shared->read_timestamp = WT_TS_NONE;
    }

    __wt_writeunlock(session, &txn_global->rwlock);

    /*
     * Refresh our snapshot here, without publishing our shared IDs to the world: doing so
     * prevents us from racing with the stable timestamp moving ahead of the current snapshot.
     * That is, if the stable timestamp moves after we begin the checkpoint transaction but before
     * we set the checkpoint timestamp, we can end up missing updates in our checkpoint.
     */
    __wt_txn_bump_snapshot(session);

    /* Assert that our snapshot min didn't somehow move backwards. */
    WT_ASSERT(session, session->txn->snap_min >= original_snap_min);
    /* Flag as unused for non-diagnostic builds. */
    WT_UNUSED(original_snap_min);

    if (use_timestamp)
        __wt_verbose_timestamp(
          session, txn_global->checkpoint_timestamp, "Checkpoint requested at stable timestamp");

    /*
     * Get a list of handles we want to flush; for named checkpoints this may pull closed objects
     * into the session cache.
     *
     * First, gather all handles, then start the checkpoint transaction, then release any clean
     * handles.
     */
    WT_ASSERT(session, session->ckpt_handle_next == 0);
    WT_WITH_TABLE_READ_LOCK(
      session, ret = __checkpoint_apply_operation(session, cfg, __wt_checkpoint_get_handles));

    __wt_epoch(session, &conn->ckpt_prep_end);
    WT_STAT_CONN_SET(session, txn_checkpoint_prep_running, 0);

    return (ret);
}
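/*
 * Illustrative example (not part of the original source): with use_timestamp=true (the default),
 * the prepare step above picks up the stable timestamp the application last set, e.g.:
 *
 *     conn->set_timestamp(conn, "stable_timestamp=2a");    -- timestamps are hexadecimal
 *     wt_session->checkpoint(wt_session, "use_timestamp=true");
 */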
/*
 * __txn_checkpoint_can_skip --
 *     Determine whether it's safe to skip taking a checkpoint.
 */
static int
__txn_checkpoint_can_skip(
  WT_SESSION_IMPL *session, const char *cfg[], bool *fullp, bool *use_timestampp, bool *can_skipp)
{
    WT_CONFIG targetconf;
    WT_CONFIG_ITEM cval, k, v;
    WT_CONNECTION_IMPL *conn;
    WT_TXN_GLOBAL *txn_global;
    bool full, use_timestamp;

    /*
     * Default to not skipping, and initialize the other output parameters as well: they will
     * always be set unless there is an error, and callers need to ignore the results on error.
     */
    *can_skipp = *fullp = *use_timestampp = false;

    conn = S2C(session);
    txn_global = &conn->txn_global;

    /*
     * This function also parses out some configuration options and hands them back to the
     * caller, so make sure it does that parsing regardless of the result.
     *
     * Determine if this is going to be a full checkpoint, that is, a checkpoint that applies to
     * all data tables in a database.
     */
    WT_RET(__wt_config_gets(session, cfg, "target", &cval));
    __wt_config_subinit(session, &targetconf, &cval);
    *fullp = full = __wt_config_next(&targetconf, &k, &v) != 0;

    WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
    *use_timestampp = use_timestamp = cval.val != 0;

    /* Never skip non-full checkpoints. */
    if (!full)
        return (0);

    /* Never skip if force is configured. */
    WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
    if (cval.val != 0)
        return (0);

    /* Never skip named checkpoints. */
    WT_RET(__wt_config_gets(session, cfg, "name", &cval));
    if (cval.len != 0)
        return (0);

    /*
     * If the checkpoint is using timestamps and the stable timestamp hasn't been updated since
     * the last checkpoint, there is nothing more that could be written, except when a
     * non-timestamped file has been modified. Hence, if the connection has been modified it is
     * currently unsafe to skip checkpoints.
     */
    if (!conn->modified && use_timestamp && txn_global->has_stable_timestamp &&
      txn_global->last_ckpt_timestamp != WT_TS_NONE &&
      txn_global->last_ckpt_timestamp == txn_global->stable_timestamp) {
        *can_skipp = true;
        return (0);
    }

    /*
     * Skip checkpointing the database if nothing has been dirtied since the last checkpoint.
     * That said, there can be short intervals when a btree gets marked dirty and the connection
     * is yet to be. We might skip a checkpoint in that short interval, which is okay because by
     * the next time we get to checkpoint, the connection would have been marked dirty and hence
     * the checkpoint will not be skipped again.
     *
     * If we are using timestamps then we shouldn't skip as the stable timestamp must have moved,
     * and as such we still need to run checkpoint to update the checkpoint timestamp and the
     * metadata.
     */
    if (!use_timestamp && !conn->modified)
        *can_skipp = true;

    return (0);
}
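/*
 * Illustrative example (not part of the original source): per the checks above, an application
 * can force a checkpoint of an unmodified database, while an unnamed, untargeted, unforced
 * checkpoint of the same database may be skipped entirely:
 *
 *     wt_session->checkpoint(wt_session, "force=true");
 */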
/*
 * __txn_checkpoint_establish_time --
 *     Get a time (wall time, not a timestamp) for this checkpoint. The time is left in the
 *     session.
 */
static void
__txn_checkpoint_establish_time(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    uint64_t ckpt_sec, most_recent;

    conn = S2C(session);

    /*
     * If tiered storage is in use, move the time up to at least the most recent flush first.
     * NOTE: reading the most recent flush time is not an ordered read (or repeated on retry)
     * because currently checkpoint and flush tier are mutually exclusive.
     *
     * Update the global value that tracks the most recent checkpoint, and use it to make sure
     * the most recent checkpoint time doesn't move backwards. Also make sure that this
     * checkpoint time is not the same as the previous one, by running the clock forwards as
     * needed.
     *
     * Note that while it's possible to run the clock a good long way forward if one tries (e.g.,
     * by doing a large number of schema operations that are fast and generate successive
     * checkpoints of the metadata) and some tests (e.g., f_ops) do, this is not expected to
     * happen in real use or lead to significant deviations from wall clock time. In a real
     * database of any size full checkpoints take more than one second and schema operations are
     * rare. Furthermore, though these times are saved on disk and displayed by 'wt list' they
     * are not used operationally except in restricted ways:
     *  - to manage the interaction between hot backups and checkpointing, where the absolute
     *    time does not matter;
     *  - to track when tiered storage was last flushed in order to avoid redoing work, where the
     *    absolute time does not matter;
     *  - to detect and retry races between opening checkpoint cursors and checkpoints in
     *    progress (which only cares about ordering and only since the last database open).
     *
     * Currently the checkpoint time can move backwards if something has run it forward and a
     * crash (or shutdown) and restart happens quickly enough that the wall clock hasn't caught
     * up yet. This is a property of the way it gets initialized at startup, which is naive, and
     * if issues arise where this matters it can get adjusted during startup in much the way the
     * base write generation does. The checkpoint cursor opening code was set up specifically so
     * that this does not matter.
     *
     * It is possible to race here, so use atomic CAS. This code relies on the fact that anyone
     * we race with will only increase (never decrease) the most recent checkpoint time value.
     *
     * We store the time in the session rather than passing it around explicitly because passing
     * it around explicitly runs afoul of the type signatures of the functions passed to
     * schema_worker.
     */
    __wt_seconds(session, &ckpt_sec);
    ckpt_sec = WT_MAX(ckpt_sec, conn->flush_most_recent);

    for (;;) {
        WT_ORDERED_READ(most_recent, conn->ckpt_most_recent);
        if (ckpt_sec <= most_recent)
            ckpt_sec = most_recent + 1;
        if (__wt_atomic_cas64(&conn->ckpt_most_recent, most_recent, ckpt_sec))
            break;
    }

    WT_ASSERT(session, session->current_ckpt_sec == 0);
    session->current_ckpt_sec = ckpt_sec;
}
/*
 * __txn_checkpoint_clear_time --
 *     Clear the current checkpoint time in the session.
 */
static void
__txn_checkpoint_clear_time(WT_SESSION_IMPL *session)
{
    WT_ASSERT(session, session->current_ckpt_sec > 0);
    session->current_ckpt_sec = 0;
}

/*
 * __txn_checkpoint --
 *     Checkpoint a database or a list of objects in the database.
 */
static int
__txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
    struct timespec tsp;
    WT_CACHE *cache;
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_DATA_HANDLE *hs_dhandle;
    WT_DECL_RET;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_ISOLATION saved_isolation;
    wt_off_t hs_size;
    wt_timestamp_t ckpt_tmp_ts;
    size_t namelen;
    uint64_t fsync_duration_usecs, generation, hs_ckpt_duration_usecs;
    uint64_t time_start_fsync, time_start_hs, time_stop_fsync, time_stop_hs;
    u_int i;
    const char *name;
    bool can_skip, failed, full, idle, logging, tracking, use_timestamp;
    void *saved_meta_next;

    conn = S2C(session);
    cache = conn->cache;
    hs_size = 0;
    hs_dhandle = NULL;
    txn = session->txn;
    txn_global = &conn->txn_global;
    saved_isolation = session->isolation;
    full = idle = tracking = use_timestamp = false;

    /* Avoid doing work if possible. */
    WT_RET(__txn_checkpoint_can_skip(session, cfg, &full, &use_timestamp, &can_skip));
    if (can_skip) {
        WT_STAT_CONN_INCR(session, txn_checkpoint_skipped);
        return (0);
    }

    /* Check if this is a named checkpoint. */
    WT_RET(__wt_config_gets(session, cfg, "name", &cval));
    if (cval.len != 0) {
        name = cval.str;
        namelen = cval.len;
    } else {
        name = NULL;
        namelen = 0;
    }

    /*
     * Do a pass over the configuration arguments and figure out what kind of checkpoint this is.
     */
    WT_RET(__checkpoint_apply_operation(session, cfg, NULL));

    logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED);

    /* Reset the statistics tracked per checkpoint. */
    cache->evict_max_page_size = 0;
    conn->rec_maximum_seconds = 0;

    /* Initialize the verbose tracking timer. */
    __wt_epoch(session, &conn->ckpt_timer_start);

    /* Initialize the checkpoint progress tracking data. */
    conn->ckpt_progress_msg_count = 0;
    conn->ckpt_write_bytes = 0;
    conn->ckpt_write_pages = 0;

    /*
     * Get a time (wall time, not a timestamp) for this checkpoint. This will be applied to all
     * the trees so they match. The time is left in the session.
     */
    __txn_checkpoint_establish_time(session);

    /*
     * Update the global oldest ID so we do all possible cleanup.
     *
     * This is particularly important for compact, so that all dirty pages can be fully written.
     */
    WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));

    /* Flush data sources before we start the checkpoint. */
    WT_ERR(__checkpoint_data_source(session, cfg));

    /*
     * Try to reduce the amount of dirty data in cache so there is less work to do during the
     * critical section of the checkpoint.
     */
    __checkpoint_wait_reduce_dirty_cache(session);

    /* Tell logging that we are about to start a database checkpoint. */
    if (full && logging)
        WT_ERR(__wt_txn_checkpoint_log(session, full, WT_TXN_LOG_CKPT_PREPARE, NULL));

    __checkpoint_verbose_track(session, "starting transaction");

    if (full)
        __wt_epoch(session, &conn->ckpt_timer_scrub_end);
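
    /*
     * Overview of the remaining sequence (summarizing the code below): bump the checkpoint
     * generation, start the checkpoint transaction and gather handles (__checkpoint_prepare),
     * checkpoint each tree, checkpoint the history store, sync all the files to disk, commit the
     * checkpoint transaction, and finally make the result durable by checkpointing the metadata
     * or syncing the log.
     */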

    /*
     * Start the checkpoint for real.
     *
     * Bump the global checkpoint generation, used to figure out whether checkpoint has visited a
     * tree. Use an atomic increment even though we are single-threaded because readers of the
     * checkpoint generation don't hold the checkpoint lock.
     *
     * We do need to update it before clearing the checkpoint's entry out of the transaction
     * table, or a thread evicting in a tree could ignore the checkpoint's transaction.
     */
    __wt_gen_next(session, WT_GEN_CHECKPOINT, &generation);
    WT_STAT_CONN_SET(session, txn_checkpoint_generation, generation);

    /*
     * We want to skip checkpointing clean handles whenever possible. That is, when the
     * checkpoint is not named or forced. However, we need to take care about ordering with
     * respect to the checkpoint transaction.
     *
     * We can't skip clean handles before starting the transaction or the checkpoint can miss
     * updates in trees that become dirty as the checkpoint is starting. If we wait until the
     * transaction has started before locking a handle, there could be a metadata-changing
     * operation in between (e.g., salvage) that will cause a write conflict when the checkpoint
     * goes to write the metadata.
     *
     * Hold the schema lock while starting the transaction and gathering handles so the set we
     * get is complete and correct.
     */
    WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, &tracking, cfg));
    WT_ERR(ret);

    /*
     * Save the checkpoint timestamp in a temporary variable: when we release our snapshot, it'll
     * be reset to zero.
     */
    WT_ORDERED_READ(ckpt_tmp_ts, txn_global->checkpoint_timestamp);

    WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT);

    /*
     * Unblock updates -- we can figure out that any updates to clean pages after this point are
     * too new to be written in the checkpoint.
     */
    cache->eviction_scrub_target = 0.0;
    WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);

    /* Tell logging that we have started a database checkpoint. */
    if (full && logging)
        WT_ERR(__wt_txn_checkpoint_log(session, full, WT_TXN_LOG_CKPT_START, NULL));

    /* Add a ten second wait to simulate checkpoint slowness. */
    tsp.tv_sec = 10;
    tsp.tv_nsec = 0;
    __checkpoint_timing_stress(session, WT_TIMING_STRESS_CHECKPOINT_SLOW, &tsp);

    WT_ERR(__checkpoint_apply_to_dhandles(session, cfg, __checkpoint_tree_helper));

    /* Wait prior to checkpointing the history store to simulate checkpoint slowness. */
    __checkpoint_timing_stress(session, WT_TIMING_STRESS_HS_CHECKPOINT_DELAY, &tsp);

    /*
     * Get a history store dhandle. If the history store file is opened for a special operation,
     * this will return EBUSY, which we treat as an error. In scenarios where the history store
     * is not part of the metadata file (performing recovery on a backup folder where no
     * checkpoint occurred), this will return ENOENT, which we ignore and continue.
     */
    WT_ERR_ERROR_OK(__wt_session_get_dhandle(session, WT_HS_URI, NULL, NULL, 0), ENOENT, false);
    hs_dhandle = session->dhandle;

    /*
     * It is possible that we don't have a history store file in certain recovery scenarios. As
     * such we could get a dhandle that is not opened.
     */
    if (F_ISSET(hs_dhandle, WT_DHANDLE_OPEN)) {
        time_start_hs = __wt_clock(session);
        conn->txn_global.checkpoint_running_hs = true;
        WT_STAT_CONN_SET(session, txn_checkpoint_running_hs, 1);

        WT_WITH_DHANDLE(session, hs_dhandle, ret = __wt_checkpoint(session, cfg));

        WT_STAT_CONN_SET(session, txn_checkpoint_running_hs, 0);
        conn->txn_global.checkpoint_running_hs = false;
        WT_ERR(ret);

        /*
         * Once the history store checkpoint is complete, we increment the checkpoint generation
         * of the associated b-tree. The checkpoint generation controls whether we include the
         * checkpoint transaction in our calculations of the pinned and oldest_ids for a given
         * btree. We increment it here to ensure that the visibility checks performed on updates
         * in the history store do not include the checkpoint transaction.
         */
        __checkpoint_update_generation(session);

        time_stop_hs = __wt_clock(session);
        hs_ckpt_duration_usecs = WT_CLOCKDIFF_US(time_stop_hs, time_start_hs);
        WT_STAT_CONN_SET(session, txn_hs_ckpt_duration, hs_ckpt_duration_usecs);
    }

    /*
     * As part of recovery, rollback to stable may have skipped clearing stale transaction IDs.
     * Update the connection base write generation based on the latest checkpoint write
     * generations to reset those transaction IDs present on the pages when reading them.
     */
    if (F_ISSET(conn, WT_CONN_RECOVERING))
        WT_ERR(__wt_metadata_correct_base_write_gen(session));

    /*
     * Clear the dhandle so the visibility check doesn't get confused about the snap min. Don't
     * bother restoring the handle since it doesn't make sense to carry a handle across a
     * checkpoint.
     */
    session->dhandle = NULL;

    /*
     * We have to update the system information before we release the snapshot. Drop the system
     * information for checkpoints we're dropping first in case the names overlap.
     */
    if (session->ckpt_drop_list != NULL) {
        WT_ERR(__drop_list_execute(session, session->ckpt_drop_list));
        __wt_scr_free(session, &session->ckpt_drop_list);
    }
    if (full || name != NULL)
        WT_ERR(__wt_meta_sysinfo_set(session, full, name, namelen));

    /* Release the snapshot so we aren't pinning updates in cache. */
    __wt_txn_release_snapshot(session);

    /* Mark all trees as open for business (particularly eviction). */
    WT_ERR(__checkpoint_apply_to_dhandles(session, cfg, __checkpoint_presync));
    __checkpoint_verbose_track(session, "committing transaction");

    /*
     * Checkpoints have to hit disk (it would be reasonable to configure for lazy checkpoints,
     * but we don't support them yet).
     */
    time_start_fsync = __wt_clock(session);

    WT_ERR(__checkpoint_apply_to_dhandles(session, cfg, __wt_checkpoint_sync));

    /* Sync the history store file. */
    if (F_ISSET(hs_dhandle, WT_DHANDLE_OPEN))
        WT_WITH_DHANDLE(session, hs_dhandle, ret = __wt_checkpoint_sync(session, NULL));
    WT_ERR(ret);

    time_stop_fsync = __wt_clock(session);
    fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop_fsync, time_start_fsync);
    WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post);
    WT_STAT_CONN_SET(session, txn_checkpoint_fsync_post_duration, fsync_duration_usecs);

    __checkpoint_verbose_track(session, "sync completed");

    /* If the history store file exists on disk, update its statistic. */
    if (F_ISSET(hs_dhandle, WT_DHANDLE_OPEN)) {
        WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size));
        WT_STAT_CONN_SET(session, cache_hs_ondisk, hs_size);
    }

    /*
     * Commit the transaction now that we are sure that all files in the checkpoint have been
     * flushed to disk. It's OK to commit before checkpointing the metadata since we know that
     * all files in the checkpoint are now in a consistent state.
     */
    WT_ERR(__wt_txn_commit(session, NULL));

    /*
     * Flush all the logs that are generated during the checkpoint. It is possible that the
     * checkpoint may include changes that are written in parallel by eviction. To have a
     * consistent view of the data, make sure that all the logs are flushed to disk before the
     * checkpoint is complete.
     */
    if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        WT_ERR(__wt_log_flush(session, WT_LOG_FSYNC));

    /*
     * Ensure that the metadata changes are durable before the checkpoint is resolved. Do this by
     * either checkpointing the metadata or syncing the log file. Recovery relies on the
     * checkpoint LSN in the metadata only being updated by full checkpoints, so only checkpoint
     * the metadata for full or non-logged checkpoints.
     *
     * This is very similar to __wt_meta_track_off, ideally they would be merged.
     */
    if (full || !logging) {
        session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;

        /* Disable metadata tracking during the metadata checkpoint. */
        saved_meta_next = session->meta_track_next;
        session->meta_track_next = NULL;

        WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
          WT_WITH_METADATA_LOCK(session, ret = __wt_checkpoint(session, cfg)));
        session->meta_track_next = saved_meta_next;
        WT_ERR(ret);

        WT_WITH_DHANDLE(
          session, WT_SESSION_META_DHANDLE(session), ret = __wt_checkpoint_sync(session, NULL));
        WT_ERR(ret);

        __checkpoint_verbose_track(session, "metadata sync completed");
    } else
        WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
          ret = __wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_SYNC, NULL));

    WT_STAT_CONN_SET(session, txn_checkpoint_stop_stress_active, 1);
    /* Wait before flushing the checkpoint stop log record. */
    __checkpoint_timing_stress(session, WT_TIMING_STRESS_CHECKPOINT_STOP, &tsp);
    WT_STAT_CONN_SET(session, txn_checkpoint_stop_stress_active, 0);

    /*
     * Now that the metadata is stable, re-open the metadata file for regular eviction by
     * clearing the checkpoint_pinned flag.
     */
    txn_global->checkpoint_txn_shared.pinned_id = WT_TXN_NONE;

    if (full) {
        __checkpoint_stats(session);

        /*
         * If timestamps defined the checkpoint's content, set the saved last checkpoint
         * timestamp, otherwise clear it. We clear it for a couple of reasons: applications can
         * query it and we don't want to lie, and we use it to decide if
         * WT_CONNECTION.rollback_to_stable is an allowed operation. For the same reason, don't
         * set it to WT_TS_NONE when the checkpoint timestamp is WT_TS_NONE, set it to 1 so we
         * can tell the difference.
         */
        if (use_timestamp) {
            conn->txn_global.last_ckpt_timestamp = ckpt_tmp_ts;
            /*
             * MongoDB assumes the checkpoint timestamp will be initialized with WT_TS_NONE. In
             * such cases it queries the recovery timestamp to determine the last stable recovery
             * timestamp. So, if the recovery timestamp is valid, set the last checkpoint
             * timestamp to the recovery timestamp. This should never be a problem, as the
             * checkpoint timestamp should never be less than the recovery timestamp. This could
             * potentially avoid MongoDB making two calls to determine the last stable recovery
             * timestamp.
             */
            if (conn->txn_global.last_ckpt_timestamp == WT_TS_NONE)
                conn->txn_global.last_ckpt_timestamp = conn->txn_global.recovery_timestamp;
        } else
            conn->txn_global.last_ckpt_timestamp = WT_TS_NONE;
    }

err:
    /*
     * Reset the timer so that the next checkpoint tracks progress only if configured.
     */
    conn->ckpt_timer_start.tv_sec = 0;

    /*
     * XXX Rolling back the changes here is problematic.
     *
     * If we unroll here, we need a way to roll back changes to the avail list for each tree that
     * was successfully synced before the error occurred. Otherwise, the next time we try this
     * operation, we will try to free an old checkpoint again.
     *
     * OTOH, if we commit the changes after a failure, we have partially overwritten the
     * checkpoint, so what ends up on disk is not consistent.
     */
    failed = ret != 0;
    if (failed)
        conn->modified = true;

    session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
    if (tracking)
        WT_TRET(__wt_meta_track_off(session, false, failed));

    cache->eviction_scrub_target = 0.0;
    WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);

    if (F_ISSET(txn, WT_TXN_RUNNING)) {
        /*
         * Clear the dhandle so the visibility check doesn't get confused about the snap min.
         * Don't bother restoring the handle since it doesn't make sense to carry a handle across
         * a checkpoint.
         */
        session->dhandle = NULL;
        WT_TRET(__wt_txn_rollback(session, NULL));
    }

    /*
     * Tell logging that we have finished a database checkpoint. Do not write a log record if the
     * database was idle.
     */
    if (full && logging) {
        if (ret == 0 && F_ISSET(CUR2BT(session->meta_cursor), WT_BTREE_SKIP_CKPT))
            idle = true;
        WT_TRET(__wt_txn_checkpoint_log(session, full,
          (ret == 0 && !idle) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL));
    }

    for (i = 0; i < session->ckpt_handle_next; ++i) {
        if (session->ckpt_handle[i] == NULL)
            continue;
        /*
         * If the operation failed, mark all trees dirty so they are included in a future
         * checkpoint, which can then succeed.
         */
        if (failed)
            WT_WITH_DHANDLE(session, session->ckpt_handle[i], __checkpoint_fail_reset(session));
        WT_WITH_DHANDLE(
          session, session->ckpt_handle[i], WT_TRET(__wt_session_release_dhandle(session)));
    }

    if (session->ckpt_drop_list != NULL)
        __wt_scr_free(session, &session->ckpt_drop_list);

    __txn_checkpoint_clear_time(session);

    __wt_free(session, session->ckpt_handle);
    session->ckpt_handle_allocated = session->ckpt_handle_next = 0;

    session->isolation = txn->isolation = saved_isolation;
    return (ret);
}

/*
 * __txn_checkpoint_wrapper --
 *     Checkpoint wrapper.
 */
static int
__txn_checkpoint_wrapper(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_TXN_GLOBAL *txn_global;

    conn = S2C(session);
    txn_global = &conn->txn_global;

    WT_STAT_CONN_SET(session, txn_checkpoint_running, 1);
    txn_global->checkpoint_running = true;

    ret = __txn_checkpoint(session, cfg);

    WT_STAT_CONN_SET(session, txn_checkpoint_running, 0);
    txn_global->checkpoint_running = false;

    /*
     * Signal the tiered storage thread because it waits for the following checkpoint to complete
     * to process flush units. Indicate that the checkpoint has completed.
     */
    if (conn->tiered_cond != NULL) {
        conn->flush_ckpt_complete = true;
        __wt_cond_signal(session, conn->tiered_cond);
    }

    return (ret);
}
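/*
 * Illustrative example (not part of the original source): __wt_txn_checkpoint below is the
 * internal entry point underlying the public WT_SESSION::checkpoint call, e.g. (error returns
 * omitted for brevity):
 *
 *     WT_SESSION *wt_session;
 *
 *     conn->open_session(conn, NULL, NULL, &wt_session);
 *     wt_session->checkpoint(wt_session, NULL);
 */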
/*
 * __wt_txn_checkpoint --
 *     Checkpoint a database or a list of objects in the database.
 */
int
__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting)
{
    WT_DECL_RET;
    uint32_t orig_flags;

    /*
     * Reset open cursors. Do this explicitly, even though it will happen implicitly in the call
     * to begin_transaction for the checkpoint, because the checkpoint code will acquire the
     * schema lock before we do that, and some implementation of WT_CURSOR::reset might need the
     * schema lock.
     */
    WT_RET(__wt_session_reset_cursors(session, false));

    /* Ensure the metadata table is open before taking any locks. */
    WT_RET(__wt_metadata_cursor(session, NULL));

    /*
     * Don't hijack the session checkpoint thread for eviction.
     *
     * Application threads are not generally available for potentially slow operations, but
     * checkpoint does enough I/O it may be called upon to perform slow operations for the block
     * manager.
     *
     * Application checkpoints wait until the checkpoint lock is available, compaction
     * checkpoints don't.
     *
     * Checkpoints should always use a separate session for history store updates, otherwise
     * those updates are pinned until the checkpoint commits. Also, there are unfortunate
     * interactions between the special rules for history store eviction and the special handling
     * of the checkpoint transaction.
     */
#undef WT_CHECKPOINT_SESSION_FLAGS
#define WT_CHECKPOINT_SESSION_FLAGS (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE)
    orig_flags = F_MASK(session, WT_CHECKPOINT_SESSION_FLAGS);
    F_SET(session, WT_CHECKPOINT_SESSION_FLAGS);

    /*
     * Only one checkpoint can be active at a time, and checkpoints must run in the same order as
     * they update the metadata. It's probably a bad idea to run checkpoints out of multiple
     * threads, but as compaction calls checkpoint directly, it can be tough to avoid. Serialize
     * here to ensure we don't get into trouble.
     */
    if (waiting)
        WT_WITH_CHECKPOINT_LOCK(session, ret = __txn_checkpoint_wrapper(session, cfg));
    else
        WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, ret = __txn_checkpoint_wrapper(session, cfg));

    F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS);
    F_SET(session, orig_flags);

    return (ret);
}

/*
 * __drop_list_execute --
 *     Clear the system info (snapshot and timestamp info) for the named checkpoints on the drop
 *     list.
 */
static int
__drop_list_execute(WT_SESSION_IMPL *session, WT_ITEM *drop_list)
{
    WT_CONFIG dropconf;
    WT_CONFIG_ITEM k, v;
    WT_DECL_RET;

    /* The list has the form (name, name, ...,) so we can read it with the config parser. */
    __wt_config_init(session, &dropconf, drop_list->data);
    while ((ret = __wt_config_next(&dropconf, &k, &v)) == 0)
        WT_RET(__wt_meta_sysinfo_clear(session, k.str, k.len));
    WT_RET_NOTFOUND_OK(ret);

    return (0);
}

/*
 * __drop_list_add --
 *     Add a checkpoint name to the list of (named) checkpoints being dropped. The list is
 *     produced by the first tree in the checkpoint (it must be the same in every tree, so it
 *     only needs to be produced once) and used at the top level to drop the snapshot and
 *     timestamp metadata for those checkpoints. Note that while there are several places in this
 *     file where WT_CKPT_DELETE is cleared on the fly, meaning the checkpoint won't actually be
 *     dropped, none of these apply to named checkpoints.
 */
static int
__drop_list_add(WT_SESSION_IMPL *session, WT_ITEM *drop_list, const char *name)
{
    return (__wt_buf_catfmt(session, drop_list, "%s,", name));
}

/*
 * __drop --
 *     Drop all checkpoints with a specific name.
 */
static int
__drop(
  WT_SESSION_IMPL *session, WT_ITEM *drop_list, WT_CKPT *ckptbase, const char *name, size_t len)
{
    WT_CKPT *ckpt;

    /*
     * If we're dropping internal checkpoints, match to the '.' separating the checkpoint name
     * from the generational number, and take all that we can find. Applications aren't allowed
     * to use any variant of this name, so the test is still pretty simple: if the leading bytes
     * match, it's one we want to drop.
     */
    if (strncmp(WT_CHECKPOINT, name, len) == 0) {
        WT_CKPT_FOREACH (ckptbase, ckpt)
            if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
                F_SET(ckpt, WT_CKPT_DELETE);
    } else
        WT_CKPT_FOREACH (ckptbase, ckpt)
            if (WT_STRING_MATCH(ckpt->name, name, len)) {
                /* Remember the names of named checkpoints we're dropping. */
                if (drop_list != NULL)
                    WT_RET(__drop_list_add(session, drop_list, ckpt->name));
                F_SET(ckpt, WT_CKPT_DELETE);
            }

    return (0);
}
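/*
 * Illustrative examples (not part of the original source) of drop configurations parsed into
 * calls to __drop above and __drop_from/__drop_to below:
 *
 *     wt_session->checkpoint(wt_session, "drop=(midnight)");        -- drop by name
 *     wt_session->checkpoint(wt_session, "drop=(from=midnight)");   -- that name and all later
 *     wt_session->checkpoint(wt_session, "drop=(to=midnight)");     -- that name and all earlier
 *     wt_session->checkpoint(wt_session, "drop=(from=all)");        -- all checkpoints
 */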
/*
 * __drop_from --
 *     Drop all checkpoints after, and including, the named checkpoint.
 */
static int
__drop_from(
  WT_SESSION_IMPL *session, WT_ITEM *drop_list, WT_CKPT *ckptbase, const char *name, size_t len)
{
    WT_CKPT *ckpt;
    bool matched;

    /*
     * There's a special case -- if the name is "all", then we delete all of the checkpoints.
     */
    if (WT_STRING_MATCH("all", name, len)) {
        WT_CKPT_FOREACH (ckptbase, ckpt) {
            /* Remember the names of named checkpoints we're dropping. */
            if (drop_list != NULL && !WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
                WT_RET(__drop_list_add(session, drop_list, ckpt->name));
            F_SET(ckpt, WT_CKPT_DELETE);
        }
        return (0);
    }

    /*
     * We use the first checkpoint we can find, that is, if there are two checkpoints with the
     * same name in the list, we'll delete from the first match to the end.
     */
    matched = false;
    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (!matched && !WT_STRING_MATCH(ckpt->name, name, len))
            continue;

        matched = true;
        /* Remember the names of named checkpoints we're dropping. */
        if (drop_list != NULL && !WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
            WT_RET(__drop_list_add(session, drop_list, ckpt->name));
        F_SET(ckpt, WT_CKPT_DELETE);
    }

    return (0);
}

/*
 * __drop_to --
 *     Drop all checkpoints before, and including, the named checkpoint.
 */
static int
__drop_to(
  WT_SESSION_IMPL *session, WT_ITEM *drop_list, WT_CKPT *ckptbase, const char *name, size_t len)
{
    WT_CKPT *ckpt, *mark;

    /*
     * We use the last checkpoint we can find, that is, if there are two checkpoints with the
     * same name in the list, we'll delete from the beginning to the second match, not the first.
     */
    mark = NULL;
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if (WT_STRING_MATCH(ckpt->name, name, len))
            mark = ckpt;

    if (mark == NULL)
        return (0);

    WT_CKPT_FOREACH (ckptbase, ckpt) {
        /* Remember the names of named checkpoints we're dropping. */
        if (drop_list != NULL && !WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
            WT_RET(__drop_list_add(session, drop_list, ckpt->name));
        F_SET(ckpt, WT_CKPT_DELETE);

        if (ckpt == mark)
            break;
    }

    return (0);
}
/*
 * __checkpoint_lock_dirty_tree_int --
 *     Helper for __checkpoint_lock_dirty_tree. Intended to be called while holding the hot
 *     backup lock.
 */
static int
__checkpoint_lock_dirty_tree_int(WT_SESSION_IMPL *session, bool is_checkpoint, bool force,
  WT_BTREE *btree, WT_CKPT *ckpt, WT_CKPT *ckptbase)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    u_int max_ckpt_drop;
    bool is_wt_ckpt;

    WT_UNUSED(is_checkpoint);
    conn = S2C(session);

    /* Check that it is OK to remove all the checkpoints marked for deletion. */
    max_ckpt_drop = 0;
    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (!F_ISSET(ckpt, WT_CKPT_DELETE))
            continue;
        is_wt_ckpt = WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT);

        /*
         * If there is a hot backup, don't delete any WiredTiger checkpoint that could possibly
         * have been created before the backup started. Fail if trying to delete any other named
         * checkpoint.
         */
        if (conn->hot_backup_start != 0 && ckpt->sec <= conn->hot_backup_start) {
            if (is_wt_ckpt) {
                F_CLR(ckpt, WT_CKPT_DELETE);
                continue;
            }
            WT_RET_MSG(session, EBUSY,
              "checkpoint %s blocked by hot backup: it would delete an existing named "
              "checkpoint, and such checkpoints cannot be deleted during a hot backup",
              ckpt->name);
        }
        /*
         * Dropping checkpoints involves a fair amount of work while holding locks. Limit the
         * number of WiredTiger checkpoints dropped per checkpoint.
         */
        if (is_wt_ckpt)
#define WT_MAX_CHECKPOINT_DROP 4
            if (++max_ckpt_drop >= WT_MAX_CHECKPOINT_DROP)
                F_CLR(ckpt, WT_CKPT_DELETE);
    }

    /*
     * Mark old checkpoints that are being deleted and figure out which trees we can skip in this
     * checkpoint.
     */
    WT_RET(__checkpoint_mark_skip(session, ckptbase, force));
    if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) {
        /*
         * If we decide to skip checkpointing, clear the delete flag on the checkpoints. The list
         * of checkpoints will be cached for future access. Which checkpoints need to be deleted
         * can change in the meantime.
         */
        WT_CKPT_FOREACH (ckptbase, ckpt)
            if (F_ISSET(ckpt, WT_CKPT_DELETE))
                F_CLR(ckpt, WT_CKPT_DELETE);
        return (0);
    }

    /*
     * Lock the checkpoints that will be deleted.
     *
     * Checkpoints are only locked when tracking is enabled, which covers checkpoint and drop
     * operations, but not close. The reasoning is there should be no access to a checkpoint
     * during close, because any thread accessing a checkpoint will also have the current file
     * handle open.
     */
    if (WT_META_TRACKING(session))
        WT_CKPT_FOREACH (ckptbase, ckpt) {
            if (!F_ISSET(ckpt, WT_CKPT_DELETE))
                continue;
            WT_ASSERT(session,
              !WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT) || conn->hot_backup_start == 0 ||
                ckpt->sec > conn->hot_backup_start);

            /*
             * We can't delete checkpoints referenced by a cursor. WiredTiger checkpoints are
             * uniquely named and it's OK to have multiple in the system: clear the delete flag
             * for them, and otherwise fail.
             */
            ret = __wt_session_lock_checkpoint(session, ckpt->name);
            if (ret == 0)
                continue;
            if (ret == EBUSY && WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
                F_CLR(ckpt, WT_CKPT_DELETE);
                continue;
            }
            WT_RET_MSG(session, ret, "checkpoints cannot be dropped when in-use");
        }

    /*
     * There are special trees: those being bulk-loaded, salvaged, upgraded or verified during
     * the checkpoint. They should never be part of a checkpoint: we will fail to lock them
     * because the operations have exclusive access to the handles. Named checkpoints will fail
     * in that case, ordinary checkpoints skip files that cannot be opened normally.
     */
    WT_ASSERT(session, !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS));

    return (0);
}
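/*
 * Illustrative example (not part of the original source): while a backup cursor is open, the hot
 * backup checks above make dropping a named checkpoint created before the backup started fail:
 *
 *     wt_session->open_cursor(wt_session, "backup:", NULL, NULL, &backup_cursor);
 *     wt_session->checkpoint(wt_session, "drop=(midnight)");    -- fails with EBUSY
 */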
/*
 * __checkpoint_lock_dirty_tree --
 *     Decide whether the tree needs to be included in the checkpoint and if so, acquire the
 *     necessary locks.
 */
static int
__checkpoint_lock_dirty_tree(
  WT_SESSION_IMPL *session, bool is_checkpoint, bool force, bool need_tracking, const char *cfg[])
{
    WT_BTREE *btree;
    WT_CKPT *ckpt, *ckptbase;
    WT_CONFIG dropconf;
    WT_CONFIG_ITEM cval, k, v;
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;
    WT_ITEM *drop_list;
    size_t ckpt_bytes_allocated;
    uint64_t now;
    char *name_alloc;
    const char *name;
    bool is_drop, is_wt_ckpt, seen_ckpt_add, skip_ckpt;

    btree = S2BT(session);
    ckpt = ckptbase = NULL;
    ckpt_bytes_allocated = 0;
    dhandle = session->dhandle;
    drop_list = NULL;
    name_alloc = NULL;
    seen_ckpt_add = false;

    /*
     * Only referenced in diagnostic builds, and gcc 5.1 isn't satisfied with wrapping the entire
     * assert condition in the unused macro.
     */
    WT_UNUSED(need_tracking);

    /*
     * Most callers need meta tracking to be on here, otherwise it is possible for this
     * checkpoint to clean up handles that are still in use. The exceptions are:
     *  - Checkpointing the metadata handle itself.
     *  - On connection close, when we know there can't be any races.
     */
    WT_ASSERT(session, !need_tracking || WT_IS_METADATA(dhandle) || WT_META_TRACKING(session));

    /* This may be a named checkpoint, check the configuration. */
    cval.len = 0;
    is_drop = is_wt_ckpt = false;
    if (cfg != NULL)
        WT_ERR(__wt_config_gets(session, cfg, "name", &cval));
    if (cval.len == 0) {
        name = WT_CHECKPOINT;
        is_wt_ckpt = true;
    } else {
        WT_ERR(__checkpoint_name_ok(session, cval.str, cval.len, false));
        WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc));
        name = name_alloc;
    }

    /*
     * Determine if a drop is part of the configuration. It usually isn't, so delay processing
     * more until we know if we need to process this tree.
     */
    if (cfg != NULL) {
        cval.len = 0;
        WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
        if (cval.len != 0)
            is_drop = true;
    }

    /*
     * This is a complicated test to determine if we can avoid the expensive call of getting the
     * list of checkpoints for this file. We want to avoid that for clean files. But on clean
     * files we want to periodically check if we need to delete old checkpoints that may have
     * been in use by an open cursor.
     */
    if (!btree->modified && !force && is_checkpoint && is_wt_ckpt && !is_drop) {
        /* In the common case of the timer set forever, don't even check the time. */
        skip_ckpt = true;
        if (btree->clean_ckpt_timer != WT_BTREE_CLEAN_CKPT_FOREVER) {
            __wt_seconds(session, &now);
            if (now > btree->clean_ckpt_timer)
                skip_ckpt = false;
        }

        /* Skip the clean btree until the btree has obsolete pages. */
        if (skip_ckpt && !F_ISSET(btree, WT_BTREE_OBSOLETE_PAGES)) {
            F_SET(btree, WT_BTREE_SKIP_CKPT);
            goto skip;
        }
    }

    /*
     * Discard the saved list of checkpoints, and slow path if this is not a WiredTiger
     * checkpoint or if checkpoint drops are involved. Also, if we do not have a checkpoint array
     * size, the regular checkpoint process did not create the array. It is safer to discard the
     * array in such a case.
     */
    if (!is_wt_ckpt || is_drop || btree->ckpt_bytes_allocated == 0)
        __wt_meta_saved_ckptlist_free(session);

    /* If we have to process this btree for any reason, reset the timer and obsolete pages flag. */
    WT_BTREE_CLEAN_CKPT(session, btree, 0);
    F_CLR(btree, WT_BTREE_OBSOLETE_PAGES);

    WT_ERR(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase, &ckpt_bytes_allocated));

    /* We may be dropping specific checkpoints, check the configuration. */
    if (cfg != NULL) {
        cval.len = 0;
        WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
        if (cval.len != 0) {
            /* Gather the list of named checkpoints to drop (if any) from the first tree visited. */
            if (session->ckpt_drop_list == NULL) {
                WT_ERR(__wt_scr_alloc(session, cval.len + 10, &session->ckpt_drop_list));
                WT_ERR(__wt_buf_set(session, session->ckpt_drop_list, "(", 1));
                drop_list = session->ckpt_drop_list;
            }

            __wt_config_subinit(session, &dropconf, &cval);
            while ((ret = __wt_config_next(&dropconf, &k, &v)) == 0) {
                /* Disallow unsafe checkpoint names. */
                if (v.len == 0)
                    WT_ERR(__checkpoint_name_ok(session, k.str, k.len, true));
                else
                    WT_ERR(__checkpoint_name_ok(session, v.str, v.len, true));

                if (v.len == 0)
                    WT_ERR(__drop(session, drop_list, ckptbase, k.str, k.len));
                else if (WT_STRING_MATCH("from", k.str, k.len))
                    WT_ERR(__drop_from(session, drop_list, ckptbase, v.str, v.len));
                else if (WT_STRING_MATCH("to", k.str, k.len))
                    WT_ERR(__drop_to(session, drop_list, ckptbase, v.str, v.len));
                else
                    WT_ERR_MSG(session, EINVAL, "unexpected value for checkpoint key: %.*s",
                      (int)k.len, k.str);
            }
            WT_ERR_NOTFOUND_OK(ret, false);

            if (drop_list != NULL)
                WT_ERR(__wt_buf_catfmt(session, drop_list, ")"));
        }
    }

    /*
     * Drop checkpoints with the same name as the one we're taking. We don't need to add this to
     * the drop list for snapshot/timestamp metadata because the metadata will be replaced by the
     * new checkpoint.
     */
    WT_ERR(__drop(session, NULL, ckptbase, name, strlen(name)));

    /* Set the name of the new entry at the end of the list. */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        ;
    WT_ERR(__wt_strdup(session, name, &ckpt->name));

    /*
     * There is some interaction between backups and checkpoints. Perform all backup related
     * operations that the checkpoint needs now, while holding the hot backup read lock.
     */

    /*
     * There is some interaction between backups and checkpoints. Perform all backup-related
     * operations that the checkpoint needs now, while holding the hot backup read lock.
     */
    WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session,
      ret = __checkpoint_lock_dirty_tree_int(session, is_checkpoint, force, btree, ckpt, ckptbase));
    WT_ERR(ret);

    /*
     * If we decided to skip checkpointing, we need to remove the new checkpoint entry we might
     * have appended to the list.
     */
    if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) {
        WT_CKPT_FOREACH_NAME_OR_ORDER (ckptbase, ckpt) {
            /* Checkpoint(s) to be added are always at the end of the list. */
            WT_ASSERT(session, !seen_ckpt_add || F_ISSET(ckpt, WT_CKPT_ADD));
            if (F_ISSET(ckpt, WT_CKPT_ADD)) {
                seen_ckpt_add = true;
                __wt_meta_checkpoint_free(session, ckpt);
            }
        }
    }

    if (ckptbase->name != NULL) {
        btree->ckpt = ckptbase;
        btree->ckpt_bytes_allocated = ckpt_bytes_allocated;
    } else {
/* It is possible that we do not have any checkpoint in the list. */
err:
        __wt_meta_ckptlist_free(session, &ckptbase);
        btree->ckpt = NULL;
        btree->ckpt_bytes_allocated = 0;
    }

skip:
    __wt_free(session, name_alloc);
    WT_UNUSED(seen_ckpt_add);

    return (ret);
}

/*
 * __checkpoint_apply_obsolete --
 *     Return true if the checkpoint is obsolete.
 */
static bool
__checkpoint_apply_obsolete(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_CKPT *ckpt)
{
    wt_timestamp_t stop_ts;

    stop_ts = WT_TS_MAX;
    if (ckpt->size != 0) {
        /*
         * If the checkpoint has a valid stop timestamp, mark the btree as having obsolete pages.
         * This flag is used to avoid skipping the btree until the obsolete check is performed on
         * the checkpoints.
         */
        if (ckpt->ta.newest_stop_ts != WT_TS_MAX) {
            F_SET(btree, WT_BTREE_OBSOLETE_PAGES);
            stop_ts = ckpt->ta.newest_stop_durable_ts;
        }
        if (__wt_txn_visible_all(session, ckpt->ta.newest_stop_txn, stop_ts)) {
            WT_STAT_CONN_DATA_INCR(session, txn_checkpoint_obsolete_applied);
            return (true);
        }
    }

    return (false);
}
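
/*
 * A sketch of the visibility reasoning above (illustrative only): a non-empty checkpoint is
 * obsolete once its newest stop transaction and timestamp are visible to all current readers,
 * that is, once
 *
 *     __wt_txn_visible_all(session, ckpt->ta.newest_stop_txn, stop_ts)
 *
 * returns true, where stop_ts is the durable stop timestamp when the aggregated stop timestamp
 * is set and WT_TS_MAX otherwise. No reader can see anything newer in that checkpoint, so
 * checkpoint cleanup is free to remove its obsolete pages.
 */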

/*
 * __checkpoint_mark_skip --
 *     Figure out whether the checkpoint can be skipped for a tree.
 */
static int
__checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force)
{
    WT_BTREE *btree;
    WT_CKPT *ckpt;
    uint64_t timer;
    int deleted;
    const char *name;

    btree = S2BT(session);

    /*
     * Check for clean objects not requiring a checkpoint.
     *
     * If we're closing a handle, and the object is clean, we can skip the checkpoint, whatever
     * checkpoints we have are sufficient. (We might not have any checkpoints if the object was
     * never modified, and that's OK: the object creation code doesn't mark the tree modified so
     * we can skip newly created trees here.)
     *
     * If the application repeatedly checkpoints an object (imagine hourly checkpoints using the
     * same explicit or internal name), there's no reason to repeat the checkpoint for clean
     * objects. The test: if the only checkpoint we're deleting is the last one in the list, and
     * it has the same name as the checkpoint we're about to take, skip the work. (We can't skip
     * checkpoints that delete more than the last checkpoint because deleting those checkpoints
     * might free up space in the file.) This means an application toggling between two (or more)
     * checkpoint names will repeatedly take empty checkpoints, but that's not likely enough to
     * make detection worthwhile.
     *
     * Checkpoint read-only objects otherwise: the application must be able to open the checkpoint
     * in a cursor after taking any checkpoint, which means it must exist.
     */
    F_CLR(btree, WT_BTREE_SKIP_CKPT);
    if (!btree->modified && !force) {
        deleted = 0;
        WT_CKPT_FOREACH (ckptbase, ckpt) {
            /*
             * Don't skip objects that have obsolete pages, so that those pages can be removed as
             * part of checkpoint cleanup.
             */
            if (__checkpoint_apply_obsolete(session, btree, ckpt))
                return (0);

            if (F_ISSET(ckpt, WT_CKPT_DELETE))
                ++deleted;
        }

        /*
         * Complicated test: if the tree is clean and the last two checkpoints have the same name
         * (correcting for internal checkpoint names with their generational suffix numbers), we
         * can skip the checkpoint; there's nothing to do. The exception is if we're deleting two
         * or more checkpoints: then we may save space.
         */
        name = (ckpt - 1)->name;
        if (ckpt > ckptbase + 1 && deleted < 2 &&
          (strcmp(name, (ckpt - 2)->name) == 0 ||
            (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
              WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) {
            F_SET(btree, WT_BTREE_SKIP_CKPT);
            /*
             * If there are potentially extra checkpoints to delete, we set the timer to recheck
             * later. If there are at most two checkpoints, the current one and possibly a
             * previous one, then we know there are no additional ones to delete. In that case,
             * set the timer to forever. If the table gets dirtied or a checkpoint is forced, that
             * will clear the timer.
             */
            if (ckpt - ckptbase > 2) {
                __wt_seconds(session, &timer);
                timer += WT_MINUTE * WT_BTREE_CLEAN_MINUTES;
                WT_BTREE_CLEAN_CKPT(session, btree, timer);
            } else
                WT_BTREE_CLEAN_CKPT(session, btree, WT_BTREE_CLEAN_CKPT_FOREVER);
            return (0);
        }
    }

    return (0);
}

/*
 * __wt_checkpoint_tree_reconcile_update --
 *     Update a checkpoint based on reconciliation results.
 */
void
__wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta)
{
    WT_BTREE *btree;
    WT_CKPT *ckpt, *ckptbase;

    btree = S2BT(session);

    /*
     * Reconciliation just wrote a checkpoint, everything has been written. Update the checkpoint
     * with reconciliation information. The reason for this function is that the reconciliation
     * code just passes through the btree structure's checkpoint array; it doesn't know any more
     * than that.
     */
    ckptbase = btree->ckpt;
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if (F_ISSET(ckpt, WT_CKPT_ADD)) {
            ckpt->write_gen = btree->write_gen;
            ckpt->run_write_gen = btree->run_write_gen;
            WT_TIME_AGGREGATE_COPY(&ckpt->ta, ta);
        }
}

/*
 * __checkpoint_save_ckptlist --
 *     Post-process the ckptlist to carry forward a cached list for the next checkpoint.
 */
static int
__checkpoint_save_ckptlist(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
    WT_CKPT *ckpt, *ckpt_itr;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;

    ckpt_itr = ckptbase;
    WT_ERR(__wt_scr_alloc(session, 0, &tmp));
    WT_CKPT_FOREACH (ckptbase, ckpt) {
        /* Remove any deleted checkpoints by shifting the array. */
        if (F_ISSET(ckpt, WT_CKPT_DELETE)) {
            __wt_meta_checkpoint_free(session, ckpt);
            continue;
        }

        /* Clean up block manager information. */
        __wt_free(session, ckpt->bpriv);
        ckpt->bpriv = NULL;

        /* Update the internal checkpoints to their full names, with the generation count suffix. */
        if (strcmp(ckpt->name, WT_CHECKPOINT) == 0) {
            WT_ERR(__wt_buf_fmt(session, tmp, "%s.%" PRId64, WT_CHECKPOINT, ckpt->order));
            __wt_free(session, ckpt->name);
            WT_ERR(__wt_strdup(session, tmp->mem, &ckpt->name));
        }

        /* Reset the flags, and mark a checkpoint fake if there is no address. */
        ckpt->flags = 0;
        if (ckpt->addr.size == 0) {
            WT_ASSERT(session, ckpt->addr.data == NULL);
            F_SET(ckpt, WT_CKPT_FAKE);
        }

        /* Shift the valid checkpoints, if there are deleted checkpoints in the list. */
        if (ckpt_itr != ckpt) {
            *ckpt_itr = *ckpt;
            WT_CLEAR(*ckpt);
        }
        ckpt_itr++;
    }
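
    /*
     * At this point every surviving internal checkpoint carries its full on-disk name. For
     * example (illustrative only), an entry created under the internal WT_CHECKPOINT name with
     * order 6 is now stored as "WiredTigerCheckpoint.6", matching the metadata form.
     */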

    /*
     * Confirm that the last checkpoint has a metadata entry that we can use to base a new
     * checkpoint on.
     */
    ckpt_itr--;
    WT_ASSERT(session, ckpt_itr->block_metadata != NULL);

err:
    __wt_scr_free(session, &tmp);
    return (ret);
}

/*
 * __checkpoint_tree --
 *     Checkpoint a single tree. Assumes all necessary locks have been acquired by the caller.
 */
static int
__checkpoint_tree(WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[])
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;
    WT_LSN ckptlsn;
    WT_TIME_AGGREGATE ta;
    bool fake_ckpt, resolve_bm;

    WT_UNUSED(cfg);

    btree = S2BT(session);
    bm = btree->bm;
    conn = S2C(session);
    dhandle = session->dhandle;
    fake_ckpt = resolve_bm = false;
    WT_TIME_AGGREGATE_INIT(&ta);

    /*
     * Set the checkpoint LSN to the maximum LSN so that if logging is disabled, recovery will
     * never roll old changes forward over the non-logged changes in this checkpoint. If logging
     * is enabled, a real checkpoint LSN will be assigned for this checkpoint and overwrite this.
     */
    WT_MAX_LSN(&ckptlsn);

    /*
     * If an object has never been used (in other words, if it could become a bulk-loaded file),
     * then we must fake the checkpoint. This is good because we don't write physical checkpoint
     * blocks for just-created files, but it's not just a good idea. The reason is that deleting a
     * physical checkpoint requires writing the file, and fake checkpoints can't write the file.
     * If you (1) create a physical checkpoint for an empty file which writes blocks, (2) start
     * bulk-loading records into the file, (3) during the bulk-load perform another checkpoint
     * with the same name; in order to keep from having two checkpoints with the same name you
     * would have to use the bulk-load's fake checkpoint to delete a physical checkpoint, and that
     * will end in tears.
     */
    if (is_checkpoint && btree->original) {
        __wt_checkpoint_tree_reconcile_update(session, &ta);

        fake_ckpt = true;
        goto fake;
    }

    /*
     * Mark the root page dirty to ensure something gets written. (If the tree is modified, we
     * must write the root page anyway, so this doesn't add additional writes to the process. If
     * the tree is not modified, we have to dirty the root page to ensure something gets written.)
     * This is really about paranoia: if the tree modification value gets out of sync with the set
     * of dirty pages (modify is set, but there are no dirty pages), we perform a checkpoint
     * without any writes, no checkpoint is created, and then things get bad. While marking the
     * root page as dirty, we do not want to dirty the btree because we are marking the btree as
     * clean just after this call. Also, marking the btree dirty at this stage will unnecessarily
     * mark the connection as dirty, causing checkpoint-skip code to fail.
     */
    WT_ERR(__wt_page_modify_init(session, btree->root.page));
    __wt_page_only_modify_set(session, btree->root.page);

    /*
     * Clear the tree's modified flag; any changes before we clear the flag are guaranteed to be
     * part of this checkpoint (unless reconciliation skips updates for transactional reasons),
     * and changes subsequent to the checkpoint start, which might not be included, will re-set
     * the modified flag. The "unless reconciliation skips updates" problem is handled in the
     * reconciliation code: if reconciliation skips updates, it sets the modified flag itself.
     */
    btree->modified = false;
    WT_FULL_BARRIER();

    /* Tell logging that a file checkpoint is starting. */
    if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        WT_ERR(__wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_START, &ckptlsn));
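
    /*
     * A sketch of the ordering from here on (an illustrative summary of the code below):
     *
     *     1. Tell the block manager that a file checkpoint is starting.
     *     2. Write the tree: sync the file (checkpoint) or evict it (close).
     *     3. Update the object's metadata.
     *     4. Resolve the block manager's checkpoint, possibly deferred through metadata tracking.
     *     5. Tell logging that the checkpoint is complete.
     */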

    /* Tell the block manager that a file checkpoint is starting. */
    WT_ERR(bm->checkpoint_start(bm, session));
    resolve_bm = true;

    /* Flush the file from the cache, creating the checkpoint. */
    if (is_checkpoint)
        WT_ERR(__wt_sync_file(session, WT_SYNC_CHECKPOINT));
    else
        WT_ERR(__wt_evict_file(session, WT_SYNC_CLOSE));

fake:
    /*
     * If we're faking a checkpoint and logging is enabled, recovery should roll forward any
     * changes made between now and the next checkpoint, so set the checkpoint LSN to the
     * beginning of time.
     */
    if (fake_ckpt && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        WT_INIT_LSN(&ckptlsn);

    /*
     * Update the object's metadata.
     *
     * If the object is the metadata, the call to __wt_meta_ckptlist_set will update the turtle
     * file and swap the new one into place. We need to make sure the metadata is on disk before
     * the turtle file is updated.
     *
     * If we are doing a checkpoint in a file without a transaction (e.g., closing a dirty tree
     * before an exclusive operation like verify), the metadata update will be auto-committed. In
     * that case, we need to sync the file here or we could roll forward the metadata in recovery
     * and open a checkpoint that isn't yet durable.
     */
    if (WT_IS_METADATA(dhandle) || !F_ISSET(session->txn, WT_TXN_RUNNING))
        WT_ERR(__wt_checkpoint_sync(session, NULL));

    WT_ERR(__wt_meta_ckptlist_set(session, dhandle, btree->ckpt, &ckptlsn));

    /*
     * If we wrote a checkpoint (rather than faking one), we have to resolve it. Normally,
     * tracking is enabled and resolution deferred until transaction end. The exception is if the
     * handle is being discarded, in which case the handle will be gone by the time we try to
     * apply or unroll the meta tracking event.
     */
    if (!fake_ckpt) {
        resolve_bm = false;
        if (WT_META_TRACKING(session) && is_checkpoint)
            WT_ERR(__wt_meta_track_checkpoint(session));
        else
            WT_ERR(bm->checkpoint_resolve(bm, session, false));
    }

    /* Tell logging that the checkpoint is complete. */
    if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        WT_ERR(__wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_STOP, NULL));

err:
    /* Resolve the checkpoint for the block manager in the error path. */
    if (resolve_bm)
        WT_TRET(bm->checkpoint_resolve(bm, session, ret != 0));

    /*
     * If the checkpoint didn't complete successfully, make sure the tree is marked dirty.
     */
    if (ret != 0) {
        btree->modified = true;
        conn->modified = true;
    }

    /* For a successful checkpoint, post-process the ckptlist to keep a cached copy around. */
    if (ret != 0 || WT_IS_METADATA(session->dhandle) || F_ISSET(conn, WT_CONN_CLOSING))
        __wt_meta_saved_ckptlist_free(session);
    else {
        ret = __checkpoint_save_ckptlist(session, btree->ckpt);
        /* Discard the saved checkpoint list if processing the list did not work. */
        if (ret != 0)
            __wt_meta_saved_ckptlist_free(session);
    }

    return (ret);
}
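
/*
 * LSN summary for __checkpoint_tree above (illustrative only): a real checkpoint starts from
 * WT_MAX_LSN so that, with logging disabled, recovery never rolls old log records forward over
 * the non-logged changes; when logging is enabled, a real LSN overwrites it. A fake checkpoint
 * with logging enabled instead uses WT_INIT_LSN, the beginning of time, so recovery does roll
 * forward any changes made before the next real checkpoint.
 */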

/*
 * __checkpoint_presync --
 *     Visit all handles after the checkpoint writes are complete and before syncing. At this
 *     point, all trees should be completely open for business.
 */
static int
__checkpoint_presync(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BTREE *btree;

    WT_UNUSED(cfg);

    btree = S2BT(session);
    WT_ASSERT(session, btree->checkpoint_gen == __wt_gen(session, WT_GEN_CHECKPOINT));
    btree->evict_walk_period = btree->evict_walk_saved;
    return (0);
}

/*
 * __checkpoint_tree_helper --
 *     Checkpoint a tree (suitable for use in *_apply functions).
 */
static int
__checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_TXN *txn;
    bool with_timestamp;

    btree = S2BT(session);
    txn = session->txn;

    /* Are we using a read timestamp for this checkpoint transaction? */
    with_timestamp = F_ISSET(txn, WT_TXN_SHARED_TS_READ);

    /* Logged tables ignore any read timestamp configured for the checkpoint. */
    if (F_ISSET(btree, WT_BTREE_LOGGED))
        F_CLR(txn, WT_TXN_SHARED_TS_READ);

    ret = __checkpoint_tree(session, true, cfg);

    /* Restore the use of the timestamp for other tables. */
    if (with_timestamp)
        F_SET(txn, WT_TXN_SHARED_TS_READ);

    /*
     * Whatever happened, we aren't visiting this tree again in this checkpoint. Don't keep
     * updates pinned any longer.
     */
    __checkpoint_update_generation(session);

    /*
     * In case this tree was being skipped by the eviction server during the checkpoint, restore
     * the previous state.
     */
    btree->evict_walk_period = btree->evict_walk_saved;

    /*
     * Wake the eviction server, in case application threads have stalled while the eviction
     * server decided it couldn't make progress. Without this, application threads will be stalled
     * until the eviction server next wakes.
     */
    __wt_evict_server_wake(session);

    return (ret);
}

/*
 * __wt_checkpoint --
 *     Checkpoint a file.
 */
int
__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    bool force, standalone;

    /* Should not be called with a checkpoint handle. */
    WT_ASSERT(session, !WT_READING_CHECKPOINT(session));

    /* We must hold the metadata lock if checkpointing the metadata. */
    WT_ASSERT(session,
      !WT_IS_METADATA(session->dhandle) ||
        FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_METADATA));

    /* If we're already in a global checkpoint, don't get a new time. Otherwise, we need one. */
    standalone = session->current_ckpt_sec == 0;
    if (standalone)
        __txn_checkpoint_establish_time(session);

    WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
    force = cval.val != 0;
    WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg));
    if (ret != 0 || F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT))
        goto done;
    ret = __checkpoint_tree(session, true, cfg);

done:
    if (standalone)
        __txn_checkpoint_clear_time(session);

    /* Do not store the cached checkpoint list when checkpointing a single file alone. */
    __wt_meta_saved_ckptlist_free(session);

    return (ret);
}

/*
 * __wt_checkpoint_sync --
 *     Sync a file that has been checkpointed, and wait for the result.
 */
int
__wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BM *bm;

    WT_UNUSED(cfg);

    bm = S2BT(session)->bm;

    /* Should not be called with a checkpoint handle. */
    WT_ASSERT(session, !WT_READING_CHECKPOINT(session));

    /* Unnecessary if checkpoint_sync has been configured "off". */
    if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC))
        return (0);

    return (bm->sync(bm, session, true));
}
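
/*
 * For reference (illustrative only): the WT_CONN_CKPT_SYNC flag checked above reflects the
 * connection's "checkpoint_sync" configuration, e.g.
 *
 *     ret = wiredtiger_open(home, NULL, "create,checkpoint_sync=false", &conn);
 *
 * disables the sync call, trading checkpoint durability guarantees for fewer flushes.
 */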

/*
 * __wt_checkpoint_close --
 *     Checkpoint a single file as part of closing the handle.
 */
int
__wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    bool bulk, metadata, need_tracking;

    btree = S2BT(session);
    bulk = F_ISSET(btree, WT_BTREE_BULK);
    metadata = WT_IS_METADATA(session->dhandle);

    /*
     * We've done the final checkpoint before the final close; subsequent writes to normal objects
     * are wasted effort. Discard the objects to validate exit accounting.
     */
    if (final && !metadata)
        return (__wt_evict_file(session, WT_SYNC_DISCARD));

    /* Closing an unmodified file. */
    if (!btree->modified && !bulk)
        return (__wt_evict_file(session, WT_SYNC_DISCARD));

    /*
     * Don't flush data from modified trees independently of a system-wide checkpoint. Flushing
     * trees can lead to files that are inconsistent on disk after a crash.
     */
    if (btree->modified && !bulk && !metadata)
        return (__wt_set_return(session, EBUSY));

    /*
     * Make sure there isn't a potential race between backup copying the metadata and a checkpoint
     * changing the metadata. Backup holds both the checkpoint and schema locks. Checkpoint should
     * hold those also, except on the final checkpoint during close. Confirm the caller either is
     * the final checkpoint or holds at least one of the locks.
     */
    WT_ASSERT(session,
      final ||
        (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_CHECKPOINT) ||
          FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_SCHEMA)));

    /*
     * Turn on metadata tracking if:
     *  - The session is not already doing metadata tracking.
     *  - The file was not bulk loaded.
     *  - The close is not during connection close.
     */
    need_tracking = !WT_META_TRACKING(session) && !bulk && !final;

    if (need_tracking)
        WT_RET(__wt_meta_track_on(session));

    __txn_checkpoint_establish_time(session);

    WT_SAVE_DHANDLE(
      session, ret = __checkpoint_lock_dirty_tree(session, false, false, need_tracking, NULL));
    WT_ASSERT(session, ret == 0);
    if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT))
        ret = __checkpoint_tree(session, false, NULL);

    __txn_checkpoint_clear_time(session);

    /* Do not store the cached checkpoint list when closing the handle. */
    __wt_meta_saved_ckptlist_free(session);

    if (need_tracking)
        WT_TRET(__wt_meta_track_off(session, true, ret != 0));

    return (ret);
}

/*
 * __checkpoint_timing_stress --
 *     Optionally add a delay to a checkpoint to simulate a long-running checkpoint for debug
 *     purposes. The reason for this option is to find operations that can block while waiting
 *     for a checkpoint to complete.
 */
static void
__checkpoint_timing_stress(WT_SESSION_IMPL *session, uint64_t flag, struct timespec *tsp)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);

    /*
     * We only want to sleep if the flag is set and the checkpoint comes from the API, so check
     * that the session used is not either of the two sessions set aside for internal checkpoints.
     */
    if (conn->ckpt_session != session && conn->meta_ckpt_session != session &&
      FLD_ISSET(conn->timing_stress_flags, flag))
        __wt_sleep((uint64_t)tsp->tv_sec, (uint64_t)tsp->tv_nsec / WT_THOUSAND);
}
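
/*
 * For reference (illustrative only): the timing-stress flags checked above are driven by the
 * "timing_stress_for_test" configuration, e.g.
 *
 *     ret = wiredtiger_open(home, NULL,
 *       "create,timing_stress_for_test=[checkpoint_slow]", &conn);
 *
 * which makes application-initiated checkpoints sleep at stress points, making it easier to find
 * operations that block waiting for a checkpoint to complete.
 */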