/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/* Enable all recovery-related verbose messaging events. */
#define WT_VERB_RECOVERY_ALL        \
    WT_DECL_VERBOSE_MULTI_CATEGORY( \
      ((WT_VERBOSE_CATEGORY[]){WT_VERB_RECOVERY, WT_VERB_RECOVERY_PROGRESS}))

/* State maintained during recovery. */
typedef struct {
    const char *uri; /* File URI. */
    WT_CURSOR *c;    /* Cursor used for recovery. */
    WT_LSN ckpt_lsn; /* File's checkpoint LSN. */
} WT_RECOVERY_FILE;

typedef struct {
    WT_SESSION_IMPL *session;

    /* Files from the metadata, indexed by file ID. */
    WT_RECOVERY_FILE *files;
    size_t file_alloc; /* Allocated size of files array. */
    u_int max_fileid;  /* Maximum file ID seen. */
    u_int nfiles;      /* Number of files in the metadata. */

    WT_LSN ckpt_lsn;     /* Start LSN for main recovery loop. */
    WT_LSN max_ckpt_lsn; /* Maximum checkpoint LSN seen. */
    WT_LSN max_rec_lsn;  /* Maximum recovery LSN seen. */

    bool missing;       /* Were there missing files? */
    bool metadata_only; /*
                         * Set during the first recovery pass,
                         * when only the metadata is recovered.
                         */
} WT_RECOVERY;

/*
 * __recovery_cursor --
 *     Get a cursor for a recovery operation.
 */
static int
__recovery_cursor(
  WT_SESSION_IMPL *session, WT_RECOVERY *r, WT_LSN *lsnp, u_int id, bool duplicate, WT_CURSOR **cp)
{
    WT_CURSOR *c;
    const char *cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), "overwrite", NULL};
    bool metadata_op;

    c = NULL;

    /*
     * File ids with the bit set to ignore this operation are skipped.
     */
    if (WT_LOGOP_IS_IGNORED(id))
        return (0);
    /*
     * Metadata operations have an id of 0. Match operations based on the id and the current pass of
     * recovery for metadata.
     *
     * Only apply operations in the correct metadata phase, and if the LSN is more recent than the
     * last checkpoint. If there is no entry for a file, assume it was dropped or missing after a
     * hot backup.
     */
    metadata_op = id == WT_METAFILE_ID;
    if (r->metadata_only != metadata_op)
        ;
    else if (id >= r->nfiles || r->files[id].uri == NULL) {
        /* If a file is missing, output a verbose message once. */
        if (!r->missing)
            __wt_verbose(
              session, WT_VERB_RECOVERY, "No file found with ID %u (max %u)", id, r->nfiles);
        r->missing = true;
    } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) {
        /*
         * We're going to apply the operation. Get the cursor, opening one if none is cached.
         */
        if ((c = r->files[id].c) == NULL) {
            WT_RET(__wt_open_cursor(session, r->files[id].uri, NULL, cfg, &c));
            r->files[id].c = c;
        }
#ifndef WT_STANDALONE_BUILD
        /*
         * In the event of a clean shutdown, there shouldn't be any other table log records other
         * than metadata.
         */
        if (!metadata_op)
            S2C(session)->unclean_shutdown = true;
#endif
    }

    if (duplicate && c != NULL)
        WT_RET(__wt_open_cursor(session, r->files[id].uri, NULL, cfg, &c));

    *cp = c;
    return (0);
}

/*
 * Helper to a cursor if this operation is to be applied during recovery.
 */
#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp)                            \
    ret = __recovery_cursor(session, r, lsnp, fileid, false, cp);                    \
    __wt_verbose_debug2(session, WT_VERB_RECOVERY,                                   \
      "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 "/%" PRIu32,           \
      ret != 0 ? "Error" : cursor == NULL ? "Skipping" : "Applying", optype, fileid, \
      (lsnp)->l.file, (lsnp)->l.offset);                                             \
    WT_ERR(ret);                                                                     \
    if (cursor == NULL)                                                              \
    break

/*
 * __txn_op_apply --
 *     Apply a transactional operation during recovery.
 */
static int
__txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
    WT_CURSOR *cursor, *start, *stop;
    WT_DECL_RET;
    WT_ITEM key, start_key, stop_key, value;
    WT_SESSION_IMPL *session;
    wt_timestamp_t commit, durable, first_commit, prepare, read;
    uint64_t recno, start_recno, stop_recno, t_nsec, t_sec;
    uint32_t fileid, mode, optype, opsize;

    session = r->session;
    cursor = NULL;

    /* Peek at the size and the type. */
    WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
    end = *pp + opsize;

    /*
     * If it is an operation type that should be ignored, we're done. Note that file ids within
     * known operations also use the same macros to indicate that operation should be ignored.
     */
    if (WT_LOGOP_IS_IGNORED(optype)) {
        *pp += opsize;
        goto done;
    }

    switch (optype) {
    case WT_LOGOP_COL_MODIFY:
        WT_ERR(__wt_logop_col_modify_unpack(session, pp, end, &fileid, &recno, &value));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        cursor->set_key(cursor, recno);
        if ((ret = cursor->search(cursor)) != 0)
            WT_ERR_NOTFOUND_OK(ret, false);
        else {
            /*
             * Build/insert a complete value during recovery rather than using cursor modify to
             * create a partial update (for no particular reason than simplicity).
             */
            WT_ERR(__wt_modify_apply_item(
              CUR2S(cursor), cursor->value_format, &cursor->value, value.data));
            WT_ERR(cursor->insert(cursor));
        }
        break;

    case WT_LOGOP_COL_PUT:
        WT_ERR(__wt_logop_col_put_unpack(session, pp, end, &fileid, &recno, &value));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        cursor->set_key(cursor, recno);
        __wt_cursor_set_raw_value(cursor, &value);
        WT_ERR(cursor->insert(cursor));
        break;

    case WT_LOGOP_COL_REMOVE:
        WT_ERR(__wt_logop_col_remove_unpack(session, pp, end, &fileid, &recno));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        cursor->set_key(cursor, recno);
        /*
         * WT_NOTFOUND is an expected error because the checkpoint snapshot we're rolling forward
         * may race with a remove, resulting in the key not being in the tree, but recovery still
         * processing the log record of the remove.
         */
        WT_ERR_NOTFOUND_OK(cursor->remove(cursor), false);
        break;

    case WT_LOGOP_COL_TRUNCATE:
        WT_ERR(
          __wt_logop_col_truncate_unpack(session, pp, end, &fileid, &start_recno, &stop_recno));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);

        /* Set up the cursors. */
        start = stop = NULL;
        if (start_recno != WT_RECNO_OOB)
            start = cursor;

        if (stop_recno != WT_RECNO_OOB) {
            if (start != NULL)
                WT_ERR(__recovery_cursor(session, r, lsnp, fileid, true, &stop));
            else
                stop = cursor;
        }

        /* Set the keys. */
        if (start != NULL)
            start->set_key(start, start_recno);
        if (stop != NULL)
            stop->set_key(stop, stop_recno);

        /*
         * If the truncate log doesn't have a recorded start and stop recno, truncate the whole file
         * using the URI. Otherwise use the positioned start or stop cursors to truncate a range of
         * the file.
         */
        if (start == NULL && stop == NULL)
            WT_TRET(
              session->iface.truncate(&session->iface, r->files[fileid].uri, NULL, NULL, NULL));
        else
            WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL));

        /* If we opened a duplicate cursor, close it now. */
        if (stop != NULL && stop != cursor)
            WT_TRET(stop->close(stop));
        WT_ERR(ret);
        break;

    case WT_LOGOP_ROW_MODIFY:
        WT_ERR(__wt_logop_row_modify_unpack(session, pp, end, &fileid, &key, &value));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        __wt_cursor_set_raw_key(cursor, &key);
        if ((ret = cursor->search(cursor)) != 0)
            WT_ERR_NOTFOUND_OK(ret, false);
        else {
            /*
             * Build/insert a complete value during recovery rather than using cursor modify to
             * create a partial update (for no particular reason than simplicity).
             */
            WT_ERR(__wt_modify_apply_item(
              CUR2S(cursor), cursor->value_format, &cursor->value, value.data));
            WT_ERR(cursor->insert(cursor));
        }
        break;

    case WT_LOGOP_ROW_PUT:
        WT_ERR(__wt_logop_row_put_unpack(session, pp, end, &fileid, &key, &value));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        __wt_cursor_set_raw_key(cursor, &key);
        __wt_cursor_set_raw_value(cursor, &value);
        WT_ERR(cursor->insert(cursor));
        break;

    case WT_LOGOP_ROW_REMOVE:
        WT_ERR(__wt_logop_row_remove_unpack(session, pp, end, &fileid, &key));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        __wt_cursor_set_raw_key(cursor, &key);
        /*
         * WT_NOTFOUND is an expected error because the checkpoint snapshot we're rolling forward
         * may race with a remove, resulting in the key not being in the tree, but recovery still
         * processing the log record of the remove.
         */
        WT_ERR_NOTFOUND_OK(cursor->remove(cursor), false);
        break;

    case WT_LOGOP_ROW_TRUNCATE:
        WT_ERR(
          __wt_logop_row_truncate_unpack(session, pp, end, &fileid, &start_key, &stop_key, &mode));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        /* Set up the cursors. */
        start = stop = NULL;
        switch (mode) {
        case WT_TXN_TRUNC_ALL:
            /* Both cursors stay NULL. */
            break;
        case WT_TXN_TRUNC_BOTH:
            start = cursor;
            WT_ERR(__recovery_cursor(session, r, lsnp, fileid, true, &stop));
            break;
        case WT_TXN_TRUNC_START:
            start = cursor;
            break;
        case WT_TXN_TRUNC_STOP:
            stop = cursor;
            break;
        default:
            WT_ERR(__wt_illegal_value(session, mode));
        }

        /* Set the keys. */
        if (start != NULL)
            __wt_cursor_set_raw_key(start, &start_key);
        if (stop != NULL)
            __wt_cursor_set_raw_key(stop, &stop_key);

        /*
         * If the truncate log doesn't have a recorded start and stop key, truncate the whole file
         * using the URI. Otherwise use the positioned start or stop cursors to truncate a range of
         * the file.
         */
        if (start == NULL && stop == NULL)
            WT_TRET(
              session->iface.truncate(&session->iface, r->files[fileid].uri, NULL, NULL, NULL));
        else
            WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL));

        /* If we opened a duplicate cursor, close it now. */
        if (stop != NULL && stop != cursor)
            WT_TRET(stop->close(stop));
        WT_ERR(ret);
        break;
    case WT_LOGOP_TXN_TIMESTAMP:
        /*
         * Timestamp records are informational only. We have to unpack it to properly move forward
         * in the log record to the next operation, but otherwise ignore.
         */
        WT_ERR(__wt_logop_txn_timestamp_unpack(
          session, pp, end, &t_sec, &t_nsec, &commit, &durable, &first_commit, &prepare, &read));
        break;
    default:
        WT_ERR(__wt_illegal_value(session, optype));
    }

done:
    /* Reset the cursor so it doesn't block eviction. */
    if (cursor != NULL)
        WT_ERR(cursor->reset(cursor));
    return (0);

err:
    __wt_err(session, ret,
      "operation apply failed during recovery: operation type %" PRIu32 " at LSN %" PRIu32
      "/%" PRIu32,
      optype, lsnp->l.file, lsnp->l.offset);
    return (ret);
}

/*
 * __txn_commit_apply --
 *     Apply a commit record during recovery.
 */
static int
__txn_commit_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
    /* The logging subsystem zero-pads records. */
    while (*pp < end && **pp)
        WT_RET(__txn_op_apply(r, lsnp, pp, end));

    return (0);
}

/*
 * __txn_log_recover --
 *     Roll the log forward to recover committed changes.
 */
static int
__txn_log_recover(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, WT_LSN *next_lsnp,
  void *cookie, int firstrecord)
{
    WT_DECL_RET;
    WT_RECOVERY *r;
    uint64_t txnid_unused;
    uint32_t rectype;
    const uint8_t *end, *p;

    r = cookie;
    p = WT_LOG_SKIP_HEADER(logrec->data);
    end = (const uint8_t *)logrec->data + logrec->size;
    WT_UNUSED(firstrecord);

    /* First, peek at the log record type. */
    WT_RET(__wt_logrec_read(session, &p, end, &rectype));

    /*
     * Record the highest LSN we process during the metadata phase. If not the metadata phase, then
     * stop at that LSN.
     */
    if (r->metadata_only)
        WT_ASSIGN_LSN(&r->max_rec_lsn, next_lsnp);
    else if (__wt_log_cmp(lsnp, &r->max_rec_lsn) >= 0)
        return (0);

    switch (rectype) {
    case WT_LOGREC_CHECKPOINT:
        if (r->metadata_only)
            WT_RET(__wt_txn_checkpoint_logread(session, &p, end, &r->ckpt_lsn));
        break;

    case WT_LOGREC_COMMIT:
        if ((ret = __wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid_unused)) != 0)
            WT_RET_MSG(session, ret, "txn_log_recover: unpack failure");
        WT_RET(__txn_commit_apply(r, lsnp, &p, end));
        break;
    }

    return (0);
}

/*
 * __recovery_set_checkpoint_timestamp --
 *     Set the checkpoint timestamp as retrieved from the metadata file.
 */
static int
__recovery_set_checkpoint_timestamp(WT_RECOVERY *r)
{
    WT_CONNECTION_IMPL *conn;
    WT_SESSION_IMPL *session;
    wt_timestamp_t ckpt_timestamp;
    char ts_string[WT_TS_INT_STRING_SIZE];

    session = r->session;
    conn = S2C(session);

    /*
     * Read the system checkpoint information from the metadata file and save the stable timestamp
     * of the last checkpoint for later query. This gets saved in the connection.
     */
    WT_RET(__wt_meta_read_checkpoint_timestamp(r->session, NULL, &ckpt_timestamp, NULL));

    /*
     * Set the recovery checkpoint timestamp and the metadata checkpoint timestamp so that the
     * checkpoint after recovery writes the correct value into the metadata.
     */
    conn->txn_global.meta_ckpt_timestamp = conn->txn_global.recovery_timestamp = ckpt_timestamp;

    __wt_verbose_multi(session, WT_VERB_RECOVERY_ALL, "Set global recovery timestamp: %s",
      __wt_timestamp_to_string(conn->txn_global.recovery_timestamp, ts_string));

    return (0);
}

/*
 * __recovery_set_oldest_timestamp --
 *     Set the oldest timestamp as retrieved from the metadata file. Setting the oldest timestamp
 *     doesn't automatically set the pinned timestamp.
 */
static int
__recovery_set_oldest_timestamp(WT_RECOVERY *r)
{
    WT_CONNECTION_IMPL *conn;
    WT_SESSION_IMPL *session;
    wt_timestamp_t oldest_timestamp;
    char ts_string[WT_TS_INT_STRING_SIZE];

    session = r->session;
    conn = S2C(session);
    /*
     * Read the system checkpoint information from the metadata file and save the oldest timestamp
     * of the last checkpoint for later query. This gets saved in the connection.
     */
    WT_RET(__wt_meta_read_checkpoint_oldest(r->session, NULL, &oldest_timestamp, NULL));
    conn->txn_global.oldest_timestamp = oldest_timestamp;
    conn->txn_global.has_oldest_timestamp = oldest_timestamp != WT_TS_NONE;

    __wt_verbose_multi(session, WT_VERB_RECOVERY_ALL, "Set global oldest timestamp: %s",
      __wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string));

    return (0);
}

/*
 * __recovery_set_checkpoint_snapshot --
 *     Set the checkpoint snapshot details as retrieved from the metadata file.
 */
static int
__recovery_set_checkpoint_snapshot(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);

    /*
     * WiredTiger versions 10.0.1 onward have a valid checkpoint snapshot on-disk. There was a bug
     * in some versions of WiredTiger that are tagged with the 10.0.0 release, which saved the wrong
     * checkpoint snapshot (see WT-8395), so we ignore the snapshot when it was created with one of
     * those versions. Versions of WiredTiger prior to 10.0.0 never saved a checkpoint snapshot.
     * Additionally the turtle file doesn't always exist (for example, backup doesn't include the
     * turtle file), so there isn't always a WiredTiger version available. If there is no version
     * available, assume that the snapshot is valid, otherwise restoring from a backup won't work.
     */
    if (__wt_version_defined(conn->recovery_version) &&
      __wt_version_lte(conn->recovery_version, (WT_VERSION){10, 0, 0})) {
        /* Return an empty snapshot. */
        conn->recovery_ckpt_snap_min = WT_TXN_NONE;
        conn->recovery_ckpt_snap_max = WT_TXN_NONE;
        conn->recovery_ckpt_snapshot = NULL;
        conn->recovery_ckpt_snapshot_count = 0;
        return (0);
    }

    /*
     * Read the system checkpoint information from the metadata file and save the snapshot related
     * details of the last checkpoint in the connection for later query.
     */
    return (__wt_meta_read_checkpoint_snapshot(session, NULL, NULL, &conn->recovery_ckpt_snap_min,
      &conn->recovery_ckpt_snap_max, &conn->recovery_ckpt_snapshot,
      &conn->recovery_ckpt_snapshot_count, NULL));
}

/*
 * __recovery_set_ckpt_base_write_gen --
 *     Set the base write gen as retrieved from the metadata file.
 */
static int
__recovery_set_ckpt_base_write_gen(WT_RECOVERY *r)
{
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    char *sys_config;

    sys_config = NULL;
    session = r->session;

    /* Search the metadata for checkpoint base write gen information. */
    WT_ERR_NOTFOUND_OK(
      __wt_metadata_search(session, WT_SYSTEM_BASE_WRITE_GEN_URI, &sys_config), false);
    if (sys_config != NULL) {
        WT_CLEAR(cval);
        WT_ERR(__wt_config_getones(session, sys_config, WT_SYSTEM_BASE_WRITE_GEN, &cval));
        if (cval.len != 0)
            S2C(session)->last_ckpt_base_write_gen = (uint64_t)cval.val;
    }

err:
    __wt_free(session, sys_config);
    return (ret);
}

/*
 * __recovery_txn_setup_initial_state --
 *     Setup the transaction initial state required by rollback to stable.
 */
static int
__recovery_txn_setup_initial_state(WT_SESSION_IMPL *session, WT_RECOVERY *r)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);

    WT_RET(__recovery_set_checkpoint_snapshot(session));

    /*
     * Set the checkpoint timestamp and oldest timestamp retrieved from the checkpoint metadata.
     * These are the stable timestamp and oldest timestamps of the last successful checkpoint.
     */
    WT_RET(__recovery_set_checkpoint_timestamp(r));
    WT_RET(__recovery_set_oldest_timestamp(r));

    /*
     * Now that timestamps extracted from the checkpoint metadata have been configured, configure
     * the pinned timestamp.
     */
    __wt_txn_update_pinned_timestamp(session, true);

    WT_ASSERT(session,
      conn->txn_global.has_stable_timestamp == false &&
        conn->txn_global.stable_timestamp == WT_TS_NONE);

    /* Set the stable timestamp from recovery timestamp. */
    conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp;
    if (conn->txn_global.stable_timestamp != WT_TS_NONE)
        conn->txn_global.has_stable_timestamp = true;

    return (0);
}

/*
 * __recovery_setup_file --
 *     Set up the recovery slot for a file, track the largest file ID, and update the base write gen
 *     based on the file's configuration.
 */
static int
__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
{
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    WT_LSN lsn;
    uint32_t fileid, lsnfile, lsnoffset;

    WT_RET(__wt_config_getones(r->session, config, "id", &cval));
    fileid = (uint32_t)cval.val;

    /* Track the largest file ID we have seen. */
    if (fileid > r->max_fileid)
        r->max_fileid = fileid;

    if (r->nfiles <= fileid) {
        WT_RET(__wt_realloc_def(r->session, &r->file_alloc, fileid + 1, &r->files));
        r->nfiles = fileid + 1;
    }

    if (r->files[fileid].uri != NULL)
        WT_RET_PANIC(r->session, WT_PANIC,
          "metadata corruption: files %s and %s have the same file ID %u", uri,
          r->files[fileid].uri, fileid);
    WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
    if ((ret = __wt_config_getones(r->session, config, "checkpoint_lsn", &cval)) != 0)
        WT_RET_MSG(
          r->session, ret, "Failed recovery setup for %s: cannot parse config '%s'", uri, config);
    /* If there is no checkpoint logged for the file, apply everything. */
    if (cval.type != WT_CONFIG_ITEM_STRUCT)
        WT_INIT_LSN(&lsn);
    /* NOLINTNEXTLINE(cert-err34-c) */
    else if (sscanf(cval.str, "(%" SCNu32 ",%" SCNu32 ")", &lsnfile, &lsnoffset) == 2)
        WT_SET_LSN(&lsn, lsnfile, lsnoffset);
    else
        WT_RET_MSG(r->session, EINVAL,
          "Failed recovery setup for %s: cannot parse checkpoint LSN '%.*s'", uri, (int)cval.len,
          cval.str);
    WT_ASSIGN_LSN(&r->files[fileid].ckpt_lsn, &lsn);

    __wt_verbose(r->session, WT_VERB_RECOVERY,
      "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")", uri, fileid, lsn.l.file,
      lsn.l.offset);

    if ((!WT_IS_MAX_LSN(&lsn) && !WT_IS_INIT_LSN(&lsn)) &&
      (WT_IS_MAX_LSN(&r->max_ckpt_lsn) || __wt_log_cmp(&lsn, &r->max_ckpt_lsn) > 0))
        WT_ASSIGN_LSN(&r->max_ckpt_lsn, &lsn);

    /* Update the base write gen and most recent checkpoint based on this file's configuration. */
    if ((ret = __wt_metadata_update_connection(r->session, config)) != 0)
        WT_RET_MSG(r->session, ret, "Failed recovery setup for %s: cannot update write gen", uri);
    return (0);
}

/*
 * __recovery_close_cursors --
 *     Close the logging recovery cursors.
 */
static int
__recovery_close_cursors(WT_RECOVERY *r)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    u_int i;

    session = r->session;
    for (i = 0; i < r->nfiles; i++) {
        __wt_free(session, r->files[i].uri);
        if ((c = r->files[i].c) != NULL)
            WT_TRET(c->close(c));
    }

    r->nfiles = 0;
    __wt_free(session, r->files);
    return (ret);
}

/*
 * __recovery_file_scan_prefix --
 *     Scan the files matching the prefix referenced from the metadata and gather information about
 *     them for recovery.
 */
static int
__recovery_file_scan_prefix(WT_RECOVERY *r, const char *prefix, const char *ignore_suffix)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    int cmp;
    const char *uri, *config;

    /* Scan through all entries in the metadata matching the prefix. */
    c = r->files[0].c;
    c->set_key(c, prefix);
    if ((ret = c->search_near(c, &cmp)) != 0) {
        /* Is the metadata empty? */
        WT_RET_NOTFOUND_OK(ret);
        return (0);
    }
    if (cmp < 0 && (ret = c->next(c)) != 0) {
        /* No matching entries? */
        WT_RET_NOTFOUND_OK(ret);
        return (0);
    }
    for (; ret == 0; ret = c->next(c)) {
        WT_RET(c->get_key(c, &uri));
        if (!WT_PREFIX_MATCH(uri, prefix))
            break;
        if (ignore_suffix != NULL && WT_SUFFIX_MATCH(uri, ignore_suffix))
            continue;
        WT_RET(c->get_value(c, &config));
        WT_RET(__recovery_setup_file(r, uri, config));
    }
    WT_RET_NOTFOUND_OK(ret);
    return (0);
}

/*
 * __recovery_file_scan --
 *     Scan the files referenced from the metadata and gather information about them for recovery.
 */
static int
__recovery_file_scan(WT_RECOVERY *r)
{
    /* Scan through all files and tiered entries in the metadata. */
    WT_RET(__recovery_file_scan_prefix(r, "file:", ".wtobj"));
    WT_RET(__recovery_file_scan_prefix(r, "tiered:", NULL));

    /*
     * Set the connection level file id tracker, as such upon creation of a new file we'll begin
     * from the latest file id.
     */
    S2C(r->session)->next_file_id = r->max_fileid;

    return (0);
}

/*
 * __hs_exists --
 *     Check whether the history store exists. This function looks for both the history store URI in
 *     the metadata file and for the history store data file itself. If we're running salvage, we'll
 *     attempt to salvage the history store here.
 */
static int
__hs_exists(WT_SESSION_IMPL *session, WT_CURSOR *metac, const char *cfg[], bool *hs_exists)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION *wt_session;

    conn = S2C(session);

    /*
     * We should check whether the history store file exists in the metadata or not. If it does not,
     * then we should skip rollback to stable for each table. This might happen if we're upgrading
     * from an older version. If it does exist in the metadata we should check that it exists on
     * disk to confirm that it wasn't deleted between runs.
     *
     * This needs to happen after we apply the logs as they may contain the metadata changes which
     * include the history store creation. As such the on disk metadata file won't contain the
     * history store but will after log application.
     */
    metac->set_key(metac, WT_HS_URI);
    WT_ERR_NOTFOUND_OK(metac->search(metac), true);
    if (ret == WT_NOTFOUND) {
        *hs_exists = false;
        ret = 0;
    } else {
        /* Given the history store exists in the metadata validate whether it exists on disk. */
        WT_ERR(__wt_fs_exist(session, WT_HS_FILE, hs_exists));
        if (*hs_exists) {
            /*
             * Attempt to configure the history store, this will detect corruption if it fails.
             */
            ret = __wt_hs_config(session, cfg);
            if (ret != 0) {
                if (F_ISSET(conn, WT_CONN_SALVAGE)) {
                    wt_session = &session->iface;
                    WT_ERR(wt_session->salvage(wt_session, WT_HS_URI, NULL));
                } else
                    WT_ERR(ret);
            }
        } else {
            /*
             * We're attempting to salvage the database with a missing history store, remove it from
             * the metadata and pretend it never existed. As such we won't run rollback to stable
             * later.
             */
            if (F_ISSET(conn, WT_CONN_SALVAGE)) {
                *hs_exists = false;
                metac->remove(metac);
            } else
                /* The history store file has likely been deleted, we cannot recover from this. */
                WT_ERR_MSG(session, WT_TRY_SALVAGE, "%s file is corrupted or missing", WT_HS_FILE);
        }
    }
err:
    /* Unpin the page from cache. */
    WT_TRET(metac->reset(metac));
    return (ret);
}

/*
 * __wt_txn_recover --
 *     Run recovery.
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_CURSOR *metac;
    WT_DECL_RET;
    WT_RECOVERY r;
    WT_RECOVERY_FILE *metafile;
    wt_off_t hs_size;
    char *config;
    char ts_string[2][WT_TS_INT_STRING_SIZE];
    bool do_checkpoint, eviction_started, hs_exists, needs_rec, was_backup;
    bool rts_executed;

    conn = S2C(session);
    F_SET(conn, WT_CONN_RECOVERING);

    WT_CLEAR(r);
    WT_INIT_LSN(&r.ckpt_lsn);
    config = NULL;
    do_checkpoint = hs_exists = true;
    rts_executed = false;
    eviction_started = false;
    was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);

    /* We need a real session for recovery. */
    WT_RET(__wt_open_internal_session(conn, "txn-recover", false, 0, 0, &session));
    r.session = session;
    WT_MAX_LSN(&r.max_ckpt_lsn);
    WT_MAX_LSN(&r.max_rec_lsn);
    conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = WT_TS_NONE;

    WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
    WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
    WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
    metafile = &r.files[WT_METAFILE_ID];
    metafile->c = metac;

    WT_ERR(__recovery_set_ckpt_base_write_gen(&r));

    /*
     * If no log was found (including if logging is disabled), or if the last checkpoint was done
     * with logging disabled, recovery should not run. Scan the metadata to figure out the largest
     * file ID.
     */
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
        /*
         * Detect if we're going from logging disabled to enabled. We need to know this to verify
         * LSNs and start at the correct log file later. If someone ran with logging, then disabled
         * it and removed all the log files and then turned logging back on, we have to start
         * logging in the log file number that is larger than any checkpoint LSN we have from the
         * earlier time.
         */
        WT_ERR(__recovery_file_scan(&r));
        /*
         * The array can be re-allocated in recovery_file_scan. Reset our pointer after scanning all
         * the files.
         */
        metafile = &r.files[WT_METAFILE_ID];

        if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && WT_IS_MAX_LSN(&metafile->ckpt_lsn) &&
          !WT_IS_MAX_LSN(&r.max_ckpt_lsn))
            WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file));
        else
            do_checkpoint = false;
        WT_ERR(__hs_exists(session, metac, cfg, &hs_exists));
        goto done;
    }

    /*
     * First, do a pass through the log to recover the metadata, and establish the last checkpoint
     * LSN. Skip this when opening a hot backup: we already have the correct metadata in that case.
     *
     * If we're running with salvage and we hit an error, we ignore it and continue. In salvage we
     * want to recover whatever part of the data we can from the last checkpoint up until whatever
     * problem we detect in the log file. In salvage, we ignore errors from scanning the log so
     * recovery can continue. Other errors remain errors.
     */
    if (!was_backup) {
        r.metadata_only = true;
        /*
         * If this is a read-only connection, check if the checkpoint LSN in the metadata file is up
         * to date, indicating a clean shutdown.
         */
        if (F_ISSET(conn, WT_CONN_READONLY)) {
            WT_ERR(__wt_log_needs_recovery(session, &metafile->ckpt_lsn, &needs_rec));
            if (needs_rec)
                WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery");
        }
        if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
            ret = __wt_log_scan(session, NULL, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r);
        else {
            /*
             * Start at the last checkpoint LSN referenced in the metadata. If we see the end of a
             * checkpoint while scanning, we will change the full scan to start from there.
             */
            WT_ASSIGN_LSN(&r.ckpt_lsn, &metafile->ckpt_lsn);
            ret = __wt_log_scan(session, &metafile->ckpt_lsn, NULL, WT_LOGSCAN_RECOVER_METADATA,
              __txn_log_recover, &r);
        }
        if (F_ISSET(conn, WT_CONN_SALVAGE))
            ret = 0;
        /*
         * If log scan couldn't find a file we expected to be around, this indicates a corruption of
         * some sort.
         */
        if (ret == ENOENT) {
            F_SET(conn, WT_CONN_DATA_CORRUPTION);
            ret = WT_ERROR;
        }

        WT_ERR(ret);
    }

    /* Scan the metadata to find the live files and their IDs. */
    WT_ERR(__recovery_file_scan(&r));

    /*
     * Check whether the history store exists.
     *
     * This will open a dhandle on the history store and initialize its write gen so we must ensure
     * that the connection-wide base write generation is stable at this point. Performing a recovery
     * file scan will involve updating the connection-wide base write generation so we MUST do this
     * before checking for the existence of a history store file.
     */
    WT_ERR(__hs_exists(session, metac, cfg, &hs_exists));

    /*
     * Clear this out. We no longer need it and it could have been re-allocated when scanning the
     * files.
     */
    WT_NOT_READ(metafile, NULL);

    /*
     * We no longer need the metadata cursor: close it to avoid pinning any resources that could
     * block eviction during recovery.
     */
    r.files[0].c = NULL;
    WT_ERR(metac->close(metac));

    /*
     * Now, recover all the files apart from the metadata. Pass WT_LOGSCAN_RECOVER so that old logs
     * get truncated.
     */
    r.metadata_only = false;
    __wt_verbose_multi(session, WT_VERB_RECOVERY_ALL,
      "Main recovery loop: starting at %" PRIu32 "/%" PRIu32 " to %" PRIu32 "/%" PRIu32,
      r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset, r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset);
    WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
    /*
     * Check if the database was shut down cleanly. If not return an error if the user does not want
     * automatic recovery.
     */
    if (needs_rec &&
      (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || F_ISSET(conn, WT_CONN_READONLY))) {
        if (F_ISSET(conn, WT_CONN_READONLY))
            WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery");
        WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery");
    }

    if (F_ISSET(conn, WT_CONN_READONLY)) {
        do_checkpoint = false;
        goto done;
    }

    if (!hs_exists) {
        __wt_verbose_multi(session, WT_VERB_RECOVERY_ALL, "%s",
          "Creating the history store before applying log records. Likely recovering after an"
          "unclean shutdown on an earlier version");
        /*
         * Create the history store as we might need it while applying log records in recovery.
         */
        WT_ERR(__wt_hs_open(session, cfg));
    }

    /*
     * Recovery can touch more data than fits in cache, so it relies on regular eviction to manage
     * paging. Start eviction threads for recovery without history store cursors.
     */
    WT_ERR(__wt_evict_create(session));
    eviction_started = true;

    /*
     * Always run recovery even if it was a clean shutdown only if this is not a read-only
     * connection. We can consider skipping it in the future.
     */
    if (needs_rec)
        FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
    if (WT_IS_INIT_LSN(&r.ckpt_lsn))
        ret = __wt_log_scan(
          session, NULL, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
    else
        ret = __wt_log_scan(session, &r.ckpt_lsn, NULL, WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
    if (F_ISSET(conn, WT_CONN_SALVAGE))
        ret = 0;
    WT_ERR(ret);

done:
    /* Close cached cursors, rollback-to-stable asserts exclusive access. */
    WT_ERR(__recovery_close_cursors(&r));
#ifndef WT_STANDALONE_BUILD
    /*
     * There is a known problem with upgrading from release 10.0.0 specifically. There are now fixes
     * that can properly upgrade from 10.0.0 without hitting the problem but only from a clean
     * shutdown of 10.0.0. Earlier releases are not affected by the upgrade issue.
     */
    if (conn->unclean_shutdown && __wt_version_eq(conn->recovery_version, (WT_VERSION){10, 0, 0}))
        WT_ERR_MSG(session, WT_ERROR,
          "Upgrading from a WiredTiger version 10.0.0 database that was not shutdown cleanly is "
          "not allowed. Perform a clean shutdown on version 10.0.0 and then upgrade.");
#endif
    WT_ERR(__recovery_txn_setup_initial_state(session, &r));

    /*
     * Set the history store file size as it may already exist after a restart.
     */
    if (hs_exists) {
        WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size));
        WT_STAT_CONN_SET(session, cache_hs_ondisk, hs_size);
    }

    /*
     * Perform rollback to stable only when the following conditions met.
     * 1. The connection is not read-only. A read-only connection expects that there shouldn't be
     *    any changes that need to be done on the database other than reading.
     * 2. The history store file was found in the metadata.
     */
    if (hs_exists && !F_ISSET(conn, WT_CONN_READONLY)) {
        /* Start the eviction threads for rollback to stable if not already started. */
        if (!eviction_started) {
            WT_ERR(__wt_evict_create(session));
            eviction_started = true;
        }

        __wt_verbose_multi(session,
          WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RECOVERY, WT_VERB_RTS})),
          "[RECOVERY_RTS] performing recovery rollback_to_stable with stable_timestamp=%s and "
          "oldest_timestamp=%s",
          __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string[0]),
          __wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string[1]));
        rts_executed = true;
        WT_ERR(conn->rts->rollback_to_stable(session, NULL, true));
    }

    /*
     * Sometimes eviction is triggered after doing a checkpoint. However, we don't want eviction to
     * make the tree dirty after checkpoint as this will interfere with WT_SESSION alter which
     * expects a clean tree.
     */
    if (eviction_started)
        WT_TRET(__wt_evict_destroy(session));

    if (do_checkpoint || rts_executed)
        /*
         * Forcibly log a checkpoint so the next open is fast and keep the metadata up to date with
         * the checkpoint LSN and removal.
         */
        WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

    /* Remove any backup file now that metadata has been synced. */
    WT_ERR(__wt_backup_file_remove(session));

    /*
     * Update the open dhandles write generations and base write generation with the connection's
     * base write generation because the recovery checkpoint writes the pages to disk with new write
     * generation number which contains transaction ids that are needed to reset later. The
     * connection level base write generation number is updated at the end of the recovery
     * checkpoint.
     */
    WT_ERR(__wt_dhandle_update_write_gens(session));

    /*
     * If we're downgrading and have newer log files, force log removal, no matter what the remove
     * setting is.
     */
    if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE))
        WT_ERR(__wt_log_truncate_files(session, NULL, true));
    FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);

err:
    WT_TRET(__recovery_close_cursors(&r));
    __wt_free(session, config);
    FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);

    if (ret != 0) {
        FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED);
        __wt_err(session, ret, "Recovery failed");
    }

    /*
     * Destroy the eviction threads that were started in support of recovery. They will be restarted
     * once the history store table is created.
     */
    if (eviction_started)
        WT_TRET(__wt_evict_destroy(session));

    WT_TRET(__wt_session_close_internal(session));
    F_SET(conn, WT_CONN_RECOVERY_COMPLETE);
    F_CLR(conn, WT_CONN_RECOVERING);

    return (ret);
}