/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/* Forward declarations for the file-local helpers that are defined later in this file. */
static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
static int __btree_get_last_recno(WT_SESSION_IMPL *);
static int __btree_page_sizes(WT_SESSION_IMPL *);
static int __btree_preload(WT_SESSION_IMPL *);
static int __btree_tree_open_empty(WT_SESSION_IMPL *, bool);

/*
 * __btree_clear --
 *     Release what a closed Btree still holds; called on handle discard and before a re-open.
 *     Returns the error from the collator's terminate call, if any, otherwise zero.
 */
static int
__btree_clear(WT_SESSION_IMPL *session)
{
    WT_BTREE *bt;
    WT_DECL_RET;

    bt = S2BT(session);

    /* Only a tree flagged as closed has been through an open/close cycle and needs cleanup. */
    if (F_ISSET(bt, WT_BTREE_CLOSED)) {
        /* Shut down the Huffman tree. */
        __wt_btree_huffman_close(session);

        /* If the tree owns its collator and the collator has a terminate method, call it. */
        if (bt->collator_owned && bt->collator->terminate != NULL)
            WT_TRET(bt->collator->terminate(bt->collator, &session->iface));

        /* Tear down the locks. */
        __wt_rwlock_destroy(session, &bt->ovfl_lock);
        __wt_spin_destroy(session, &bt->flush_lock);

        /* Release the copied format strings. */
        __wt_free(session, bt->key_format);
        __wt_free(session, bt->value_format);
    }

    return (ret);
}

/*
 * __wt_btree_open --
 *     Open a Btree: reset the handle's WT_BTREE structure, configure it from the data handle's
 *     configuration and checkpoint information, connect to the block manager and, unless this is
 *     a salvage, upgrade or verify operation, load the checkpoint's tree (or an empty tree).
 *
 *     The op_cfg argument is only consulted for the salvage "force" setting.
 *
 *     Returns zero on success. If a failure happens after the checkpoint information has been
 *     read, the tree is closed again before the error is returned.
 */
int
__wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_CKPT ckpt;
    WT_CONFIG_ITEM cval;
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;
    size_t root_addr_size;
    uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
    bool creation, forced_salvage;

    btree = S2BT(session);
    dhandle = session->dhandle;

    /*
     * This may be a re-open, clean up the btree structure. Clear the fields that don't persist
     * across a re-open. Clear all flags other than the operation flags (which are set by the
     * connection handle software that called us).
     */
    WT_RET(__btree_clear(session));
    memset(btree, 0, WT_BTREE_CLEAR_SIZE);
    F_CLR(btree, ~WT_BTREE_SPECIAL_FLAGS);

    /* Set the data handle first, our called functions reasonably use it. */
    btree->dhandle = dhandle;

    /* Checkpoint and verify files are readonly. */
    if (dhandle->checkpoint != NULL || F_ISSET(btree, WT_BTREE_VERIFY) ||
      F_ISSET(S2C(session), WT_CONN_READONLY))
        F_SET(btree, WT_BTREE_READONLY);

    /*
     * Get the checkpoint information for this name/checkpoint pair. Failures up to and including
     * this call return directly; every failure after it goes through the error label so the
     * checkpoint information is freed.
     */
    WT_RET(__wt_meta_checkpoint(session, dhandle->name, dhandle->checkpoint, &ckpt));

    /*
     * Bulk-load is only permitted on newly created files, not any empty file -- see the checkpoint
     * code for a discussion.
     */
    creation = ckpt.raw.size == 0;
    if (!creation && F_ISSET(btree, WT_BTREE_BULK))
        WT_ERR_MSG(session, EINVAL, "bulk-load is only supported on newly created objects");

    /* Handle salvage configuration. */
    forced_salvage = false;
    if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
        WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval));
        forced_salvage = cval.val != 0;
    }

    /* Initialize and configure the WT_BTREE structure. */
    WT_ERR(__btree_conf(session, &ckpt));

    /* Connect to the underlying block manager. */
    WT_ERR(__wt_blkcache_open(
      session, dhandle->name, dhandle->cfg, forced_salvage, false, btree->allocsize, &btree->bm));

    bm = btree->bm;

    /*
     * !!!
     * As part of block-manager configuration, we need to return the maximum
     * sized address cookie that a block manager will ever return.  There's
     * a limit of WT_BTREE_MAX_ADDR_COOKIE, but at 255B, it's too large for
     * a Btree with 512B internal pages.  The default block manager packs
     * a wt_off_t and 2 uint32_t's into its cookie, so there's no problem
     * now, but when we create a block manager extension API, we need some
     * way to consider the block manager's maximum cookie size versus the
     * minimum Btree internal node size.
     */
    btree->block_header = bm->block_header(bm);

    /*
     * Open the specified checkpoint unless it's a special command (special commands are responsible
     * for loading their own checkpoints, if any).
     */
    if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
        /*
         * There are two reasons to load an empty tree rather than a checkpoint: either there is no
         * checkpoint (the file is being created), or the load call returns no root page (the
         * checkpoint is for an empty file).
         */
        WT_ERR(bm->checkpoint_load(bm, session, ckpt.raw.data, ckpt.raw.size, root_addr,
          &root_addr_size, F_ISSET(btree, WT_BTREE_READONLY)));
        if (creation || root_addr_size == 0)
            WT_ERR(__btree_tree_open_empty(session, creation));
        else {
            WT_ERR(__wt_btree_tree_open(session, root_addr, root_addr_size));

            /* Warm the cache, if possible. */
            WT_WITH_PAGE_INDEX(session, ret = __btree_preload(session));
            WT_ERR(ret);

            /* Get the last record number in a column-store file. */
            if (btree->type != BTREE_ROW)
                WT_ERR(__btree_get_last_recno(session));
        }
    }

    /*
     * Eviction ignores trees until the handle's open flag is set, configure eviction before that
     * happens.
     *
     * Files that can still be bulk-loaded cannot be evicted. Permanently cache-resident files can
     * never be evicted. Special operations don't enable eviction. The underlying commands may turn
     * on eviction (for example, verify turns on eviction while working a file to keep from
     * consuming the cache), but it's their decision. If an underlying command reconfigures
     * eviction, it must either clear the evict-disabled-open flag or restore the eviction
     * configuration when finished so that handle close behaves correctly.
     */
    if (btree->original ||
      F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
        WT_ERR(__wt_evict_file_exclusive_on(session));
        btree->evict_disabled_open = true;
    }

    /* The close call only runs when we arrive via the error label, never on the success path. */
    if (0) {
err:
        WT_TRET(__wt_btree_close(session));
    }
    __wt_meta_checkpoint_free(session, &ckpt);

    return (ret);
}

/*
 * __wt_btree_close --
 *     Close a Btree: release its backing resources but leave the structure itself in place.
 *     Only the first close of a handle does any work; repeated closes return zero.
 */
int
__wt_btree_close(WT_SESSION_IMPL *session)
{
    WT_BM *blkmgr;
    WT_BTREE *btree;
    WT_DECL_RET;

    btree = S2BT(session);

    /*
     * Closing is not the same as discarding the handle. The handle might be re-opened, which is
     * harmless, but the blocks backing it may still be in the cache and eviction looks at WT_BTREE
     * structure elements. So only the backing resources are freed here; the structure goes away
     * when the data handle is discarded.
     *
     * A handle can be closed more than once, everything after the first close is ignored.
     */
    if (F_ISSET(btree, WT_BTREE_CLOSED))
        return (0);
    F_SET(btree, WT_BTREE_CLOSED);

    /*
     * Check the history store state: with the history store open, a btree that has history store
     * entries can be neither the metadata file nor the history store file.
     */
    WT_ASSERT(session,
      !F_ISSET(S2C(session), WT_CONN_HS_OPEN) || !btree->hs_entries ||
        (!WT_IS_METADATA(btree->dhandle) && !WT_IS_HS(btree->dhandle)));

    /* Drop the saved checkpoint information. */
    __wt_meta_saved_ckptlist_free(session);

    /*
     * Eviction that was turned off at open and never turned back on is re-enabled now, otherwise
     * the counter will be off.
     */
    if (btree->evict_disabled_open) {
        btree->evict_disabled_open = false;
        __wt_evict_file_exclusive_off(session);
    }

    /* Without a block manager there is nothing further to release. */
    blkmgr = btree->bm;
    if (blkmgr == NULL)
        return (0);
    btree->bm = NULL;

    /* Special commands manage their own checkpoints, everything else unloads it here. */
    if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
        WT_TRET(blkmgr->checkpoint_unload(blkmgr, session));

    /* Give up our reference to the block manager. */
    WT_TRET(blkmgr->close(blkmgr, session));

    return (ret);
}

/*
 * __wt_btree_discard --
 *     Discard a Btree: clear it, overwrite and free the structure, and detach it from the
 *     session's data handle. Returns the result of the clear step.
 */
int
__wt_btree_discard(WT_SESSION_IMPL *session)
{
    WT_BTREE *bt;
    int clear_ret;

    bt = S2BT(session);

    /* Free the structure even if clearing reported an error. */
    clear_ret = __btree_clear(session);
    __wt_overwrite_and_free(session, bt);
    session->dhandle->handle = NULL;

    return (clear_ret);
}

/*
 * __wt_btree_config_encryptor --
 *     Look up the encryptor selected by the configuration and return it through kencryptorp.
 */
int
__wt_btree_config_encryptor(
  WT_SESSION_IMPL *session, const char **cfg, WT_KEYED_ENCRYPTOR **kencryptorp)
{
    WT_CONFIG_ITEM enc, keyid, name;
    WT_DECL_RET;
    const char *enc_cfg[] = {NULL, NULL};

    /*
     * __wt_config_gets_none is deliberately not used for the name: "none" and the empty string
     * mean different things. The empty string inherits the system encryption setting, while
     * "none" means this table is in the clear even if the database is encrypted.
     */
    WT_RET(__wt_config_gets(session, cfg, "encryption.name", &name));

    /* Nothing configured: use the connection's encryptor. */
    if (name.len == 0) {
        *kencryptorp = S2C(session)->kencryptor;
        return (0);
    }

    /* Explicitly not encrypted. */
    if (WT_STRING_MATCH("none", name.str, name.len)) {
        *kencryptorp = NULL;
        return (0);
    }

    /* A named encryptor: gather the key id and a private copy of the encryption configuration. */
    WT_RET(__wt_config_gets_none(session, cfg, "encryption.keyid", &keyid));
    WT_RET(__wt_config_gets(session, cfg, "encryption", &enc));
    if (enc.len != 0)
        WT_RET(__wt_strndup(session, enc.str, enc.len, &enc_cfg[0]));

    /* The copied configuration is freed whether or not the lookup succeeds. */
    ret = __wt_encryptor_config(session, &name, &keyid, (WT_CONFIG_ARG *)enc_cfg, kencryptorp);
    __wt_free(session, enc_cfg[0]);

    return (ret);
}

/*
 * __btree_conf --
 *     Configure a WT_BTREE structure from the data handle's configuration and the checkpoint
 *     information in ckpt (only the checkpoint's write generations are read here).
 *
 *     Returns zero on success, otherwise the first error encountered. Errors return immediately:
 *     strings and locks set up before the failure are not released in this function.
 */
static int
__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
{
    WT_BTREE *btree;
    WT_CONFIG_ITEM cval, metadata;
    WT_CONNECTION_IMPL *conn;
    int64_t maj_version, min_version;
    uint32_t bitcnt;
    const char **cfg;
    bool fixed;

    btree = S2BT(session);
    cfg = btree->dhandle->cfg;
    conn = S2C(session);

    /* Dump out format information. */
    if (WT_VERBOSE_ISSET(session, WT_VERB_VERSION)) {
        WT_RET(__wt_config_gets(session, cfg, "version.major", &cval));
        maj_version = cval.val;
        WT_RET(__wt_config_gets(session, cfg, "version.minor", &cval));
        min_version = cval.val;
        __wt_verbose(session, WT_VERB_VERSION, "%" PRId64 ".%" PRId64, maj_version, min_version);
    }

    /* Get the file ID. */
    WT_RET(__wt_config_gets(session, cfg, "id", &cval));
    btree->id = (uint32_t)cval.val;

    /*
     * Validate file types and check the data format plan. A key format of "r" selects a
     * column-store (initially variable-length), anything else is a row-store.
     */
    WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
    WT_RET(__wt_struct_confchk(session, &cval));
    if (WT_STRING_MATCH("r", cval.str, cval.len))
        btree->type = BTREE_COL_VAR;
    else
        btree->type = BTREE_ROW;
    WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format));

    WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
    WT_RET(__wt_struct_confchk(session, &cval));
    WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));

    /* Row-store key comparison. */
    if (btree->type == BTREE_ROW) {
        WT_RET(__wt_config_gets_none(session, cfg, "collator", &cval));
        if (cval.len != 0) {
            WT_RET(__wt_config_gets(session, cfg, "app_metadata", &metadata));
            WT_RET(__wt_collator_config(session, btree->dhandle->name, &cval, &metadata,
              &btree->collator, &btree->collator_owned));
        }
    }

    /*
     * Column-store: check for fixed-size data.
     *
     * The configuration item still holds the value format here: the collator lookup above only
     * runs for row-store trees, so it cannot have overwritten it for a column-store. Don't add a
     * configuration lookup between the value format check and this block.
     */
    if (btree->type == BTREE_COL_VAR) {
        WT_RET(__wt_struct_check(session, cval.str, cval.len, &fixed, &bitcnt));
        if (fixed) {
            if (bitcnt == 0 || bitcnt > 8)
                WT_RET_MSG(session, EINVAL,
                  "fixed-width field sizes must be greater than 0 and less than or equal to 8");
            btree->bitcnt = (uint8_t)bitcnt;
            btree->type = BTREE_COL_FIX;
        }
    }

    /* Page sizes */
    WT_RET(__btree_page_sizes(session));

    WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
    if (cval.val)
        F_SET(btree, WT_BTREE_IN_MEMORY);
    else
        F_CLR(btree, WT_BTREE_IN_MEMORY);

    WT_RET(__wt_config_gets(session, cfg, "ignore_in_memory_cache_size", &cval));
    if (cval.val) {
        if (!F_ISSET(conn, WT_CONN_IN_MEMORY))
            WT_RET_MSG(session, EINVAL,
              "ignore_in_memory_cache_size setting is only valid with databases configured to run "
              "in-memory");
        F_SET(btree, WT_BTREE_IGNORE_CACHE);
    } else
        F_CLR(btree, WT_BTREE_IGNORE_CACHE);

    /*
     * The metadata isn't blocked by in-memory cache limits because metadata "unroll" is performed
     * by updates that are potentially blocked by the cache-full checks.
     */
    if (WT_IS_METADATA(btree->dhandle))
        F_SET(btree, WT_BTREE_IGNORE_CACHE);

    /*
     * Turn on logging when it's enabled in the database and not disabled for the tree. (Other code
     * only checks the tree flag, so it's important the tree flag match the overall configuration.)
     * The no-logging flag is set unconditionally first and only cleared when both the connection
     * and the tree configuration enable logging.
     */
    F_SET(btree, WT_BTREE_NO_LOGGING);
    if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
        WT_ASSERT(session, !F_ISSET(conn, WT_CONN_IN_MEMORY));
        WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
        if (cval.val)
            F_CLR(btree, WT_BTREE_NO_LOGGING);
    }

    /* Tiered objects are flagged as not being checkpointed. */
    WT_RET(__wt_config_gets(session, cfg, "tiered_object", &cval));
    if (cval.val)
        F_SET(btree, WT_BTREE_NO_CHECKPOINT);
    else
        F_CLR(btree, WT_BTREE_NO_CHECKPOINT);

    /* Checksums: any value other than "on", "off" or "uncompressed" selects "unencrypted". */
    WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
    if (WT_STRING_MATCH("on", cval.str, cval.len))
        btree->checksum = CKSUM_ON;
    else if (WT_STRING_MATCH("off", cval.str, cval.len))
        btree->checksum = CKSUM_OFF;
    else if (WT_STRING_MATCH("uncompressed", cval.str, cval.len))
        btree->checksum = CKSUM_UNCOMPRESSED;
    else
        btree->checksum = CKSUM_UNENCRYPTED;

    /* Huffman encoding */
    WT_RET(__wt_btree_huffman_open(session));

    /*
     * Reconciliation configuration:
     *	Block compression (all)
     *	Dictionary compression (variable-length column-store, row-store)
     *	Page-split percentage
     *	Prefix compression (row-store)
     *	Suffix compression (row-store)
     */
    switch (btree->type) {
    case BTREE_COL_FIX:
        break;
    case BTREE_ROW:
        WT_RET(__wt_config_gets(session, cfg, "internal_key_truncate", &cval));
        btree->internal_key_truncate = cval.val != 0;

        WT_RET(__wt_config_gets(session, cfg, "prefix_compression", &cval));
        btree->prefix_compression = cval.val != 0;
        WT_RET(__wt_config_gets(session, cfg, "prefix_compression_min", &cval));
        btree->prefix_compression_min = (u_int)cval.val;
    /* FALLTHROUGH */
    case BTREE_COL_VAR:
        WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval));
        btree->dictionary = (u_int)cval.val;
        break;
    }

    WT_RET(__wt_config_gets_none(session, cfg, "block_compressor", &cval));
    WT_RET(__wt_compressor_config(session, &cval, &btree->compressor));

    /*
     * Configure compression adjustment.
     * When doing compression, assume compression rates that will result in
     * pages larger than the maximum in-memory images allowed. If we're
     * wrong, we adjust downward (but we're almost certainly correct, the
     * maximum in-memory images allowed are only 4x the maximum page size,
     * and compression always gives us more than 4x).
     *	Don't do compression adjustment for fixed-size column store, the
     * leaf page sizes don't change. (We could adjust internal pages but not
     * leaf pages, but that seems an unlikely use case.)
     */
    btree->intlpage_compadjust = false;
    btree->maxintlpage_precomp = btree->maxintlpage;
    btree->leafpage_compadjust = false;
    btree->maxleafpage_precomp = btree->maxleafpage;
    if (btree->compressor != NULL && btree->compressor->compress != NULL &&
      btree->type != BTREE_COL_FIX) {
        /*
         * Don't do compression adjustment when on-disk page sizes are less than 16KB. There's not
         * enough compression going on to fine-tune the size, all we end up doing is hammering
         * shared memory.
         *
         * Don't do compression adjustment when on-disk page sizes are equal to the maximum
         * in-memory page image, the bytes taken for compression can't grow past the base value.
         */
        if (btree->maxintlpage >= 16 * 1024 && btree->maxmempage_image > btree->maxintlpage) {
            btree->intlpage_compadjust = true;
            btree->maxintlpage_precomp = btree->maxmempage_image;
        }
        if (btree->maxleafpage >= 16 * 1024 && btree->maxmempage_image > btree->maxleafpage) {
            btree->leafpage_compadjust = true;
            btree->maxleafpage_precomp = btree->maxmempage_image;
        }
    }

    /* Set special flags for the history store table. */
    if (strcmp(session->dhandle->name, WT_HS_URI) == 0) {
        F_SET(btree->dhandle, WT_DHANDLE_HS);
        F_SET(btree, WT_BTREE_NO_LOGGING);
    }

    /* Configure encryption. */
    WT_RET(__wt_btree_config_encryptor(session, cfg, &btree->kencryptor));

    /* Configure read-only. The flag is only ever set here, never cleared. */
    WT_RET(__wt_config_gets(session, cfg, "readonly", &cval));
    if (cval.val)
        F_SET(btree, WT_BTREE_READONLY);

    /* Initialize locks. */
    WT_RET(__wt_rwlock_init(session, &btree->ovfl_lock));
    WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush"));

    btree->modified = false; /* Clean */

    btree->syncing = WT_BTREE_SYNC_OFF;                           /* Not syncing */
    btree->checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT); /* Checkpoint generation */

    /*
     * The first time we open a btree, we'll be initializing the write gen to the connection-wide
     * base write generation since this is the largest of all btree write generations from the
     * previous run. This has the nice property of ensuring that the range of write generations used
     * by consecutive runs do not overlap which aids with debugging.
     *
     * If we're reopening a btree or importing a new one to a running system, the btree write
     * generation from the last run may actually be ahead of the connection-wide base write
     * generation. In that case, we should initialize our write gen just ahead of our btree specific
     * write generation.
     *
     * The runtime write generation is important since it's going to determine what we're going to
     * use as the base write generation (and thus what pages to wipe transaction ids from). The idea
     * is that we want to initialize it once the first time we open the btree during a run and then
     * for every subsequent open, we want to reuse it. This so that we're still able to read
     * transaction ids from the previous time a btree was open in the same run.
     */
    btree->write_gen = WT_MAX(ckpt->write_gen + 1, conn->base_write_gen);
    WT_ASSERT(session, ckpt->write_gen >= ckpt->run_write_gen);

    /* If this is the first time opening the tree this run. */
    if (F_ISSET(session, WT_SESSION_IMPORT) || ckpt->run_write_gen < conn->base_write_gen)
        btree->run_write_gen = btree->write_gen;
    else
        btree->run_write_gen = ckpt->run_write_gen;

    /*
     * In recovery use the last checkpointed run write generation number as base write generation
     * number to reset the transaction ids of the pages that were modified before the restart. The
     * transaction ids are retained only on the pages that are written after the restart.
     *
     * Rollback to stable does not operate on logged tables and metadata, so it is skipped.
     *
     * The only scenario where the checkpoint run write generation number is less than the
     * connection last checkpoint base write generation number is when rollback to stable doesn't
     * happen during the recovery due to the unavailability of history store file.
     */
    if (!F_ISSET(conn, WT_CONN_RECOVERING) || WT_IS_METADATA(btree->dhandle) ||
      !F_ISSET(btree, WT_BTREE_NO_LOGGING) || ckpt->run_write_gen < conn->last_ckpt_base_write_gen)
        btree->base_write_gen = btree->run_write_gen;
    else
        btree->base_write_gen = ckpt->run_write_gen;

    /*
     * We've just overwritten the runtime write generation based off the fact that we know that
     * we're importing and therefore, the checkpoint data's runtime write generation is
     * meaningless. We need to ensure that the underlying dhandle doesn't get discarded without
     * being included in a subsequent checkpoint including the new overwritten runtime write
     * generation. Otherwise, we'll reopen, won't know that we're in the import case and will
     * incorrectly use the old system's runtime write generation.
     */
    if (F_ISSET(session, WT_SESSION_IMPORT))
        btree->modified = true;

    return (0);
}

/*
 * __wt_root_ref_init --
 *     Set up a tree's root reference so it points at the root page, and point the root page back
 *     at the reference.
 */
void
__wt_root_ref_init(WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno)
{
    /* The session is only used by a macro in diagnostic builds. */
    WT_UNUSED(session);

    /* Start from an all-zero reference. */
    memset(root_ref, 0, sizeof(*root_ref));

    /* An internal, in-memory reference to the root page. */
    F_SET(root_ref, WT_REF_FLAG_INTERNAL);
    root_ref->page = root;
    WT_REF_SET_STATE(root_ref, WT_REF_MEM);

    /* Link the page back to its reference. */
    root->pg_intl_parent_ref = root_ref;

    /* Record-number trees start at record 1, other trees get the out-of-band value. */
    if (is_recno)
        root_ref->ref_recno = 1;
    else
        root_ref->ref_recno = WT_RECNO_OOB;
}

/*
 * __wt_btree_tree_open --
 *     Read the root page at the given address from disk, verify it and install it as the tree's
 *     in-memory root.
 */
int
__wt_btree_tree_open(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_DECL_ITEM(addr_str);
    WT_DECL_RET;
    WT_ITEM image;
    WT_PAGE *root_page;

    btree = S2BT(session);
    bm = btree->bm;

    /*
     * The root page is read into a local item rather than a scratch buffer, because the memory the
     * item allocates becomes the persistent in-memory page.
     */
    WT_CLEAR(image);

    /* Build a printable form of the address for the verify call. */
    WT_ERR(__wt_scr_alloc(session, 0, &addr_str));
    WT_ERR(bm->addr_string(bm, session, addr_str, addr, addr_size));

    /*
     * Read the page and then verify it. Verification catches encrypted objects that were read
     * successfully but can't be decrypted, a case where we want to fail gracefully.
     */
    F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
    ret = __wt_blkcache_read(session, &image, addr, addr_size);
    if (ret == 0)
        ret = __wt_verify_dsk(session, addr_str->data, &image);

    /* Flag any failed read or verification: if we're in startup, it may be fatal. */
    if (ret != 0)
        F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
    F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);

    if (ret != 0) {
        __wt_err(session, ret, "unable to read root page from %s", session->dhandle->name);

        /*
         * If the metadata can't be opened the database is unavailable, so try to give a helpful
         * failure message.
         */
        if (WT_IS_METADATA(session->dhandle)) {
            __wt_err(session, ret, "WiredTiger has failed to open its metadata");
            __wt_err(session, ret,
              "This may be due to the database files being encrypted, being from an older version or "
              "due to corruption on disk");
            __wt_err(session, ret,
              "You should confirm that you have opened the database with the correct options including "
              "all encryption and compression options");
        }
        goto err;
    }

    /*
     * Build the in-memory page from the disk image. The page takes ownership of the image's
     * memory, so forget our pointer to it before the cleanup below frees the item.
     */
    WT_ERR(__wt_page_inmem(session, NULL, image.data,
      WT_DATA_IN_ITEM(&image) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &root_page, NULL));
    image.mem = NULL;

    /* Link the new page in as the tree's root. */
    __wt_root_ref_init(session, &btree->root, root_page, btree->type != BTREE_ROW);

err:
    __wt_buf_free(session, &image);
    __wt_scr_free(session, &addr_str);

    return (ret);
}

/*
 * __btree_tree_open_empty --
 *     Create an empty in-memory tree: a root page with a single child reference in the deleted
 *     state. When creation is true the tree is also marked as "original" (newly created). For
 *     bulk-load handles an empty, modified leaf page is created for that reference as well.
 *
 *     Returns zero on success; on failure any pages allocated here are discarded and the error is
 *     returned.
 */
static int
__btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_PAGE *root;
    WT_PAGE_INDEX *pindex;
    WT_REF *ref;

    btree = S2BT(session);
    /* Start with NULL so the error path can tell what has been allocated. */
    root = NULL;
    ref = NULL;

    /*
     * Newly created objects can be used for cursor inserts or for bulk loads; set a flag that's
     * cleared when a row is inserted into the tree.
     */
    if (creation)
        btree->original = 1;

    /*
     * A note about empty trees: the initial tree is a single root page. It has a single reference
     * to a leaf page, marked deleted. The leaf page will be created by the first update. If the
     * root is evicted without being modified, that's OK, nothing is ever written.
     *
     * !!!
     * Be cautious about changing the order of updates in this code: to call __wt_page_out on error,
     * we require a correct page setup at each point where we might fail.
     */
    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        WT_ERR(__wt_page_alloc(session, WT_PAGE_COL_INT, 1, true, &root));
        root->pg_intl_parent_ref = &btree->root;

        pindex = WT_INTL_INDEX_GET_SAFE(root);
        ref = pindex->index[0];
        ref->home = root;
        ref->page = NULL;
        ref->addr = NULL;
        F_SET(ref, WT_REF_FLAG_LEAF);
        WT_REF_SET_STATE(ref, WT_REF_DELETED);
        /* Column-store: the single child starts at record number 1. */
        ref->ref_recno = 1;
        break;
    case BTREE_ROW:
        WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_INT, 1, true, &root));
        root->pg_intl_parent_ref = &btree->root;

        pindex = WT_INTL_INDEX_GET_SAFE(root);
        ref = pindex->index[0];
        ref->home = root;
        ref->page = NULL;
        ref->addr = NULL;
        F_SET(ref, WT_REF_FLAG_LEAF);
        WT_REF_SET_STATE(ref, WT_REF_DELETED);
        /* Row-store: the single child gets a one-byte key made of the empty string's terminator. */
        WT_ERR(__wt_row_ikey_incr(session, root, 0, "", 1, ref));
        break;
    }

    /*
     * Bulk loads require a leaf page for reconciliation: create it now. The leaf-page helper
     * already sets the leaf flag on the reference, the F_SET below repeats it.
     */
    if (F_ISSET(btree, WT_BTREE_BULK)) {
        WT_ERR(__wt_btree_new_leaf_page(session, ref));
        F_SET(ref, WT_REF_FLAG_LEAF);
        WT_REF_SET_STATE(ref, WT_REF_MEM);
        WT_ERR(__wt_page_modify_init(session, ref->page));
        __wt_page_only_modify_set(session, ref->page);
    }

    /* Finish initializing the root, root reference links. */
    __wt_root_ref_init(session, &btree->root, root, btree->type != BTREE_ROW);

    return (0);

err:
    /* Discard the leaf page first (if one was created), then the root page. */
    if (ref != NULL && ref->page != NULL)
        __wt_page_out(session, &ref->page);
    if (root != NULL)
        __wt_page_out(session, &root);
    return (ret);
}

/*
 * __wt_btree_new_leaf_page --
 *     Allocate an empty leaf page of the kind matching the tree's type, attach it to the
 *     reference and mark the reference as a leaf.
 */
int
__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_DECL_RET;

    /* Pick the leaf page type from the tree type. */
    switch (S2BT(session)->type) {
    case BTREE_COL_FIX:
        ret = __wt_page_alloc(session, WT_PAGE_COL_FIX, 0, false, &ref->page);
        break;
    case BTREE_COL_VAR:
        ret = __wt_page_alloc(session, WT_PAGE_COL_VAR, 0, false, &ref->page);
        break;
    case BTREE_ROW:
        ret = __wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, false, &ref->page);
        break;
    }
    WT_RET(ret);

    /*
     * Deleting a chunk of the name-space can delete internal pages, but if that piece of the
     * namespace ever has to be re-instantiated it comes back as a leaf page. The reference's type
     * may therefore have changed, so reset it.
     */
    F_CLR(ref, WT_REF_FLAG_INTERNAL);
    F_SET(ref, WT_REF_FLAG_LEAF);

    return (0);
}

/*
 * __btree_preload --
 *     Pre-load internal pages: read the block behind every child of the root page that has an
 *     address, and add the number of blocks read to the block_preload connection statistic.
 */
static int
__btree_preload(WT_SESSION_IMPL *session)
{
    WT_ADDR_COPY child_addr;
    WT_BTREE *bt;
    WT_DECL_ITEM(scratch);
    WT_DECL_RET;
    WT_REF *child;
    uint64_t nread;

    bt = S2BT(session);
    nread = 0;

    WT_RET(__wt_scr_alloc(session, 0, &scratch));

    /* Walk the root's children, reading the second-level pages into the scratch buffer. */
    WT_INTL_FOREACH_BEGIN (session, bt->root.page, child)
        if (__wt_ref_addr_copy(session, child, &child_addr)) {
            WT_ERR(__wt_blkcache_read(session, scratch, child_addr.addr, child_addr.size));
            ++nread;
        }
    WT_INTL_FOREACH_END;

err:
    __wt_scr_free(session, &scratch);

    /* The statistic is updated on failure too, with however many blocks were read. */
    WT_STAT_CONN_INCRV(session, block_preload, nread);
    return (ret);
}

/*
 * __btree_get_last_recno --
 *     Set the last record number for a column-store. Returns WT_NOTFOUND if the tree walk does not
 *     return a page.
 */
static int
__btree_get_last_recno(WT_SESSION_IMPL *session)
{
    WT_BTREE *bt;
    WT_REF *last;
    uint32_t walk_flags;

    bt = S2BT(session);

    /* Walk in the "previous" direction; without a snapshot, also request visible-all reads. */
    walk_flags = F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT) ?
      WT_READ_PREV :
      (WT_READ_PREV | WT_READ_VISIBLE_ALL);

    /* Start the walk from scratch (presumably this yields the tree's last leaf page). */
    last = NULL;
    WT_RET(__wt_tree_walk(session, &last, walk_flags));
    if (last == NULL)
        return (WT_NOTFOUND);

    /* Variable-length and fixed-length pages compute their last record number differently. */
    if (last->page->type == WT_PAGE_COL_VAR)
        bt->last_recno = __col_var_last_recno(last);
    else
        bt->last_recno = __col_fix_last_recno(last);

    return (__wt_page_release(session, last, 0));
}

/*
 * __btree_page_sizes --
 *     Check the configured page sizes and fill in the size-related btree fields. Sizes that are
 *     automatically checked using limits defined in the API are not re-validated here.
 */
static int
__btree_page_sizes(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    uint64_t cache_size;
    uint32_t disk_page_max, split_leaf_size;
    const char **cfg;

    conn = S2C(session);
    btree = S2BT(session);
    cfg = btree->dhandle->cfg;

    /* Allocation size: only a power-of-two makes sense. */
    WT_RET(__wt_direct_io_size_check(session, cfg, "allocation_size", &btree->allocsize));
    if (!__wt_ispo2(btree->allocsize))
        WT_RET_MSG(session, EINVAL, "the allocation size must be a power of two");

    /*
     * Internal/leaf page sizes: each must be at least one allocation unit and a whole number of
     * allocation units.
     */
    WT_RET(__wt_direct_io_size_check(session, cfg, "internal_page_max", &btree->maxintlpage));
    WT_RET(__wt_direct_io_size_check(session, cfg, "leaf_page_max", &btree->maxleafpage));
    if (btree->maxintlpage < btree->allocsize || btree->maxintlpage % btree->allocsize != 0 ||
      btree->maxleafpage < btree->allocsize || btree->maxleafpage % btree->allocsize != 0)
        WT_RET_MSG(session, EINVAL,
          "page sizes must be a multiple of the page allocation size (%" PRIu32 "B)",
          btree->allocsize);

    /*
     * FLCS leaf pages get a tighter limit than the default: the configured size is the bitmap
     * data size, and timestamp data is added on top of it. A time window can take up to 63 bytes
     * and the whole page must stay under 4G. For an 8t table that allows 64M entries (64M of
     * bitmap data plus up to 63*64M == 4032M of time windows), less a bit for headers. A 1t table
     * can hold (64 7/8)M entries because its bitmap is smaller, but that corresponds to a
     * configured page size of a bit over 8M. The absolute limit on the page size is therefore 8M;
     * since pages that large make no sense and perform poorly even without timestamp bloat, cut
     * that down by a factor of 16 and limit the page size to 128KB.
     */
    if (btree->type == BTREE_COL_FIX && btree->maxleafpage > 128 * WT_KILOBYTE)
        WT_RET_MSG(session, EINVAL, "page size for fixed-length column store is limited to 128KB");

    /*
     * In-memory page image size for compression: an unset (zero) value defaults to 4x the larger
     * of the internal and leaf page sizes, and an explicit value may not be smaller than that
     * larger on-disk page size.
     */
    disk_page_max = WT_MAX(btree->maxintlpage, btree->maxleafpage);
    WT_RET(__wt_config_gets(session, cfg, "memory_page_image_max", &cval));
    btree->maxmempage_image = (uint32_t)cval.val;
    if (btree->maxmempage_image != 0) {
        if (btree->maxmempage_image < disk_page_max)
            WT_RET_MSG(session, EINVAL,
              "in-memory page image size must be larger than the maximum page size (%" PRIu32
              "B < %" PRIu32 "B)",
              btree->maxmempage_image, disk_page_max);
    } else
        btree->maxmempage_image = 4 * disk_page_max;

    /*
     * Pages must not grow large relative to the cache or we can end up in a situation where
     * nothing can be evicted: make sure at least 10 pages fit in cache when it is at the dirty
     * trigger where threads stall.
     *
     * Take care getting the cache size: with a shared cache it may not have been set, so the cap
     * is only applied when no cache pool is configured and the size is non-zero. Don't forget to
     * update the API documentation if you alter the bounds for any of the parameters here.
     */
    WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
    btree->maxmempage = (uint64_t)cval.val;
    if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) {
        cache_size = conn->cache_size;
        if (cache_size > 0)
            btree->maxmempage = (uint64_t)WT_MIN(
              btree->maxmempage, (conn->cache->eviction_dirty_trigger * cache_size) / 1000);
    }

    /* Never go below the size of a single disk leaf page. */
    btree->maxmempage = WT_MAX(btree->maxmempage, btree->maxleafpage);

    /*
     * In-memory splits are attempted at 80% of the maximum in-memory page size, which gives
     * multi-threaded append workloads a better chance of not stalling.
     */
    btree->splitmempage = (8 * btree->maxmempage) / 10;

    /*
     * Split percentage: reconciliation splits pages into chunks smaller than the maximum page size
     * so we don't split every time a new entry is added. A configured value below the minimum is
     * raised to the minimum (with a notice); the result determines how large newly split leaf
     * pages will be.
     */
    WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
    if (cval.val >= WT_BTREE_MIN_SPLIT_PCT)
        btree->split_pct = (int)cval.val;
    else {
        btree->split_pct = WT_BTREE_MIN_SPLIT_PCT;
        __wt_verbose_notice(session, WT_VERB_SPLIT,
          "Re-setting split_pct for %s to the minimum allowed of %d%%", session->dhandle->name,
          WT_BTREE_MIN_SPLIT_PCT);
    }
    split_leaf_size = __wt_split_page_size(btree->split_pct, btree->maxleafpage, btree->allocsize);

    /*
     * In-memory split configuration: a missing or zero setting falls back to the built-in default.
     */
    if (__wt_config_gets(session, cfg, "split_deepen_min_child", &cval) != WT_NOTFOUND &&
      cval.val != 0)
        btree->split_deepen_min_child = (u_int)cval.val;
    else
        btree->split_deepen_min_child = WT_SPLIT_DEEPEN_MIN_CHILD_DEF;
    if (__wt_config_gets(session, cfg, "split_deepen_per_child", &cval) != WT_NOTFOUND &&
      cval.val != 0)
        btree->split_deepen_per_child = (u_int)cval.val;
    else
        btree->split_deepen_per_child = WT_SPLIT_DEEPEN_PER_CHILD_DEF;

    /*
     * Maximum leaf page key/value sizes.
     *
     * An in-memory configuration overrides any configured key/value sizes: there's no such thing
     * as an overflow item in an in-memory configuration, so we're done once the maximums are set.
     */
    if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
        btree->maxleafkey = WT_BTREE_MAX_OBJECT_SIZE;
        btree->maxleafvalue = WT_BTREE_MAX_OBJECT_SIZE;
        return (0);
    }

    /*
     * Otherwise take the configured values, where zero means "use the default". The default
     * maximum for leaf keys is split-page / 10 and for leaf values split-page / 2.
     *
     * It's difficult for applications to configure this in any exact way as they have to duplicate
     * our calculation of how many keys must fit on a page, and given a split-percentage and page
     * header, that isn't easy to do.
     */
    WT_RET(__wt_config_gets(session, cfg, "leaf_key_max", &cval));
    btree->maxleafkey = (uint32_t)cval.val;
    WT_RET(__wt_config_gets(session, cfg, "leaf_value_max", &cval));
    btree->maxleafvalue = (uint32_t)cval.val;

    if (btree->maxleafkey == 0)
        btree->maxleafkey = split_leaf_size / 10;
    if (btree->maxleafvalue == 0)
        btree->maxleafvalue = split_leaf_size / 2;

    return (0);
}

/*
 * __wt_btree_switch_object --
 *     Switch to a writeable object for a tiered btree. A no-op returning success when the btree
 *     has no block manager attached.
 */
int
__wt_btree_switch_object(WT_SESSION_IMPL *session, uint32_t objectid, uint32_t flags)
{
    WT_BM *bm;

    bm = S2BT(session)->bm;

    /*
     * When initially opening a tiered Btree, a tier switch is done internally without the btree
     * being fully opened. That's okay, the btree will be told later about the current object
     * number.
     */
    if (bm == NULL)
        return (0);

    /* Otherwise delegate to the block manager's switch_object method. */
    return (bm->switch_object(bm, session, objectid, flags));
}