/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
static int __ckpt_update(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_CKPT *, WT_BLOCK_CKPT *);

/*
 * __block_extlist_setup --
 *     Initialize the extent lists in a checkpoint structure.
 */
static int
__block_extlist_setup(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
    WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc", false));
    WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail", true));
    WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard", false));
    WT_RET(__wt_block_extlist_init(session, &ci->ckpt_avail, name, "ckpt_avail", true));

    return (0);
}

/*
 * __block_extlist_reset --
 *     Discard and reinitialize the extent lists in a checkpoint structure
 */
static int
__block_extlist_reset(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
    __wt_block_ckpt_destroy(session, ci);
    WT_RET(__block_extlist_setup(session, ci, name));

    return (0);
}

/*
 * __wt_block_ckpt_init --
 *     Return the address cookie for the root page of a checkpoint. Also initialize extent lists if
 *     we are loading the live checkpoint in a writable file.
 */
int
__wt_block_ckpt_init(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
    WT_CLEAR(*ci);

    ci->version = WT_BM_CHECKPOINT_VERSION;
    ci->root_offset = WT_BLOCK_INVALID_OFFSET;

    return (__block_extlist_setup(session, ci, name));
}

/*
 * __wt_block_checkpoint_load --
 *     Return the address cookie for the root page of a checkpoint. Also initialize its extent lists
 *     if loading the live checkpoint from a writeable file.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr,
  size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
{
    WT_BLOCK_CKPT *ci, _ci;
    WT_DECL_RET;
    uint8_t *endp;
    bool live_open;

    /*
     * Sometimes we don't find a root page (we weren't given a checkpoint, or the checkpoint was
     * empty). In that case we return an empty root address, set that up now.
     */
    *root_addr_sizep = 0;

    ci = NULL;

    if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_CHECKPOINT, WT_VERBOSE_DEBUG_1))
        __wt_ckpt_verbose(session, block, "load", NULL, addr, addr_size);

    /*
     * There's a single checkpoint in the file that can be written, all of the others are read-only.
     * We use the same initialization calls for readonly checkpoints, but the information doesn't
     * persist.
     */
    if (checkpoint) {
        ci = &_ci;
        WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
    } else {
        /*
         * We depend on the btree level for locking: things will go bad fast if we open the live
         * system in two handles, or salvage, truncate or verify the live/running file.
         */
        __wt_spin_lock(session, &block->live_lock);
        live_open = block->live_open;
        block->live_open = true;
        __wt_spin_unlock(session, &block->live_lock);
        WT_ERR_ASSERT(session, WT_DIAGNOSTIC_CHECKPOINT_VALIDATE, live_open == false, EBUSY,
          "%s: attempt to re-open live file", block->name);

        ci = &block->live;
        WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
    }

    /*
     * If the checkpoint has an on-disk root page, load it. Otherwise, size the file past the
     * description information.
     */
    if (addr == NULL || addr_size == 0)
        ci->file_size = block->allocsize;
    else {
        /* Crack the checkpoint cookie. */
        WT_ERR(__wt_block_ckpt_unpack(session, block, addr, addr_size, ci));

        /* Verify sets up next. */
        if (block->verify) {
            /*
             * FIXME: We may need to change how we setup for verify when it supports tiered tables.
             * Until then, an attempt to verify a tiered table should return before getting here.
             */
            WT_ASSERT(session, block->objectid == 0 && ci->root_objectid == 0);
            WT_ERR(__wt_verify_ckpt_load(session, block, ci));
        }

        /* Read any root page. */
        if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
            /* A checkpoint shouldn't point to an object created after this one. */
            WT_ASSERT(session, block->objectid >= ci->root_objectid);

            endp = root_addr;
            WT_ERR(__wt_block_addr_pack(
              block, &endp, ci->root_objectid, ci->root_offset, ci->root_size, ci->root_checksum));
            *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
        }

        if (!checkpoint) {
            /*
             * The checkpoint address may point to an earlier object. If so, the object backing this
             * block handle doesn't have valid data -- i.e., it must have been written after the
             * checkpoint we are opening. So we discard the incorrect extent lists and reinitialize
             * them to be empty.
             */
            if (block->objectid != ci->root_objectid)
                __block_extlist_reset(session, ci, "live");

            /*
             * Rolling a checkpoint forward requires the avail list, the blocks from which we can
             * allocate.
             */
            WT_ERR(__wt_block_extlist_read_avail(session, block, &ci->avail, ci->file_size));
        }
    }

    /*
     * If the object can be written, that means anything written after the checkpoint is no longer
     * interesting, truncate the file. Don't bother checking the avail list for a block at the end
     * of the file, that was done when the checkpoint was first written (re-writing the checkpoint
     * might possibly make it relevant here, but it's unlikely enough I don't bother).
     */
    if (!checkpoint)
        WT_ERR(__wt_block_truncate(session, block, ci->file_size));

    if (0) {
err:
        /*
         * Don't call checkpoint-unload: unload does real work including file truncation. If we fail
         * early enough that the checkpoint information isn't correct, bad things would happen. The
         * only allocated memory was in the service of verify, clean that up.
         */
        if (block->verify)
            WT_TRET(__wt_verify_ckpt_unload(session, block));
    }

    /* Checkpoints don't need the original information, discard it. */
    if (checkpoint)
        __wt_block_ckpt_destroy(session, ci);

    return (ret);
}

/*
 * __wt_block_checkpoint_unload --
 *     Unload a checkpoint.
 */
int
__wt_block_checkpoint_unload(WT_SESSION_IMPL *session, WT_BLOCK *block, bool checkpoint)
{
    WT_DECL_RET;

    /* Verify cleanup. */
    if (block->verify)
        WT_TRET(__wt_verify_ckpt_unload(session, block));

    /*
     * If it's the live system, truncate to discard any extended blocks and discard the active
     * extent lists. Hold the lock even though we're unloading the live checkpoint, there could be
     * readers active in other checkpoints.
     */
    if (!checkpoint) {
        WT_TRET(__wt_block_truncate(session, block, block->size));

        __wt_spin_lock(session, &block->live_lock);
        __wt_block_ckpt_destroy(session, &block->live);
        block->live_open = false;
        __wt_spin_unlock(session, &block->live_lock);
    }

    return (ret);
}

/*
 * __wt_block_ckpt_destroy --
 *     Clear a checkpoint structure. Free the extent lists, but leave the rest of the state intact
 *     in case the caller is re-using it.
 */
void
__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
{
    /* Discard the extent lists. */
    __wt_block_extlist_free(session, &ci->alloc);
    __wt_block_extlist_free(session, &ci->avail);
    __wt_block_extlist_free(session, &ci->discard);
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    __wt_block_extlist_free(session, &ci->ckpt_discard);
}

/*
 * __wt_block_checkpoint_start --
 *     Start a checkpoint.
 */
int
__wt_block_checkpoint_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_DECL_RET;

    __wt_spin_lock(session, &block->live_lock);
    switch (block->ckpt_state) {
    case WT_CKPT_INPROGRESS:
    case WT_CKPT_PANIC_ON_FAILURE:
    case WT_CKPT_SALVAGE:
        ret = __wt_panic(session, EINVAL,
          "%s: an unexpected checkpoint start: the checkpoint has already started or was "
          "configured for salvage",
          block->name);
        __wt_blkcache_set_readonly(session);
        break;
    case WT_CKPT_NONE:
        block->ckpt_state = WT_CKPT_INPROGRESS;
        break;
    }
    __wt_spin_unlock(session, &block->live_lock);
    return (ret);
}

/*
 * __wt_block_checkpoint --
 *     Create a new checkpoint.
 */
int
__wt_block_checkpoint(
  WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum)
{
    WT_BLOCK_CKPT *ci;
    WT_DECL_RET;

    ci = &block->live;

    /* Switch to first-fit allocation. */
    __wt_block_configure_first_fit(block, true);

    /*
     * Write the root page: it's possible for there to be a checkpoint of
     * an empty tree, in which case, we store an illegal root offset.
     *
     * !!!
     * We happen to know that checkpoints are single-threaded above us in
     * the btree engine.  That's probably something we want to guarantee
     * for any WiredTiger block manager.
     */
    if (buf == NULL) {
        ci->root_offset = WT_BLOCK_INVALID_OFFSET;
        ci->root_objectid = ci->root_size = ci->root_checksum = 0;
    } else
        WT_ERR(__wt_block_write_off(session, block, buf, &ci->root_objectid, &ci->root_offset,
          &ci->root_size, &ci->root_checksum, data_checksum, true, false));

    /*
     * Checkpoints are potentially reading/writing/merging lots of blocks, pre-allocate structures
     * for this thread's use.
     */
    WT_ERR(__wt_block_ext_prealloc(session, 250));

    /* Process the checkpoint list, deleting and updating as required. */
    ret = __ckpt_process(session, block, ckptbase);

    /* Discard any excessive memory we've allocated. */
    WT_TRET(__wt_block_ext_discard(session, 250));

/* Restore the original allocation plan. */
err:
    __wt_block_configure_first_fit(block, false);

    return (ret);
}

/*
 * __ckpt_extlist_read --
 *     Read a checkpoint's extent lists.
 */
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt, bool *localp)
{
    WT_BLOCK_CKPT *ci;

    /* Default to a local file. */
    *localp = true;

    /*
     * Allocate a checkpoint structure, crack the cookie and read the checkpoint's extent lists.
     *
     * Ignore the avail list: checkpoint avail lists are only useful if we are rolling forward from
     * the particular checkpoint and they represent our best understanding of what blocks can be
     * allocated. If we are not operating on the live checkpoint, subsequent checkpoints might have
     * allocated those blocks, and the avail list is useless. We don't discard it, because it is
     * useful as part of verification, but we don't re-write it either.
     */
    WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));

    ci = ckpt->bpriv;
    WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
    WT_RET(__wt_block_ckpt_unpack(session, block, ckpt->raw.data, ckpt->raw.size, ci));

    /* Extent lists from non-local objects aren't useful, we're going to skip them. */
    if (ci->root_objectid != block->objectid) {
        *localp = false;
        return (0);
    }

    WT_RET(__wt_block_extlist_read(session, block, &ci->alloc, ci->file_size));
    WT_RET(__wt_block_extlist_read(session, block, &ci->discard, ci->file_size));

    return (0);
}

/*
 * __ckpt_extlist_fblocks --
 *     If a checkpoint's extent list is going away, free its blocks.
 */
static int
__ckpt_extlist_fblocks(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
{
    if (el->offset == WT_BLOCK_INVALID_OFFSET)
        return (0);

    /*
     * Free blocks used to write checkpoint extents into the live system's checkpoint avail list
     * (they were never on any alloc list). Do not use the live system's avail list because that
     * list is used to decide if the file can be truncated, and we can't truncate any part of the
     * file that contains a previous checkpoint's extents.
     */
    return (__wt_block_insert_ext(session, block, &block->live.ckpt_avail, el->offset, el->size));
}

/*
 * __ckpt_verify --
 *     Diagnostic code, confirm we get what we expect in the checkpoint array.
 */
static int
__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
    WT_CKPT *ckpt;

    /*
     * Fast check that we're seeing what we expect to see: some number of checkpoints to add, delete
     * or ignore, terminated by a new checkpoint.
     */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        switch (ckpt->flags) {
        case 0:
        case WT_CKPT_DELETE:
        case WT_CKPT_DELETE | WT_CKPT_FAKE:
        case WT_CKPT_FAKE:
            break;
        case WT_CKPT_ADD | WT_CKPT_BLOCK_MODS:
        case WT_CKPT_ADD:
            if (ckpt[1].name == NULL)
                break;
        /* FALLTHROUGH */
        default:
            return (__wt_illegal_value(session, ckpt->flags));
        }
    return (0);
}

/*
 * __ckpt_add_blkmod_entry --
 *     Add an offset/length entry to the bitstring based on granularity.
 */
static int
__ckpt_add_blkmod_entry(
  WT_SESSION_IMPL *session, WT_BLOCK_MODS *blk_mod, wt_off_t offset, wt_off_t len)
{
    uint64_t end_bit, start_bit;
    uint32_t end_buf_bytes, end_rdup_bits, end_rdup_bytes;

    WT_ASSERT(session, blk_mod->granularity != 0);
    /*
     * Figure out the starting and ending locations in the bitmap based on its granularity and our
     * offset and length. The bit locations are zero-based; be careful translating to sizes.
     */
    start_bit = (uint64_t)offset / blk_mod->granularity;
    end_bit = (uint64_t)(offset + len - 1) / blk_mod->granularity;
    WT_ASSERT(session, end_bit < UINT32_MAX);
    /* We want to grow the bitmap by 64 bits, or 8 bytes at a time. */
    end_rdup_bits = WT_MAX(__wt_rduppo2((uint32_t)end_bit + 1, 64), WT_BLOCK_MODS_LIST_MIN);
    end_rdup_bytes = __bitstr_size(end_rdup_bits);
    end_buf_bytes = __bitstr_size((uint32_t)blk_mod->nbits);
    /*
     * We are doing a lot of shifting. Make sure that the number of bytes we end up with is a
     * multiple of eight. We guarantee that in the rounding up call, but also make sure that the
     * constant stays a multiple of eight.
     */
    WT_ASSERT(session, end_rdup_bytes % 8 == 0);
    if (end_rdup_bytes > end_buf_bytes) {
        /* If we don't have enough, extend the buffer. */
        if (blk_mod->nbits == 0) {
            WT_RET(__wt_buf_initsize(session, &blk_mod->bitstring, end_rdup_bytes));
            memset(blk_mod->bitstring.mem, 0, end_rdup_bytes);
        } else {
            WT_RET(
              __wt_buf_set(session, &blk_mod->bitstring, blk_mod->bitstring.data, end_rdup_bytes));
            memset(
              (uint8_t *)blk_mod->bitstring.mem + end_buf_bytes, 0, end_rdup_bytes - end_buf_bytes);
        }
        blk_mod->nbits = end_rdup_bits;
    }
    /* Make sure we're not going to run past the end of the bitmap */
    WT_ASSERT(session, blk_mod->bitstring.size >= __bitstr_size((uint32_t)blk_mod->nbits));
    WT_ASSERT(session, end_bit < blk_mod->nbits);
    /* Set all the bits needed to record this offset/length pair. */
    __bit_nset(blk_mod->bitstring.mem, start_bit, end_bit);
    return (0);
}

/*
 * __ckpt_add_blk_mods_alloc --
 *     Add the checkpoint's allocated blocks to all valid incremental backup source identifiers.
 */
static int
__ckpt_add_blk_mods_alloc(
  WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_BLOCK_CKPT *ci, WT_BLOCK *block)
{
    WT_BLOCK_MODS *blk_mod;
    WT_CKPT *ckpt;
    WT_EXT *ext;
    u_int i;

    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_ADD))
            break;
    }
    /* If this is not the live checkpoint or we don't care about incremental blocks, we're done. */
    if (ckpt == NULL || !F_ISSET(ckpt, WT_CKPT_BLOCK_MODS))
        return (0);
    for (i = 0; i < WT_BLKINCR_MAX; ++i) {
        blk_mod = &ckpt->backup_blocks[i];
        /* If there is no information at this entry, we're done. */
        if (!F_ISSET(blk_mod, WT_BLOCK_MODS_VALID))
            continue;

        if (block->created_during_backup)
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, 0, block->allocsize));
        WT_EXT_FOREACH (ext, ci->alloc.off) {
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, ext->off, ext->size));
        }
    }
    block->created_during_backup = false;
    return (0);
}

/*
 * __ckpt_add_blk_mods_ext --
 *     Add a set of extent blocks to all valid incremental backup source identifiers.
 */
static int
__ckpt_add_blk_mods_ext(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_BLOCK_CKPT *ci)
{
    WT_BLOCK_MODS *blk_mod;
    WT_CKPT *ckpt;
    u_int i;

    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_ADD))
            break;
    }
    /* If this is not the live checkpoint or we don't care about incremental blocks, we're done. */
    if (ckpt == NULL || !F_ISSET(ckpt, WT_CKPT_BLOCK_MODS))
        return (0);
    for (i = 0; i < WT_BLKINCR_MAX; ++i) {
        blk_mod = &ckpt->backup_blocks[i];
        /* If there is no information at this entry, we're done. */
        if (!F_ISSET(blk_mod, WT_BLOCK_MODS_VALID))
            continue;

        if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, ci->alloc.offset, ci->alloc.size));
        if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, ci->discard.offset, ci->discard.size));
        if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, ci->avail.offset, ci->avail.size));
    }
    return (0);
}

/*
 * __ckpt_process --
 *     Process the list of checkpoints.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
    WT_BLOCK_CKPT *a, *b, *ci;
    WT_CKPT *ckpt, *next_ckpt;
    WT_DECL_RET;
    uint64_t ckpt_size;
    bool deleting, fatal, local, locked;

    ci = &block->live;
    fatal = locked = false;

    if (EXTRA_DIAGNOSTICS_ENABLED(session, WT_DIAGNOSTIC_CHECKPOINT_VALIDATE))
        WT_RET(__ckpt_verify(session, ckptbase));

    /*
     * Checkpoints are a two-step process: first, write a new checkpoint to disk (including all the
     * new extent lists for modified checkpoints and the live system). As part of this, create a
     * list of file blocks newly available for reallocation, based on checkpoints being deleted. We
     * then return the locations of the new checkpoint information to our caller. Our caller has to
     * write that information into some kind of stable storage, and once that's done, we can
     * actually allocate from that list of newly available file blocks. (We can't allocate from that
     * list immediately because the allocation might happen before our caller saves the new
     * checkpoint information, and if we crashed before the new checkpoint location was saved, we'd
     * have overwritten blocks still referenced by checkpoints in the system.) In summary, there is
     * a second step: after our caller saves the checkpoint information, we are called to add the
     * newly available blocks into the live system's available list.
     *
     * This function is the first step, the second step is in the resolve function.
     *
     * If we're called to checkpoint the same file twice (without the second resolution step), or
     * re-entered for any reason, it's an error in our caller, and our choices are all bad: leak
     * blocks or potentially crash with our caller not yet having saved previous checkpoint
     * information to stable storage.
     */
    __wt_spin_lock(session, &block->live_lock);
    switch (block->ckpt_state) {
    case WT_CKPT_INPROGRESS:
        block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE;
        break;
    case WT_CKPT_NONE:
    case WT_CKPT_PANIC_ON_FAILURE:
        ret = __wt_panic(session, EINVAL,
          "%s: an unexpected checkpoint attempt: the checkpoint was never started or has already "
          "completed",
          block->name);
        __wt_blkcache_set_readonly(session);
        break;
    case WT_CKPT_SALVAGE:
        /* Salvage doesn't use the standard checkpoint APIs. */
        break;
    }
    __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

    /*
     * Extents newly available as a result of deleting previous checkpoints are added to a list of
     * extents. The list should be empty, but as described above, there is no "free the checkpoint
     * information" call into the block manager; if there was an error in an upper level that
     * resulted in some previous checkpoint never being resolved, the list may not be empty. We
     * should have caught that with the "checkpoint in progress" test, but it doesn't cost us
     * anything to be cautious.
     *
     * We free the checkpoint's allocation and discard extent lists as part of the resolution step,
     * not because they're needed at that time, but because it's potentially a lot of work, and
     * waiting allows the btree layer to continue eviction sooner. As for the checkpoint-available
     * list, make sure they get cleaned out.
     */
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    WT_RET(__wt_block_extlist_init(session, &ci->ckpt_avail, "live", "ckpt_avail", true));
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_discard);

    /*
     * To delete a checkpoint, we need checkpoint information for it and the subsequent checkpoint
     * into which it gets rolled; read them from disk before we lock things down.
     */
    deleting = false;
    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
            continue;

        /*
         * Read the checkpoint and next checkpoint extent lists if we haven't already read them (we
         * may have already read these extent blocks if there is more than one deleted checkpoint).
         *
         * We can only delete checkpoints in the current file. Checkpoints of tiered storage objects
         * are checkpoints for the logical object, including files that are no longer live. Skip any
         * checkpoints that aren't local to the live object.
         */
        if (ckpt->bpriv == NULL) {
            WT_ERR(__ckpt_extlist_read(session, block, ckpt, &local));
            if (!local)
                continue;
        }

        deleting = true;

        for (next_ckpt = ckpt + 1;; ++next_ckpt)
            if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
                break;

        /*
         * The "next" checkpoint may be the live tree which has no extent blocks to read.
         */
        if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) {
            WT_ERR(__ckpt_extlist_read(session, block, next_ckpt, &local));
            WT_ERR_ASSERT(session, WT_DIAGNOSTIC_CHECKPOINT_VALIDATE, local == true, WT_PANIC,
              "tiered storage checkpoint follows local checkpoint");
        }
    }

    /*
     * Failures are now fatal: we can't currently back out the merge of any deleted checkpoint
     * extent lists into the live system's extent lists, so continuing after error would leave the
     * live system's extent lists corrupted for any subsequent checkpoint (and potentially, should a
     * subsequent checkpoint succeed, for recovery).
     */
    fatal = true;

    /*
     * Hold a lock so the live extent lists and the file size can't change underneath us. I suspect
     * we'll tighten this if checkpoints take too much time away from real work: we read the
     * historic checkpoint information without a lock, but we could also merge and re-write the
     * deleted and merged checkpoint information without a lock, except for the final merge of
     * ranges into the live tree.
     */
    __wt_spin_lock(session, &block->live_lock);
    locked = true;

    /*
     * We've allocated our last page, update the checkpoint size. We need to calculate the live
     * system's checkpoint size before merging checkpoint allocation and discard information from
     * the checkpoints we're deleting, those operations change the underlying byte counts.
     */
    ckpt_size = ci->ckpt_size;
    ckpt_size += ci->alloc.bytes;
    ckpt_size -= ci->discard.bytes;

    /*
     * Record the checkpoint's allocated blocks. Do so before skipping any processing and before
     * possibly merging in blocks from any previous checkpoint.
     */
    WT_ERR(__ckpt_add_blk_mods_alloc(session, ckptbase, ci, block));

    /* Skip the additional processing if we aren't deleting checkpoints. */
    if (!deleting)
        goto live_update;

    /*
     * Delete any no-longer-needed checkpoints: we do this first as it frees blocks to the live
     * lists, and the freed blocks will then be included when writing the live extent lists.
     */
    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
            continue;

        if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_CHECKPOINT, WT_VERBOSE_DEBUG_2))
            __wt_ckpt_verbose(session, block, "delete", ckpt->name, ckpt->raw.data, ckpt->raw.size);

        /*
         * Find the checkpoint into which we'll roll this checkpoint's blocks: it's the next real
         * checkpoint in the list, and it better have been read in (if it's not the add slot).
         */
        for (next_ckpt = ckpt + 1;; ++next_ckpt)
            if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
                break;

        /*
         * Set the from/to checkpoint structures, where the "to" value may be the live tree.
         */
        a = ckpt->bpriv;
        if (F_ISSET(next_ckpt, WT_CKPT_ADD))
            b = &block->live;
        else
            b = next_ckpt->bpriv;

        /*
         * Free the root page: there's nothing special about this free, the root page is allocated
         * using normal rules, that is, it may have been taken from the avail list, and was entered
         * on the live system's alloc list at that time. We free it into the checkpoint's discard
         * list, however, not the live system's list because it appears on the checkpoint's alloc
         * list and so must be paired in the checkpoint.
         */
        if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
            WT_ERR(
              __wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size));

        /*
         * Free the blocks used to hold the "from" checkpoint's extent lists, including the avail
         * list.
         */
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

        /*
         * Roll the "from" alloc and discard extent lists into the "to" checkpoint's lists.
         */
        if (a->alloc.entries != 0)
            WT_ERR(__wt_block_extlist_merge(session, block, &a->alloc, &b->alloc));
        if (a->discard.entries != 0)
            WT_ERR(__wt_block_extlist_merge(session, block, &a->discard, &b->discard));

        /*
         * If the "to" checkpoint is also being deleted, we're done with it, it's merged into some
         * other checkpoint in the next loop. This means the extent lists may aggregate over a
         * number of checkpoints, but that's OK, they're disjoint sets of ranges.
         */
        if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
            continue;

        /*
         * Find blocks for re-use: wherever the "to" checkpoint's allocate and discard lists
         * overlap, move the range to the live system's checkpoint available list.
         */
        WT_ERR(__wt_block_extlist_overlap(session, block, b));

        /*
         * If we're updating the live system's information, we're done.
         */
        if (F_ISSET(next_ckpt, WT_CKPT_ADD))
            continue;

        /*
         * We have to write the "to" checkpoint's extent lists out in new blocks, and update its
         * cookie.
         *
         * Free the blocks used to hold the "to" checkpoint's extent lists; don't include the avail
         * list, it's not changing.
         */
        WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

        F_SET(next_ckpt, WT_CKPT_UPDATE);
    }

    /* Update checkpoints marked for update. */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if (F_ISSET(ckpt, WT_CKPT_UPDATE))
            WT_ERR(__ckpt_update(session, block, ckptbase, ckpt, ckpt->bpriv));

live_update:
    /* Truncate the file if that's possible. */
    WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

    /* Update the final, added checkpoint based on the live system. */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if (F_ISSET(ckpt, WT_CKPT_ADD)) {
            /*
             * !!!
             * Our caller wants the final checkpoint size. Setting the size here violates layering,
             * but the alternative is a call for the btree layer to crack the checkpoint cookie into
             * its components, and that's a fair amount of work.
             */
            ckpt->size = ckpt_size;

            /*
             * Set the rolling checkpoint size for the live system. The current size includes the
             * current checkpoint's root page size (root pages are on the checkpoint's block
             * allocation list as root pages are allocated with the usual block allocation
             * functions). That's correct, but we don't want to include it in the size for the next
             * checkpoint.
             */
            ckpt_size -= ci->root_size;

            /*
             * Additionally, we had a bug for awhile where the live checkpoint size grew without
             * bound. We can't sanity check the value, that would require walking the tree as part
             * of the checkpoint. Bound any bug at the size of the file. It isn't practical to
             * assert that the value is within bounds since databases created with older versions of
             * WiredTiger (2.8.0) would likely see an error.
             */
            ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size);

            WT_ERR(__ckpt_update(session, block, ckptbase, ckpt, ci));
        }

    /*
     * Reset the live system's alloc and discard extent lists, leave the avail list alone. This
     * includes freeing a lot of extents, so do it outside of the system's lock by copying and
     * resetting the original, then doing the work later.
     */
    ci->ckpt_alloc = ci->alloc;
    WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc", false));
    ci->ckpt_discard = ci->discard;
    WT_ERR(__wt_block_extlist_init(session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
    /*
     * The first checkpoint in the system should always have an empty discard list. If we've read
     * that checkpoint and/or created it, check.
     */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if (!F_ISSET(ckpt, WT_CKPT_DELETE))
            break;
    if ((a = ckpt->bpriv) == NULL)
        a = &block->live;
    if (a->discard.entries != 0)
        WT_ERR_MSG(
          session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard list");
#endif

err:
    if (ret != 0 && fatal) {
        ret = __wt_panic(session, ret, "%s: fatal checkpoint failure", block->name);
        __wt_blkcache_set_readonly(session);
    }

    if (locked)
        __wt_spin_unlock(session, &block->live_lock);

    /* Discard any checkpoint information we loaded. */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if ((ci = ckpt->bpriv) != NULL)
            __wt_block_ckpt_destroy(session, ci);

    return (ret);
}

/*
 * __ckpt_update --
 *     Update a checkpoint.
 */
static int
__ckpt_update(
  WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci)
{
    WT_DECL_ITEM(a);
    WT_DECL_RET;
    uint8_t *endp;
    bool is_live;

    is_live = F_ISSET(ckpt, WT_CKPT_ADD);

#ifdef HAVE_DIAGNOSTIC
    /* Check the extent list combinations for overlaps. */
    WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
    WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
    WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif
    /*
     * Write the checkpoint's alloc and discard extent lists. Note these blocks never appear on the
     * system's allocation list, checkpoint extent blocks don't appear on any extent lists.
     */
    WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
    WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));

    /*
     * If this is the final block, we append an incomplete copy of the checkpoint information to the
     * avail list for standalone retrieval.
     */
    if (is_live) {
        /*
         * Copy the INCOMPLETE checkpoint information into the checkpoint.
         */
        WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BLOCK_CHECKPOINT_BUFFER));
        endp = ckpt->raw.mem;
        WT_RET(__wt_block_ckpt_pack(session, block, &endp, ci, true));
        ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);

        /*
         * Convert the INCOMPLETE checkpoint array into its metadata representation. This must match
         * what is eventually written into the metadata file, in other words, everything must be
         * initialized before the block manager does the checkpoint.
         */
        WT_RET(__wt_scr_alloc(session, 8 * 1024, &a));
        ret = __wt_meta_ckptlist_to_meta(session, ckptbase, a);
        if (ret == 0)
            ret = __wt_strndup(session, a->data, a->size, &ckpt->block_checkpoint);
        __wt_scr_free(session, &a);
        WT_RET(ret);
    }

    /*
     * We only write an avail list for the live system, other checkpoint's avail lists are static
     * and never change.
     *
     * Write the avail list last so it reflects changes due to allocating blocks for the alloc and
     * discard lists. Second, when we write the live system's avail list, it's two lists: the
     * current avail list plus the list of blocks to be made available when the new checkpoint
     * completes. We can't merge that second list into the real list yet, it's not truly available
     * until the new checkpoint locations have been saved to the metadata.
     */
    if (is_live) {
        block->final_ckpt = ckpt;
        ret = __wt_block_extlist_write(session, block, &ci->avail, &ci->ckpt_avail);
        block->final_ckpt = NULL;
        WT_RET(ret);
    }

    /*
     * Record the blocks allocated to write the extent lists. We must record blocks in the live
     * system's extent lists, as those blocks are a necessary part of the checkpoint a hot backup
     * might recover. Update blocks in extent lists used to rewrite other checkpoints (for example,
     * an intermediate checkpoint rewritten because a checkpoint was rolled into it), even though
     * it's not necessary: those blocks aren't the last checkpoint in the file and so aren't
     * included in a recoverable checkpoint, they don't matter on a hot backup target until they're
     * allocated and used in the context of a live system. Regardless, they shouldn't materially
     * affect how much data we're writing, and it keeps things more consistent on the target to
     * update them. (Ignore the live system's ckpt_avail list here. The blocks on that list were
     * written into the final avail extent list which will be copied to the hot backup, and that's
     * all that matters.)
     */
    WT_RET(__ckpt_add_blk_mods_ext(session, ckptbase, ci));

    /*
     * Set the file size for the live system.
     *
     * !!!
     * We do NOT set the file size when re-writing checkpoints because we want to test the
     * checkpoint's blocks against a reasonable maximum file size during verification. This is bad:
     * imagine a checkpoint appearing early in the file, re-written, and then the checkpoint
     * requires blocks at the end of the file, blocks after the listed file size. If the application
     * opens that checkpoint for writing (discarding subsequent checkpoints), we would truncate the
     * file to the early chunk, discarding the re-written checkpoint information. The alternative,
     * updating the file size has its own problems, in that case we'd work correctly, but we'd lose
     * all of the blocks between the original checkpoint and the re-written checkpoint. Currently,
     * there's no API to roll-forward intermediate checkpoints, if there ever is, this will need to
     * be fixed.
     */
    if (is_live)
        ci->file_size = block->size;

    /* Copy the COMPLETE checkpoint information into the checkpoint. */
    WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BLOCK_CHECKPOINT_BUFFER));
    endp = ckpt->raw.mem;
    WT_RET(__wt_block_ckpt_pack(session, block, &endp, ci, false));
    ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);

    if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_CHECKPOINT, WT_VERBOSE_DEBUG_2))
        __wt_ckpt_verbose(session, block, "create", ckpt->name, ckpt->raw.data, ckpt->raw.size);

    return (0);
}

/*
 * __wt_block_checkpoint_resolve --
 *     Resolve a checkpoint.
 */
int
__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block, bool failed)
{
    WT_BLOCK_CKPT *ci;
    WT_DECL_RET;

    ci = &block->live;

    /*
     * Resolve the checkpoint after our caller has written the checkpoint information to stable
     * storage.
     */
    __wt_spin_lock(session, &block->live_lock);
    switch (block->ckpt_state) {
    case WT_CKPT_INPROGRESS:
        /* Something went wrong, but it's recoverable at our level. */
        goto done;
    case WT_CKPT_NONE:
    case WT_CKPT_SALVAGE:
        ret = __wt_panic(session, EINVAL,
          "%s: an unexpected checkpoint resolution: the checkpoint was never started or completed, "
          "or configured for salvage",
          block->name);
        __wt_blkcache_set_readonly(session);
        break;
    case WT_CKPT_PANIC_ON_FAILURE:
        if (!failed)
            break;
        ret = __wt_panic(
          session, EINVAL, "%s: the checkpoint failed, the system must restart", block->name);
        __wt_blkcache_set_readonly(session);
        break;
    }
    WT_ERR(ret);

    if ((ret = __wt_block_extlist_merge(session, block, &ci->ckpt_avail, &ci->avail)) != 0) {
        ret = __wt_panic(
          session, ret, "%s: fatal checkpoint failure during extent list merge", block->name);
        __wt_blkcache_set_readonly(session);
    }
    __wt_spin_unlock(session, &block->live_lock);

    /* Discard the lists remaining after the checkpoint call. */
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_discard);

    __wt_spin_lock(session, &block->live_lock);
done:
    block->ckpt_state = WT_CKPT_NONE;
err:
    __wt_spin_unlock(session, &block->live_lock);

    return (ret);
}

#ifdef HAVE_UNITTEST
int
__ut_ckpt_add_blkmod_entry(
  WT_SESSION_IMPL *session, WT_BLOCK_MODS *blk_mod, wt_off_t offset, wt_off_t len)
{
    return (__ckpt_add_blkmod_entry(session, blk_mod, offset, len));
}
#endif