/*-
 * Copyright (c) 2014-2016 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *  All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
static int __ckpt_string(
    WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
static int __ckpt_update(
    WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, bool);

/*
 * __wt_block_ckpt_init --
 *     Initialize a checkpoint structure.
 */
int
__wt_block_ckpt_init(
    WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
    WT_CLEAR(*ci);
    ci->version = WT_BM_CHECKPOINT_VERSION;
    ci->root_offset = WT_BLOCK_INVALID_OFFSET;
    WT_RET(__wt_block_extlist_init(
        session, &ci->alloc, name, "alloc", false));
    WT_RET(__wt_block_extlist_init(
        session, &ci->avail, name, "avail", true));
    WT_RET(__wt_block_extlist_init(
        session, &ci->discard, name, "discard", false));
    WT_RET(__wt_block_extlist_init(
        session, &ci->ckpt_avail, name, "ckpt_avail", true));

    return (0);
}

/*
 * __wt_block_checkpoint_load --
 *     Load a checkpoint.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
    const uint8_t *addr, size_t addr_size,
    uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
{
    WT_BLOCK_CKPT *ci, _ci;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    uint8_t *endp;

    ci = NULL;

    /*
     * Sometimes we don't find a root page (we weren't given a checkpoint,
     * or the checkpoint was empty). In that case we return an empty root
     * address, set that up now.
     */
    *root_addr_sizep = 0;

#ifdef HAVE_VERBOSE
    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
        if (addr != NULL) {
            WT_ERR(__wt_scr_alloc(session, 0, &tmp));
            WT_ERR(__ckpt_string(session, block, addr, tmp));
        }
        __wt_verbose(session, WT_VERB_CHECKPOINT,
            "%s: load-checkpoint: %s", block->name,
            addr == NULL ? "[Empty]" : (const char *)tmp->data);
    }
#endif

    /*
     * There's a single checkpoint in the file that can be written, all of
     * the others are read-only. We use the same initialization calls for
     * readonly checkpoints, but the information doesn't persist.
     */
    if (checkpoint) {
        ci = &_ci;
        WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
    } else {
        /*
         * We depend on the btree level for locking: things will go bad
         * fast if we open the live system in two handles, or salvage,
         * truncate or verify the live/running file.
         */
#ifdef HAVE_DIAGNOSTIC
        __wt_spin_lock(session, &block->live_lock);
        WT_ASSERT(session, block->live_open == false);
        block->live_open = true;
        __wt_spin_unlock(session, &block->live_lock);
#endif
        ci = &block->live;
        WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
    }

    /*
     * If the checkpoint has an on-disk root page, load it. Otherwise, size
     * the file past the description information.
     */
    if (addr == NULL || addr_size == 0)
        ci->file_size = block->allocsize;
    else {
        /* Crack the checkpoint cookie. */
        WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));

        /* Verify sets up next. */
        if (block->verify)
            WT_ERR(__wt_verify_ckpt_load(session, block, ci));

        /* Read any root page. */
        if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
            endp = root_addr;
            WT_ERR(__wt_block_addr_to_buffer(block, &endp,
                ci->root_offset, ci->root_size, ci->root_checksum));
            *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
        }

        /*
         * Rolling a checkpoint forward requires the avail list, the
         * blocks from which we can allocate.
         */
        if (!checkpoint)
            WT_ERR(__wt_block_extlist_read_avail(
                session, block, &ci->avail, ci->file_size));
    }

    /*
     * If the checkpoint can be written, that means anything written after
     * the checkpoint is no longer interesting, truncate the file. Don't
     * bother checking the avail list for a block at the end of the file,
     * that was done when the checkpoint was first written (re-writing the
     * checkpoint might possibly make it relevant here, but it's unlikely
     * enough I don't bother).
     */
    if (!checkpoint)
        WT_ERR(__wt_block_truncate(session, block, ci->file_size));

    if (0) {
err:
        /*
         * Don't call checkpoint-unload: unload does real work including
         * file truncation. If we fail early enough that the checkpoint
         * information isn't correct, bad things would happen. The only
         * allocated memory was in the service of verify, clean that up.
         */
        if (block->verify)
            WT_TRET(__wt_verify_ckpt_unload(session, block));
    }

    /* Checkpoints don't need the original information, discard it. */
    if (checkpoint && ci != NULL)
        __wt_block_ckpt_destroy(session, ci);

    __wt_scr_free(session, &tmp);

    return (ret);
}

/*
 * __wt_block_checkpoint_unload --
 *     Unload a checkpoint.
 */
int
__wt_block_checkpoint_unload(
    WT_SESSION_IMPL *session, WT_BLOCK *block, bool checkpoint)
{
    WT_DECL_RET;

    /* Verify cleanup. */
    if (block->verify)
        WT_TRET(__wt_verify_ckpt_unload(session, block));

    /*
     * If it's the live system, truncate to discard any extended blocks and
     * discard the active extent lists. Hold the lock even though we're
     * unloading the live checkpoint, there could be readers active in
     * other checkpoints.
     */
    if (!checkpoint) {
        WT_TRET(__wt_block_truncate(session, block, block->size));

        __wt_spin_lock(session, &block->live_lock);
        __wt_block_ckpt_destroy(session, &block->live);
#ifdef HAVE_DIAGNOSTIC
        block->live_open = false;
#endif
        __wt_spin_unlock(session, &block->live_lock);
    }

    return (ret);
}

/*
 * __wt_block_ckpt_destroy --
 *     Clear a checkpoint structure.
 */
void
__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
{
    /* Discard the extent lists. */
    __wt_block_extlist_free(session, &ci->alloc);
    __wt_block_extlist_free(session, &ci->avail);
    __wt_block_extlist_free(session, &ci->discard);
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    __wt_block_extlist_free(session, &ci->ckpt_discard);
}

/*
 * __wt_block_checkpoint --
 *     Create a new checkpoint.
 */
int
__wt_block_checkpoint(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum)
{
    WT_BLOCK_CKPT *ci;
    WT_DECL_RET;

    ci = &block->live;

    /* Switch to first-fit allocation. */
    __wt_block_configure_first_fit(block, true);

    /*
     * Write the root page: it's possible for there to be a checkpoint of
     * an empty tree, in which case, we store an illegal root offset.
     *
     * !!!
     * We happen to know that checkpoints are single-threaded above us in
     * the btree engine. That's probably something we want to guarantee
     * for any WiredTiger block manager.
     */
    if (buf == NULL) {
        ci->root_offset = WT_BLOCK_INVALID_OFFSET;
        ci->root_size = ci->root_checksum = 0;
    } else
        WT_ERR(__wt_block_write_off(session, block, buf,
            &ci->root_offset, &ci->root_size, &ci->root_checksum,
            data_checksum, true, false));

    /*
     * Checkpoints are potentially reading/writing/merging lots of blocks,
     * pre-allocate structures for this thread's use.
     */
    WT_ERR(__wt_block_ext_prealloc(session, 250));

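    /*
     * Illustrative calling pattern (a sketch, not verbatim btree-layer
     * code): this function is only the first half of the two-step
     * protocol described at the top of __ckpt_process. The caller is
     * expected to do roughly:
     *
     *     __wt_block_checkpoint(session, block, buf, ckptbase, ...);
     *     ... save the returned checkpoint cookies to stable storage ...
     *     __wt_block_checkpoint_resolve(session, block);
     *
     * Blocks freed by deleted checkpoints only become available for
     * re-allocation after the resolve call.
     */
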
    /*
     * Process the checkpoint list, deleting and updating as required.
     */
    ret = __ckpt_process(session, block, ckptbase);

    /* Discard any excessive memory we've allocated. */
    WT_TRET(__wt_block_ext_discard(session, 250));

    /* Restore the original allocation plan. */
err:
    __wt_block_configure_first_fit(block, false);

    return (ret);
}

/*
 * __ckpt_extlist_read --
 *     Read a checkpoint's extent lists and copy them into the
 *     checkpoint's private structure.
 */
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
    WT_BLOCK_CKPT *ci;

    /*
     * Allocate a checkpoint structure, crack the cookie and read the
     * checkpoint's extent lists.
     *
     * Ignore the avail list: checkpoint avail lists are only useful if we
     * are rolling forward from the particular checkpoint and they
     * represent our best understanding of what blocks can be allocated.
     * If we are not operating on the live checkpoint, subsequent
     * checkpoints might have allocated those blocks, and the avail list
     * is useless. We don't discard it, because it is useful as part of
     * verification, but we don't re-write it either.
     */
    WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));

    ci = ckpt->bpriv;
    WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
    WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
    WT_RET(__wt_block_extlist_read(
        session, block, &ci->alloc, ci->file_size));
    WT_RET(__wt_block_extlist_read(
        session, block, &ci->discard, ci->file_size));

    return (0);
}

/*
 * __ckpt_extlist_fblocks --
 *     If a checkpoint's extent list is going away, free its blocks.
 */
static int
__ckpt_extlist_fblocks(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
{
    if (el->offset == WT_BLOCK_INVALID_OFFSET)
        return (0);

    /*
     * Free blocks used to write checkpoint extents into the live system's
     * checkpoint avail list (they were never on any alloc list). Do not
     * use the live system's avail list because that list is used to
     * decide if the file can be truncated, and we can't truncate any part
     * of the file that contains a previous checkpoint's extents.
     */
    return (__wt_block_insert_ext(
        session, block, &block->live.ckpt_avail, el->offset, el->size));
}

#ifdef HAVE_DIAGNOSTIC
/*
 * __ckpt_verify --
 *     Diagnostic code, confirm we get what we expect in the checkpoint
 *     array.
 */
static int
__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
    WT_CKPT *ckpt;

    /*
     * Fast check that we're seeing what we expect to see: some number of
     * checkpoints to add, delete or ignore, terminated by a new
     * checkpoint.
     */
    WT_CKPT_FOREACH(ckptbase, ckpt)
        switch (ckpt->flags) {
        case 0:
        case WT_CKPT_DELETE:
        case WT_CKPT_DELETE | WT_CKPT_FAKE:
        case WT_CKPT_FAKE:
            break;
        case WT_CKPT_ADD:
            if (ckpt[1].name == NULL)
                break;
            /* FALLTHROUGH */
        default:
            return (
                __wt_illegal_value(session, "checkpoint array"));
        }
    return (0);
}
#endif

/*
 * __ckpt_process --
 *     Process the list of checkpoints.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
    WT_BLOCK_CKPT *a, *b, *ci;
    WT_CKPT *ckpt, *next_ckpt;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    uint64_t ckpt_size;
    bool deleting, fatal, locked;

    ci = &block->live;
    fatal = locked = false;

#ifdef HAVE_DIAGNOSTIC
    WT_RET(__ckpt_verify(session, ckptbase));
#endif

    /*
     * Checkpoints are a two-step process: first, write a new checkpoint
     * to disk (including all the new extent lists for modified
     * checkpoints and the live system). As part of this, create a list of
     * file blocks newly available for reallocation, based on checkpoints
     * being deleted. We then return the locations of the new checkpoint
     * information to our caller.
     * Our caller has to write that information into some kind of stable
     * storage, and once that's done, we can actually allocate from that
     * list of newly available file blocks. (We can't allocate from that
     * list immediately because the allocation might happen before our
     * caller saves the new checkpoint information, and if we crashed
     * before the new checkpoint location was saved, we'd have overwritten
     * blocks still referenced by checkpoints in the system.) In summary,
     * there is a second step: after our caller saves the checkpoint
     * information, we are called to add the newly available blocks into
     * the live system's available list.
     *
     * This function is the first step, the second step is in the resolve
     * function.
     *
     * If we're called to checkpoint the same file twice (without the
     * second resolution step), or re-entered for any reason, it's an
     * error in our caller, and our choices are all bad: leak blocks or
     * potentially crash with our caller not yet having saved previous
     * checkpoint information to stable storage.
     */
    __wt_spin_lock(session, &block->live_lock);
    if (block->ckpt_inprogress)
        ret = __wt_block_panic(session, EINVAL,
            "%s: unexpected checkpoint ordering", block->name);
    else
        block->ckpt_inprogress = true;
    __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

    /*
     * Extents newly available as a result of deleting previous
     * checkpoints are added to a list of extents. The list should be
     * empty, but as described above, there is no "free the checkpoint
     * information" call into the block manager; if there was an error in
     * an upper level that resulted in some previous checkpoint never
     * being resolved, the list may not be empty. We should have caught
     * that with the "checkpoint in progress" test, but it doesn't cost us
     * anything to be cautious.
     *
     * We free the checkpoint's allocation and discard extent lists as
     * part of the resolution step, not because they're needed at that
     * time, but because it's potentially a lot of work, and waiting
     * allows the btree layer to continue eviction sooner. As for the
     * checkpoint-available list, make sure it gets cleaned out.
     */
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    WT_RET(__wt_block_extlist_init(
        session, &ci->ckpt_avail, "live", "ckpt_avail", true));
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_discard);

    /*
     * To delete a checkpoint, we'll need checkpoint information for it
     * and the subsequent checkpoint into which it gets rolled; read them
     * from disk before we lock things down.
     */
    deleting = false;
    WT_CKPT_FOREACH(ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
            !F_ISSET(ckpt, WT_CKPT_DELETE))
            continue;
        deleting = true;

        /*
         * Read the checkpoint and next checkpoint extent lists if we
         * haven't already read them (we may have already read these
         * extent blocks if there is more than one deleted checkpoint).
         */
        if (ckpt->bpriv == NULL)
            WT_ERR(__ckpt_extlist_read(session, block, ckpt));

        for (next_ckpt = ckpt + 1;; ++next_ckpt)
            if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
                break;

        /*
         * The "next" checkpoint may be the live tree which has no
         * extent blocks to read.
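         * (The live tree corresponds to the checkpoint being added, the
         * WT_CKPT_ADD slot, which is why that slot is never read from
         * disk here.)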
         */
        if (next_ckpt->bpriv == NULL &&
            !F_ISSET(next_ckpt, WT_CKPT_ADD))
            WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
    }

    /*
     * Failures are now fatal: we can't currently back out the merge of
     * any deleted checkpoint extent lists into the live system's extent
     * lists, so continuing after error would leave the live system's
     * extent lists corrupted for any subsequent checkpoint (and
     * potentially, should a subsequent checkpoint succeed, for recovery).
     */
    fatal = true;

    /*
     * Hold a lock so the live extent lists and the file size can't change
     * underneath us. I suspect we'll tighten this if checkpoints take too
     * much time away from real work: we read the historic checkpoint
     * information without a lock, but we could also merge and re-write
     * the deleted and merged checkpoint information without a lock,
     * except for the final merge of ranges into the live tree.
     */
    __wt_spin_lock(session, &block->live_lock);
    locked = true;

    /*
     * We've allocated our last page, update the checkpoint size. We need
     * to calculate the live system's checkpoint size before merging
     * checkpoint allocation and discard information from the checkpoints
     * we're deleting, those operations change the underlying byte counts.
     */
    ckpt_size = ci->ckpt_size;
    ckpt_size += ci->alloc.bytes;
    ckpt_size -= ci->discard.bytes;

    /* Skip the additional processing if we aren't deleting checkpoints. */
    if (!deleting)
        goto live_update;

    /*
     * Delete any no-longer-needed checkpoints: we do this first as it
     * frees blocks to the live lists, and the freed blocks will then be
     * included when writing the live extent lists.
     */
    WT_CKPT_FOREACH(ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
            !F_ISSET(ckpt, WT_CKPT_DELETE))
            continue;

#ifdef HAVE_VERBOSE
        if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
            if (tmp == NULL)
                WT_ERR(__wt_scr_alloc(session, 0, &tmp));
            WT_ERR(__ckpt_string(
                session, block, ckpt->raw.data, tmp));
            __wt_verbose(session, WT_VERB_CHECKPOINT,
                "%s: delete-checkpoint: %s: %s",
                block->name, ckpt->name, (const char *)tmp->data);
        }
#endif

        /*
         * Find the checkpoint into which we'll roll this checkpoint's
         * blocks: it's the next real checkpoint in the list, and it
         * better have been read in (if it's not the add slot).
         */
        for (next_ckpt = ckpt + 1;; ++next_ckpt)
            if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
                break;

        /*
         * Set the from/to checkpoint structures, where the "to" value
         * may be the live tree.
         */
        a = ckpt->bpriv;
        if (F_ISSET(next_ckpt, WT_CKPT_ADD))
            b = &block->live;
        else
            b = next_ckpt->bpriv;

        /*
         * Free the root page: there's nothing special about this free,
         * the root page is allocated using normal rules, that is, it
         * may have been taken from the avail list, and was entered on
         * the live system's alloc list at that time. We free it into
         * the checkpoint's discard list, however, not the live
         * system's list because it appears on the checkpoint's alloc
         * list and so must be paired in the checkpoint.
         */
        if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
            WT_ERR(__wt_block_insert_ext(session, block,
                &a->discard, a->root_offset, a->root_size));

        /*
         * Free the blocks used to hold the "from" checkpoint's extent
         * lists, including the avail list.
         */
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

        /*
         * Roll the "from" alloc and discard extent lists into the "to"
         * checkpoint's lists.
         */
        if (a->alloc.entries != 0)
            WT_ERR(__wt_block_extlist_merge(
                session, block, &a->alloc, &b->alloc));
        if (a->discard.entries != 0)
            WT_ERR(__wt_block_extlist_merge(
                session, block, &a->discard, &b->discard));

        /*
         * If the "to" checkpoint is also being deleted, we're done
         * with it, it's merged into some other checkpoint in the next
         * loop. This means the extent lists may aggregate over a
         * number of checkpoints, but that's OK, they're disjoint sets
         * of ranges.
         */
        if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
            continue;

        /*
         * Find blocks for re-use: wherever the "to" checkpoint's
         * allocate and discard lists overlap, move the range to
         * the live system's checkpoint available list.
         */
        WT_ERR(__wt_block_extlist_overlap(session, block, b));

        /*
         * If we're updating the live system's information, we're done.
         */
        if (F_ISSET(next_ckpt, WT_CKPT_ADD))
            continue;

        /*
         * We have to write the "to" checkpoint's extent lists out in
         * new blocks, and update its cookie.
         *
         * Free the blocks used to hold the "to" checkpoint's extent
         * lists; don't include the avail list, it's not changing.
         */
        WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

        F_SET(next_ckpt, WT_CKPT_UPDATE);
    }

    /* Update checkpoints marked for update. */
    WT_CKPT_FOREACH(ckptbase, ckpt)
        if (F_ISSET(ckpt, WT_CKPT_UPDATE))
            WT_ERR(__ckpt_update(
                session, block, ckpt, ckpt->bpriv, false));

live_update:
    /* Truncate the file if that's possible. */
    WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

    /* Update the final, added checkpoint based on the live system. */
    WT_CKPT_FOREACH(ckptbase, ckpt)
        if (F_ISSET(ckpt, WT_CKPT_ADD)) {
            /*
             * !!!
             * Our caller wants the final checkpoint size. Setting
             * the size here violates layering, but the alternative
             * is a call for the btree layer to crack the checkpoint
             * cookie into its components, and that's a fair amount
             * of work.
             */
            ckpt->ckpt_size = ckpt_size;

            /*
             * Set the rolling checkpoint size for the live system.
             * The current size includes the current checkpoint's
             * root page size (root pages are on the checkpoint's
             * block allocation list as root pages are allocated
             * with the usual block allocation functions). That's
             * correct, but we don't want to include it in the size
             * for the next checkpoint.
             */
            ckpt_size -= ci->root_size;

            /*
             * Additionally, we had a bug for a while where the live
             * checkpoint size grew without bound. We can't sanity
             * check the value, that would require walking the tree
             * as part of the checkpoint. Bound any bug at the size
             * of the file. It isn't practical to assert that the
             * value is within bounds since databases created with
             * older versions of WiredTiger (2.8.0) would likely see
             * an error.
             */
            ci->ckpt_size =
                WT_MIN(ckpt_size, (uint64_t)block->size);

            WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
        }

    /*
     * Reset the live system's alloc and discard extent lists, leave the
     * avail list alone. This includes freeing a lot of extents, so do it
     * outside of the system's lock by copying and resetting the original,
     * then doing the work later.
     */
    ci->ckpt_alloc = ci->alloc;
    WT_ERR(__wt_block_extlist_init(
        session, &ci->alloc, "live", "alloc", false));
    ci->ckpt_discard = ci->discard;
    WT_ERR(__wt_block_extlist_init(
        session, &ci->discard, "live", "discard", false));

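    /*
     * The saved ckpt_alloc and ckpt_discard copies are freed later, in
     * __wt_block_checkpoint_resolve, after the caller has made the new
     * checkpoint durable.
     */
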
#ifdef HAVE_DIAGNOSTIC
    /*
     * The first checkpoint in the system should always have an empty
     * discard list. If we've read that checkpoint and/or created it,
     * check.
     */
    WT_CKPT_FOREACH(ckptbase, ckpt)
        if (!F_ISSET(ckpt, WT_CKPT_DELETE))
            break;
    if ((a = ckpt->bpriv) == NULL)
        a = &block->live;
    if (a->discard.entries != 0)
        WT_ERR_MSG(session, WT_ERROR,
            "first checkpoint incorrectly has blocks on the discard list");
#endif

err:
    if (ret != 0 && fatal)
        ret = __wt_block_panic(session, ret,
            "%s: fatal checkpoint failure", block->name);

    if (locked)
        __wt_spin_unlock(session, &block->live_lock);

    /* Discard any checkpoint information we loaded. */
    WT_CKPT_FOREACH(ckptbase, ckpt)
        if ((ci = ckpt->bpriv) != NULL)
            __wt_block_ckpt_destroy(session, ci);

    __wt_scr_free(session, &tmp);

    return (ret);
}

/*
 * __ckpt_update --
 *     Update a checkpoint.
 */
static int
__ckpt_update(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, bool is_live)
{
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
    /* Check the extent list combinations for overlaps. */
    WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
    WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
    WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif

    /*
     * Write the checkpoint's alloc and discard extent lists. After each
     * write, remove any allocated blocks from the system's allocation
     * list, checkpoint extent blocks don't appear on any extent lists.
     */
    WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
    WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));

    /*
     * We only write an avail list for the live system, other checkpoints'
     * avail lists are static and never change.
     *
     * Write the avail list last so it reflects changes due to allocating
     * blocks for the alloc and discard lists. Second, when we write the
     * live system's avail list, it's two lists: the current avail list
     * plus the list of blocks to be made available when the new
     * checkpoint completes. We can't merge that second list into the real
     * list yet, it's not truly available until the new checkpoint
     * locations have been saved to the metadata.
     */
    if (is_live)
        WT_RET(__wt_block_extlist_write(
            session, block, &ci->avail, &ci->ckpt_avail));

    /*
     * Set the file size for the live system.
     *
     * !!!
     * We do NOT set the file size when re-writing checkpoints because we
     * want to test the checkpoint's blocks against a reasonable maximum
     * file size during verification. This is bad: imagine a checkpoint
     * appearing early in the file, re-written, and then the checkpoint
     * requires blocks at the end of the file, blocks after the listed
     * file size. If the application opens that checkpoint for writing
     * (discarding subsequent checkpoints), we would truncate the file to
     * the early chunk, discarding the re-written checkpoint information.
     * The alternative, updating the file size, has its own problems: in
     * that case we'd work correctly, but we'd lose all of the blocks
     * between the original checkpoint and the re-written checkpoint.
     * Currently, there's no API to roll forward intermediate checkpoints;
     * if there ever is, this will need to be fixed.
     */
    if (is_live)
        ci->file_size = block->size;

    /*
     * Copy the checkpoint information into the checkpoint array's address
     * cookie.
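     * (See __ckpt_string below for the fields the cookie carries: the
     * version, the root, alloc, avail and discard addresses, and the
     * file size.)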
     */
    WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
    endp = ckpt->raw.mem;
    WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
    ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);

    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
        WT_RET(__wt_scr_alloc(session, 0, &tmp));
        WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
        __wt_verbose(session, WT_VERB_CHECKPOINT,
            "%s: create-checkpoint: %s: %s",
            block->name, ckpt->name, (const char *)tmp->data);
    }

err:
    __wt_scr_free(session, &tmp);
    return (ret);
}

/*
 * __wt_block_checkpoint_resolve --
 *     Resolve a checkpoint.
 */
int
__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_BLOCK_CKPT *ci;
    WT_DECL_RET;

    ci = &block->live;

    /*
     * Resolve the checkpoint after our caller has written the checkpoint
     * information to stable storage.
     */
    __wt_spin_lock(session, &block->live_lock);
    if (!block->ckpt_inprogress)
        WT_ERR(__wt_block_panic(session, WT_ERROR,
            "%s: checkpoint resolution with no checkpoint in progress",
            block->name));

    if ((ret = __wt_block_extlist_merge(
        session, block, &ci->ckpt_avail, &ci->avail)) != 0)
        WT_ERR(__wt_block_panic(session, ret,
            "%s: fatal checkpoint failure", block->name));
    __wt_spin_unlock(session, &block->live_lock);

    /* Discard the lists remaining after the checkpoint call. */
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_discard);

    __wt_spin_lock(session, &block->live_lock);
    block->ckpt_inprogress = false;
err:
    __wt_spin_unlock(session, &block->live_lock);

    return (ret);
}

/*
 * __ckpt_string --
 *     Return a printable string representation of a checkpoint address
 *     cookie.
 */
static int
__ckpt_string(WT_SESSION_IMPL *session,
    WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
{
    WT_BLOCK_CKPT *ci, _ci;

    /* Initialize the checkpoint, crack the cookie. */
    ci = &_ci;
    WT_RET(__wt_block_ckpt_init(session, ci, "string"));
    WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));

    WT_RET(__wt_buf_fmt(session, buf, "version=%" PRIu8, ci->version));
    if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
        WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
    else
        WT_RET(__wt_buf_catfmt(session, buf,
            ", root=[%" PRIuMAX "-%" PRIuMAX
            ", %" PRIu32 ", %" PRIu32 "]",
            (uintmax_t)ci->root_offset,
            (uintmax_t)(ci->root_offset + ci->root_size),
            ci->root_size, ci->root_checksum));
    if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
        WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
    else
        WT_RET(__wt_buf_catfmt(session, buf,
            ", alloc=[%" PRIuMAX "-%" PRIuMAX
            ", %" PRIu32 ", %" PRIu32 "]",
            (uintmax_t)ci->alloc.offset,
            (uintmax_t)(ci->alloc.offset + ci->alloc.size),
            ci->alloc.size, ci->alloc.checksum));
    if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
        WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
    else
        WT_RET(__wt_buf_catfmt(session, buf,
            ", avail=[%" PRIuMAX "-%" PRIuMAX
            ", %" PRIu32 ", %" PRIu32 "]",
            (uintmax_t)ci->avail.offset,
            (uintmax_t)(ci->avail.offset + ci->avail.size),
            ci->avail.size, ci->avail.checksum));
    if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
        WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
    else
        WT_RET(__wt_buf_catfmt(session, buf,
            ", discard=[%" PRIuMAX "-%" PRIuMAX
            ", %" PRIu32 ", %" PRIu32 "]",
            (uintmax_t)ci->discard.offset,
            (uintmax_t)(ci->discard.offset + ci->discard.size),
            ci->discard.size, ci->discard.checksum));
    WT_RET(__wt_buf_catfmt(session, buf,
        ", file size=%" PRIuMAX, (uintmax_t)ci->file_size));

    __wt_block_ckpt_destroy(session, ci);

    return (0);
}