summaryrefslogtreecommitdiff
path: root/src/block/block_ckpt.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/block/block_ckpt.c')
-rw-r--r--src/block/block_ckpt.c708
1 files changed, 708 insertions, 0 deletions
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
new file mode 100644
index 00000000000..08e3856facd
--- /dev/null
+++ b/src/block/block_ckpt.c
@@ -0,0 +1,708 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __ckpt_string(
+ WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
+static int __ckpt_update(WT_SESSION_IMPL *,
+ WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, uint64_t, int);
+
+/*
+ * __wt_block_ckpt_init --
+ * Initialize a checkpoint structure.
+ */
+int
+__wt_block_ckpt_init(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_BLOCK_CKPT *ci, const char *name, int is_live)
+{
+ WT_DECL_RET;
+
+ /*
+ * If we're loading a new live checkpoint, there shouldn't be one
+ * already loaded. The btree engine should prevent this from ever
+ * happening, but paranoia is a healthy thing.
+ */
+ if (is_live) {
+ __wt_spin_lock(session, &block->live_lock);
+ if (block->live_load)
+ ret = EINVAL;
+ else
+ block->live_load = 1;
+ __wt_spin_unlock(session, &block->live_lock);
+ if (ret)
+ WT_RET_MSG(
+ session, EINVAL, "checkpoint already loaded");
+ }
+
+ memset(ci, 0, sizeof(*ci));
+
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+
+ WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc"));
+ WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail"));
+ WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard"));
+
+ ci->file_size = WT_BLOCK_DESC_SECTOR;
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->ckpt_avail, name, "ckpt_avail"));
+
+ return (0);
+}
+
+/*
+ * __wt_block_checkpoint_load --
+ * Load a checkpoint.
+ */
+int
+__wt_block_checkpoint_load(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
+ int readonly)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_UNUSED(addr_size);
+
+ /*
+ * Sometimes we don't find a root page (we weren't given a checkpoint,
+ * or the referenced checkpoint was empty). In that case we return a
+ * root page size of 0. Set that up now.
+ */
+ dsk->size = 0;
+
+ ci = &block->live;
+ WT_RET(__wt_block_ckpt_init(session, block, ci, "live", 1));
+
+ if (WT_VERBOSE_ISSET(session, ckpt)) {
+ if (addr != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(session, block, addr, tmp));
+ }
+ WT_VERBOSE_ERR(session, ckpt,
+ "%s: load-checkpoint: %s", block->name,
+ addr == NULL ? "[Empty]" : (char *)tmp->data);
+ }
+
+ /* If not loading a checkpoint from disk, we're done. */
+ if (addr == NULL || addr_size == 0)
+ return (0);
+
+ /* Crack the checkpoint cookie. */
+ if (addr != NULL)
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ /* Verify sets up next. */
+ if (block->verify)
+ WT_ERR(__wt_verify_ckpt_load(session, block, ci));
+
+ /* Read, and optionally verify, any root page. */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_ERR(__wt_block_read_off(session, block,
+ dsk, ci->root_offset, ci->root_size, ci->root_cksum));
+ if (block->verify) {
+ if (tmp == NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(
+ session, block, addr, tmp));
+ }
+ WT_ERR(
+ __wt_verify_dsk(session, (char *)tmp->data, dsk));
+ }
+ }
+
+ /*
+ * Rolling a checkpoint forward requires the avail list, the blocks from
+ * which we can allocate.
+ */
+ if (!readonly)
+ WT_ERR(__wt_block_extlist_read(session, block, &ci->avail));
+
+ /*
+ * If the checkpoint can be written, that means anything written after
+ * the checkpoint is no longer interesting. Truncate the file.
+ */
+ if (!readonly) {
+ WT_VERBOSE_ERR(session, ckpt,
+ "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size);
+ WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size));
+ }
+
+ if (0) {
+err: (void)__wt_block_checkpoint_unload(session, block);
+ }
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
/*
 * __wt_block_checkpoint_unload --
 *	Unload a checkpoint: discard the live system's extent lists and allow
 * another live checkpoint to be loaded.
 */
int
__wt_block_checkpoint_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_BLOCK_CKPT *ci;
	WT_DECL_RET;

	WT_VERBOSE_RETVAL(
	    session, ckpt, ret, "%s: unload checkpoint", block->name);

	ci = &block->live;

	/* Verify cleanup. */
	if (block->verify)
		WT_TRET(__wt_verify_ckpt_unload(session, block, ci));

	/* Discard the live system's extent lists. */
	__wt_block_ckpt_destroy(session, ci);

	/*
	 * Clear the live-load flag last, after cleanup is complete, so a new
	 * live checkpoint can be loaded (see __wt_block_ckpt_init).
	 */
	block->live_load = 0;

	return (ret);
}
+
+/*
+ * __wt_block_ckpt_destroy --
+ * Clear a checkpoint structure.
+ */
+void
+__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
+{
+ /* Discard the extent lists. */
+ __wt_block_extlist_free(session, &ci->alloc);
+ __wt_block_extlist_free(session, &ci->avail);
+ __wt_block_extlist_free(session, &ci->discard);
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+}
+
+/*
+ * __wt_block_checkpoint --
+ * Create a new checkpoint.
+ */
+int
+__wt_block_checkpoint(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase)
+{
+ WT_BLOCK_CKPT *ci;
+
+ ci = &block->live;
+ ci->version = WT_BM_CHECKPOINT_VERSION;
+
+ /*
+ * Write the root page: it's possible for there to be a checkpoint of
+ * an empty tree, in which case, we store an illegal root offset.
+ *
+ * XXX
+ * We happen to know that checkpoints are single-threaded above us in
+ * the btree engine. That's probably something we want to guarantee
+ * for any WiredTiger block manager.
+ */
+ if (buf == NULL) {
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+ ci->root_size = ci->root_cksum = 0;
+ } else
+ WT_RET(__wt_block_write_off(session, block, buf,
+ &ci->root_offset, &ci->root_size, &ci->root_cksum, 0));
+
+ /* Process the checkpoint list, deleting and updating as required. */
+ WT_RET(__ckpt_process(session, block, ckptbase));
+
+ /*
+ * Checkpoints have to hit disk (it would be reasonable to configure for
+ * lazy checkpoints, but we don't support them yet). Regardless, we're
+ * not holding any locks, other writers can proceed while we wait.
+ */
+ if (!F_ISSET(S2C(session), WT_CONN_NOSYNC))
+ WT_RET(__wt_fsync(session, block->fh));
+
+ return (0);
+}
+
+/*
+ * __ckpt_extlist_fblocks --
+ * If an extent list was read from disk, free its space to the live avail
+ * list.
+ */
+static inline int
+__ckpt_extlist_fblocks(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+ return (__wt_block_insert_ext(
+ session, &block->live.avail, el->offset, el->size));
+}
+
/*
 * __ckpt_process --
 *	Process the list of checkpoints: load the information for checkpoints
 * being deleted, merge their extent lists forward, re-write any modified
 * checkpoints and finally update the newly added checkpoint from the live
 * system.
 */
static int
__ckpt_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	int deleting, locked;

	ci = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before reading and
	 * merging checkpoint allocation and discard information from the
	 * checkpoints we're deleting, those operations change the underlying
	 * byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but there
	 * is no explicit "free the checkpoint information" call into the block
	 * manager; if there was an error in an upper level resulting in some
	 * previous checkpoint never being resolved, the list may not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * written as part of the last checkpoint because it was never resolved.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail"));

	/*
	 * To delete a checkpoint, we'll need extent list for it, and we have to
	 * read that from the disk.
	 */
	deleting = 0;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		/*
		 * To delete a checkpoint, we'll need checkpoint information for
		 * it and the subsequent checkpoint.  The test is tricky, load
		 * the current checkpoint's information if it's marked for
		 * deletion, or if it follows a checkpoint marked for deletion,
		 * where the boundary cases are the first checkpoint in the list
		 * and the last checkpoint in the list: if we're deleting the
		 * last checkpoint in the list, there's no next checkpoint, the
		 * checkpoint will be merged into the live tree.
		 */
		if (!F_ISSET(ckpt, WT_CKPT_DELETE) &&
		    (ckpt == ckptbase ||
		    F_ISSET(ckpt, WT_CKPT_ADD) ||
		    !F_ISSET(ckpt - 1, WT_CKPT_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a checkpoint structure, crack the cookie and read
		 * the checkpoint's extent lists.
		 *
		 * Ignore the avail list: checkpoint avail lists are only useful
		 * if we are rolling forward from the particular checkpoint and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live checkpoint,
		 * subsequent checkpoints might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
		/* From here "ci" is this checkpoint, not the live system. */
		ci = ckpt->bpriv;
		WT_ERR(__wt_block_ckpt_init(session, block, ci, ckpt->name, 0));
		WT_ERR(__wt_block_buffer_to_ckpt(
		    session, block, ckpt->raw.data, ci));
		WT_ERR(__wt_block_extlist_read(session, block, &ci->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &ci->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * delete checkpoint information without a lock, except for ranges
	 * merged into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, ckpt)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			WT_VERBOSE_ERR(session, ckpt,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(ckpt + 1, WT_CKPT_ADD))
			b = &block->live;
		else
			b = (ckpt + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.  Include the "from" checkpoint's
		 * avail list, it's going away.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(ckpt + 1, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap is fair game, move ranges
		 * appearing on both lists to the live checkpoint's newly
		 * available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(ckpt + 1, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.  Don't include the "to" checkpoint's
		 * avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(ckpt + 1, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(ckpt, WT_CKPT_ADD));
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, 0, 0));
		}

live_update:
	/* Switch back to the live system's information. */
	ci = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ci, ckpt_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the checkpoint was written and the final checkpoint
			 * size.  This violates layering but the alternative is
			 * a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.  (We could just read the system time in the
			 * session layer when updating the metadata file, but
			 * that won't work for the checkpoint size, and so we
			 * do both here.)
			 */
			ckpt->ckpt_size = ci->ckpt_size;
			WT_ERR(__wt_epoch(session, &ckpt->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &ci->alloc);
	WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &ci->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &ci->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "checkpoint incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/*
	 * Discard any checkpoint information we loaded.
	 *
	 * NOTE(review): only the extent lists are freed here; the WT_BLOCK_CKPT
	 * structure in ckpt->bpriv appears to be left for the owner of the
	 * WT_CKPT array to discard -- confirm against the callers.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL) {
			__wt_block_extlist_free(session, &ci->alloc);
			__wt_block_extlist_free(session, &ci->avail);
			__wt_block_extlist_free(session, &ci->discard);
		}

	__wt_scr_free(&tmp);
	return (ret);
}
+
/*
 * __ckpt_update --
 *	Update a checkpoint: write out its extent lists and rebuild the
 * address cookie in ckpt->raw.  The ckpt_size and is_live arguments are
 * only meaningful when updating the live system (is_live non-zero).
 */
static int
__ckpt_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt,
    WT_BLOCK_CKPT *ci, uint64_t ckpt_size, int is_live)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif
	/*
	 * Write the checkpoint's extent lists; we only write an avail list for
	 * the live system, other checkpoint's avail lists are static and never
	 * change.  When we do write the avail list for the live system it's
	 * two lists: the current avail list plus the list of blocks that are
	 * being made available as of the new checkpoint.  We can't merge that
	 * second list into the real list yet, it's not truly available until
	 * the new checkpoint location has been saved to the metadata.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &ci->avail, &ci->ckpt_avail));
	WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));

	/*
	 * Set the file size for the live system.
	 *
	 * XXX
	 * We do NOT set the file size when re-writing checkpoints because we
	 * want to test the checkpoint's blocks against a reasonable maximum
	 * file size during verification.  This is bad: imagine a checkpoint
	 * appearing early in the file, re-written, and then the checkpoint
	 * requires blocks at the end of the file, blocks after the listed file
	 * size.  If the application opens that checkpoint for writing
	 * (discarding subsequent checkpoints), we would truncate the file to
	 * the early chunk, discarding the re-written checkpoint information.
	 * The alternative, updating the file size has its own problems, in
	 * that case we'd work correctly, but we'd lose all of the blocks
	 * between the original checkpoint and the re-written checkpoint.
	 * Currently, there's no API to roll-forward intermediate checkpoints,
	 * if there ever is, this will need to be fixed.
	 */
	if (is_live)
		WT_RET(__wt_filesize(session, block->fh, &ci->file_size));

	/* Set the checkpoint size for the live system. */
	if (is_live)
		ci->ckpt_size = ckpt_size;

	/*
	 * Copy the checkpoint information into the checkpoint array's address
	 * cookie.
	 */
	WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
	endp = ckpt->raw.mem;
	WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
	ckpt->raw.size = WT_PTRDIFF32(endp, ckpt->raw.mem);

	if (WT_VERBOSE_ISSET(session, ckpt)) {
		/* If this allocation fails, tmp is still NULL: no leak. */
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
		WT_VERBOSE_ERR(session, ckpt,
		    "%s: create-checkpoint: %s: %s",
		    block->name, ckpt->name, (char *)tmp->data);
	}

err:	__wt_scr_free(&tmp);
	return (ret);
}
+
+/*
+ * __wt_block_checkpoint_resolve --
+ * Resolve a checkpoint.
+ */
+int
+__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /*
+ * Checkpoints are a two-step process: first, write a new checkpoint to
+ * disk (including all the new extent lists for modified checkpoints
+ * and the live system). As part of this, create a list of file blocks
+ * newly available for reallocation, based on checkpoints being deleted.
+ * We then return the locations of the new checkpoint information to our
+ * caller. Our caller has to write that information into some kind of
+ * stable storage, and once that's done, we can actually allocate from
+ * that list of newly available file blocks. (We can't allocate from
+ * that list immediately because the allocation might happen before our
+ * caller saves the new checkpoint information, and if we crashed before
+ * the new checkpoint location was saved, we'd have overwritten blocks
+ * still referenced by checkpoints in the system.) In summary, there is
+ * a second step: after our caller saves the checkpoint information, we
+ * are called to add the newly available blocks into the live system's
+ * available list.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail);
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Discard the list. */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+
+ return (ret);
+}
+
+/*
+ * __ckpt_string --
+ * Return a printable string representation of a checkpoint address cookie.
+ */
+static int
+__ckpt_string(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+
+ /* Initialize the checkpoint, crack the cookie. */
+ ci = &_ci;
+ WT_RET(__wt_block_ckpt_init(session, block, ci, "string", 0));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ WT_RET(__wt_buf_fmt(session, buf,
+ "version=%d",
+ ci->version));
+ if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", root=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->root_offset,
+ (uintmax_t)(ci->root_offset + ci->root_size),
+ ci->root_size, ci->root_cksum));
+ if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", alloc=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->alloc.offset,
+ (uintmax_t)(ci->alloc.offset + ci->alloc.size),
+ ci->alloc.size, ci->alloc.cksum));
+ if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", avail=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->avail.offset,
+ (uintmax_t)(ci->avail.offset + ci->avail.size),
+ ci->avail.size, ci->avail.cksum));
+ if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", discard=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->discard.offset,
+ (uintmax_t)(ci->discard.offset + ci->discard.size),
+ ci->discard.size, ci->discard.cksum));
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", file size=%" PRIuMAX
+ ", write generation=%" PRIu64,
+ (uintmax_t)ci->file_size,
+ ci->write_gen));
+
+ return (0);
+}