summaryrefslogtreecommitdiff
path: root/src/block/block_ckpt.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/block/block_ckpt.c')
-rw-r--r--src/block/block_ckpt.c708
1 files changed, 708 insertions, 0 deletions
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
new file mode 100644
index 00000000000..08e3856facd
--- /dev/null
+++ b/src/block/block_ckpt.c
@@ -0,0 +1,708 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __ckpt_string(
+ WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
+static int __ckpt_update(WT_SESSION_IMPL *,
+ WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, uint64_t, int);
+
+/*
+ * __wt_block_ckpt_init --
+ * Initialize a checkpoint structure.
+ */
+int
+__wt_block_ckpt_init(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_BLOCK_CKPT *ci, const char *name, int is_live)
+{
+ WT_DECL_RET;
+
+ /*
+ * If we're loading a new live checkpoint, there shouldn't be one
+ * already loaded. The btree engine should prevent this from ever
+ * happening, but paranoia is a healthy thing.
+ */
+ if (is_live) {
+ __wt_spin_lock(session, &block->live_lock);
+ if (block->live_load)
+ ret = EINVAL;
+ else
+ block->live_load = 1;
+ __wt_spin_unlock(session, &block->live_lock);
+ if (ret)
+ WT_RET_MSG(
+ session, EINVAL, "checkpoint already loaded");
+ }
+
+ memset(ci, 0, sizeof(*ci));
+
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+
+ WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc"));
+ WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail"));
+ WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard"));
+
+ ci->file_size = WT_BLOCK_DESC_SECTOR;
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->ckpt_avail, name, "ckpt_avail"));
+
+ return (0);
+}
+
+/*
+ * __wt_block_checkpoint_load --
+ * Load a checkpoint.
+ */
+int
+__wt_block_checkpoint_load(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
+ int readonly)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_UNUSED(addr_size);
+
+ /*
+ * Sometimes we don't find a root page (we weren't given a checkpoint,
+ * or the referenced checkpoint was empty). In that case we return a
+ * root page size of 0. Set that up now.
+ */
+ dsk->size = 0;
+
+ ci = &block->live;
+ WT_RET(__wt_block_ckpt_init(session, block, ci, "live", 1));
+
+ if (WT_VERBOSE_ISSET(session, ckpt)) {
+ if (addr != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(session, block, addr, tmp));
+ }
+ WT_VERBOSE_ERR(session, ckpt,
+ "%s: load-checkpoint: %s", block->name,
+ addr == NULL ? "[Empty]" : (char *)tmp->data);
+ }
+
+ /* If not loading a checkpoint from disk, we're done. */
+ if (addr == NULL || addr_size == 0)
+ return (0);
+
+ /* Crack the checkpoint cookie. */
+ if (addr != NULL)
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ /* Verify sets up next. */
+ if (block->verify)
+ WT_ERR(__wt_verify_ckpt_load(session, block, ci));
+
+ /* Read, and optionally verify, any root page. */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_ERR(__wt_block_read_off(session, block,
+ dsk, ci->root_offset, ci->root_size, ci->root_cksum));
+ if (block->verify) {
+ if (tmp == NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(
+ session, block, addr, tmp));
+ }
+ WT_ERR(
+ __wt_verify_dsk(session, (char *)tmp->data, dsk));
+ }
+ }
+
+ /*
+ * Rolling a checkpoint forward requires the avail list, the blocks from
+ * which we can allocate.
+ */
+ if (!readonly)
+ WT_ERR(__wt_block_extlist_read(session, block, &ci->avail));
+
+ /*
+ * If the checkpoint can be written, that means anything written after
+ * the checkpoint is no longer interesting. Truncate the file.
+ */
+ if (!readonly) {
+ WT_VERBOSE_ERR(session, ckpt,
+ "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size);
+ WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size));
+ }
+
+ if (0) {
+err: (void)__wt_block_checkpoint_unload(session, block);
+ }
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
/*
 * __wt_block_checkpoint_unload --
 *	Unload a checkpoint: discard the live system's extent lists and allow
 * another live checkpoint to be loaded.
 */
int
__wt_block_checkpoint_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_BLOCK_CKPT *ci;
	WT_DECL_RET;

	WT_VERBOSE_RETVAL(
	    session, ckpt, ret, "%s: unload checkpoint", block->name);

	ci = &block->live;

	/* Verify cleanup. */
	if (block->verify)
		WT_TRET(__wt_verify_ckpt_unload(session, block, ci));

	/* Discard the live system's extent lists. */
	__wt_block_ckpt_destroy(session, ci);

	/*
	 * Clear the live-load flag last, after cleanup is complete, so a new
	 * live checkpoint can be loaded (see __wt_block_ckpt_init).
	 */
	block->live_load = 0;

	return (ret);
}
+
+/*
+ * __wt_block_ckpt_destroy --
+ * Clear a checkpoint structure.
+ */
+void
+__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
+{
+ /* Discard the extent lists. */
+ __wt_block_extlist_free(session, &ci->alloc);
+ __wt_block_extlist_free(session, &ci->avail);
+ __wt_block_extlist_free(session, &ci->discard);
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+}
+
+/*
+ * __wt_block_checkpoint --
+ * Create a new checkpoint.
+ */
+int
+__wt_block_checkpoint(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase)
+{
+ WT_BLOCK_CKPT *ci;
+
+ ci = &block->live;
+ ci->version = WT_BM_CHECKPOINT_VERSION;
+
+ /*
+ * Write the root page: it's possible for there to be a checkpoint of
+ * an empty tree, in which case, we store an illegal root offset.
+ *
+ * XXX
+ * We happen to know that checkpoints are single-threaded above us in
+ * the btree engine. That's probably something we want to guarantee
+ * for any WiredTiger block manager.
+ */
+ if (buf == NULL) {
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+ ci->root_size = ci->root_cksum = 0;
+ } else
+ WT_RET(__wt_block_write_off(session, block, buf,
+ &ci->root_offset, &ci->root_size, &ci->root_cksum, 0));
+
+ /* Process the checkpoint list, deleting and updating as required. */
+ WT_RET(__ckpt_process(session, block, ckptbase));
+
+ /*
+ * Checkpoints have to hit disk (it would be reasonable to configure for
+ * lazy checkpoints, but we don't support them yet). Regardless, we're
+ * not holding any locks, other writers can proceed while we wait.
+ */
+ if (!F_ISSET(S2C(session), WT_CONN_NOSYNC))
+ WT_RET(__wt_fsync(session, block->fh));
+
+ return (0);
+}
+
+/*
+ * __ckpt_extlist_fblocks --
+ * If an extent list was read from disk, free its space to the live avail
+ * list.
+ */
+static inline int
+__ckpt_extlist_fblocks(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+ return (__wt_block_insert_ext(
+ session, &block->live.avail, el->offset, el->size));
+}
+
/*
 * __ckpt_process --
 *	Process the list of checkpoints: load the information for checkpoints
 * being deleted, merge their extent lists forward, re-write any modified
 * checkpoints and finally update the newly added checkpoint from the live
 * system.
 */
static int
__ckpt_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	int deleting, locked;

	ci = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before reading and
	 * merging checkpoint allocation and discard information from the
	 * checkpoints we're deleting, those operations change the underlying
	 * byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but there
	 * is no explicit "free the checkpoint information" call into the block
	 * manager; if there was an error in an upper level resulting in some
	 * previous checkpoint never being resolved, the list may not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * written as part of the last checkpoint because it was never resolved.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail"));

	/*
	 * To delete a checkpoint, we'll need extent list for it, and we have to
	 * read that from the disk.
	 */
	deleting = 0;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		/*
		 * To delete a checkpoint, we'll need checkpoint information for
		 * it and the subsequent checkpoint.  The test is tricky, load
		 * the current checkpoint's information if it's marked for
		 * deletion, or if it follows a checkpoint marked for deletion,
		 * where the boundary cases are the first checkpoint in the list
		 * and the last checkpoint in the list: if we're deleting the
		 * last checkpoint in the list, there's no next checkpoint, the
		 * checkpoint will be merged into the live tree.
		 */
		if (!F_ISSET(ckpt, WT_CKPT_DELETE) &&
		    (ckpt == ckptbase ||
		    F_ISSET(ckpt, WT_CKPT_ADD) ||
		    !F_ISSET(ckpt - 1, WT_CKPT_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a checkpoint structure, crack the cookie and read
		 * the checkpoint's extent lists.
		 *
		 * Ignore the avail list: checkpoint avail lists are only useful
		 * if we are rolling forward from the particular checkpoint and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live checkpoint,
		 * subsequent checkpoints might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
		/* From here "ci" is this checkpoint, not the live system. */
		ci = ckpt->bpriv;
		WT_ERR(__wt_block_ckpt_init(session, block, ci, ckpt->name, 0));
		WT_ERR(__wt_block_buffer_to_ckpt(
		    session, block, ckpt->raw.data, ci));
		WT_ERR(__wt_block_extlist_read(session, block, &ci->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &ci->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * delete checkpoint information without a lock, except for ranges
	 * merged into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, ckpt)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			WT_VERBOSE_ERR(session, ckpt,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(ckpt + 1, WT_CKPT_ADD))
			b = &block->live;
		else
			b = (ckpt + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.  Include the "from" checkpoint's
		 * avail list, it's going away.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(ckpt + 1, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap is fair game, move ranges
		 * appearing on both lists to the live checkpoint's newly
		 * available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(ckpt + 1, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.  Don't include the "to" checkpoint's
		 * avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(ckpt + 1, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(ckpt, WT_CKPT_ADD));
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, 0, 0));
		}

live_update:
	/* Switch back to the live system's information. */
	ci = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ci, ckpt_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the checkpoint was written and the final checkpoint
			 * size.  This violates layering but the alternative is
			 * a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.  (We could just read the system time in the
			 * session layer when updating the metadata file, but
			 * that won't work for the checkpoint size, and so we
			 * do both here.)
			 */
			ckpt->ckpt_size = ci->ckpt_size;
			WT_ERR(__wt_epoch(session, &ckpt->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &ci->alloc);
	WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &ci->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &ci->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "checkpoint incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/*
	 * Discard any checkpoint information we loaded.
	 *
	 * NOTE(review): only the extent lists are freed here; the WT_BLOCK_CKPT
	 * structure in ckpt->bpriv appears to be left for the owner of the
	 * WT_CKPT array to discard -- confirm against the callers.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL) {
			__wt_block_extlist_free(session, &ci->alloc);
			__wt_block_extlist_free(session, &ci->avail);
			__wt_block_extlist_free(session, &ci->discard);
		}

	__wt_scr_free(&tmp);
	return (ret);
}
+
/*
 * __ckpt_update --
 *	Update a checkpoint: write out its extent lists and rebuild the
 * address cookie in ckpt->raw.  The ckpt_size and is_live arguments are
 * only meaningful when updating the live system (is_live non-zero).
 */
static int
__ckpt_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt,
    WT_BLOCK_CKPT *ci, uint64_t ckpt_size, int is_live)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif
	/*
	 * Write the checkpoint's extent lists; we only write an avail list for
	 * the live system, other checkpoint's avail lists are static and never
	 * change.  When we do write the avail list for the live system it's
	 * two lists: the current avail list plus the list of blocks that are
	 * being made available as of the new checkpoint.  We can't merge that
	 * second list into the real list yet, it's not truly available until
	 * the new checkpoint location has been saved to the metadata.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &ci->avail, &ci->ckpt_avail));
	WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));

	/*
	 * Set the file size for the live system.
	 *
	 * XXX
	 * We do NOT set the file size when re-writing checkpoints because we
	 * want to test the checkpoint's blocks against a reasonable maximum
	 * file size during verification.  This is bad: imagine a checkpoint
	 * appearing early in the file, re-written, and then the checkpoint
	 * requires blocks at the end of the file, blocks after the listed file
	 * size.  If the application opens that checkpoint for writing
	 * (discarding subsequent checkpoints), we would truncate the file to
	 * the early chunk, discarding the re-written checkpoint information.
	 * The alternative, updating the file size has its own problems, in
	 * that case we'd work correctly, but we'd lose all of the blocks
	 * between the original checkpoint and the re-written checkpoint.
	 * Currently, there's no API to roll-forward intermediate checkpoints,
	 * if there ever is, this will need to be fixed.
	 */
	if (is_live)
		WT_RET(__wt_filesize(session, block->fh, &ci->file_size));

	/* Set the checkpoint size for the live system. */
	if (is_live)
		ci->ckpt_size = ckpt_size;

	/*
	 * Copy the checkpoint information into the checkpoint array's address
	 * cookie.
	 */
	WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
	endp = ckpt->raw.mem;
	WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
	ckpt->raw.size = WT_PTRDIFF32(endp, ckpt->raw.mem);

	if (WT_VERBOSE_ISSET(session, ckpt)) {
		/* If this allocation fails, tmp is still NULL: no leak. */
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
		WT_VERBOSE_ERR(session, ckpt,
		    "%s: create-checkpoint: %s: %s",
		    block->name, ckpt->name, (char *)tmp->data);
	}

err:	__wt_scr_free(&tmp);
	return (ret);
}
+
+/*
+ * __wt_block_checkpoint_resolve --
+ * Resolve a checkpoint.
+ */
+int
+__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /*
+ * Checkpoints are a two-step process: first, write a new checkpoint to
+ * disk (including all the new extent lists for modified checkpoints
+ * and the live system). As part of this, create a list of file blocks
+ * newly available for reallocation, based on checkpoints being deleted.
+ * We then return the locations of the new checkpoint information to our
+ * caller. Our caller has to write that information into some kind of
+ * stable storage, and once that's done, we can actually allocate from
+ * that list of newly available file blocks. (We can't allocate from
+ * that list immediately because the allocation might happen before our
+ * caller saves the new checkpoint information, and if we crashed before
+ * the new checkpoint location was saved, we'd have overwritten blocks
+ * still referenced by checkpoints in the system.) In summary, there is
+ * a second step: after our caller saves the checkpoint information, we
+ * are called to add the newly available blocks into the live system's
+ * available list.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail);
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Discard the list. */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+
+ return (ret);
+}
+
+/*
+ * __ckpt_string --
+ * Return a printable string representation of a checkpoint address cookie.
+ */
+static int
+__ckpt_string(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+
+ /* Initialize the checkpoint, crack the cookie. */
+ ci = &_ci;
+ WT_RET(__wt_block_ckpt_init(session, block, ci, "string", 0));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ WT_RET(__wt_buf_fmt(session, buf,
+ "version=%d",
+ ci->version));
+ if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", root=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->root_offset,
+ (uintmax_t)(ci->root_offset + ci->root_size),
+ ci->root_size, ci->root_cksum));
+ if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", alloc=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->alloc.offset,
+ (uintmax_t)(ci->alloc.offset + ci->alloc.size),
+ ci->alloc.size, ci->alloc.cksum));
+ if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", avail=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->avail.offset,
+ (uintmax_t)(ci->avail.offset + ci->avail.size),
+ ci->avail.size, ci->avail.cksum));
+ if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", discard=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->discard.offset,
+ (uintmax_t)(ci->discard.offset + ci->discard.size),
+ ci->discard.size, ci->discard.cksum));
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", file size=%" PRIuMAX
+ ", write generation=%" PRIu64,
+ (uintmax_t)ci->file_size,
+ ci->write_gen));
+
+ return (0);
+}