Diffstat (limited to 'src/third_party/wiredtiger/src/block/block_ckpt.c')
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ckpt.c  1463
1 file changed, 701 insertions(+), 762 deletions(-)
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
index 9b7a42b5b9c..5e2f261a424 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -9,882 +9,821 @@
#include "wt_internal.h"
static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
-static int __ckpt_update(WT_SESSION_IMPL *,
- WT_BLOCK *, WT_CKPT *, WT_CKPT *, WT_BLOCK_CKPT *, bool);
+static int __ckpt_update(
+ WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_CKPT *, WT_BLOCK_CKPT *, bool);
/*
* __wt_block_ckpt_init --
- * Initialize a checkpoint structure.
+ * Initialize a checkpoint structure.
*/
int
-__wt_block_ckpt_init(
- WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
+__wt_block_ckpt_init(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
- WT_CLEAR(*ci);
+ WT_CLEAR(*ci);
- ci->version = WT_BM_CHECKPOINT_VERSION;
- ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+ ci->version = WT_BM_CHECKPOINT_VERSION;
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
- WT_RET(__wt_block_extlist_init(
- session, &ci->alloc, name, "alloc", false));
- WT_RET(__wt_block_extlist_init(
- session, &ci->avail, name, "avail", true));
- WT_RET(__wt_block_extlist_init(
- session, &ci->discard, name, "discard", false));
- WT_RET(__wt_block_extlist_init(
- session, &ci->ckpt_avail, name, "ckpt_avail", true));
+ WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc", false));
+ WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail", true));
+ WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard", false));
+ WT_RET(__wt_block_extlist_init(session, &ci->ckpt_avail, name, "ckpt_avail", true));
- return (0);
+ return (0);
}
/*
* __wt_block_checkpoint_load --
- * Load a checkpoint.
+ * Load a checkpoint.
*/
int
-__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
- const uint8_t *addr, size_t addr_size,
- uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
+__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr,
+ size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
{
- WT_BLOCK_CKPT *ci, _ci;
- WT_DECL_RET;
- uint8_t *endp;
-
- /*
- * Sometimes we don't find a root page (we weren't given a checkpoint,
- * or the checkpoint was empty). In that case we return an empty root
- * address, set that up now.
- */
- *root_addr_sizep = 0;
-
- ci = NULL;
-
- if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
- __wt_ckpt_verbose(session, block, "load", NULL, addr);
-
- /*
- * There's a single checkpoint in the file that can be written, all of
- * the others are read-only. We use the same initialization calls for
- * readonly checkpoints, but the information doesn't persist.
- */
- if (checkpoint) {
- ci = &_ci;
- WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
- } else {
- /*
- * We depend on the btree level for locking: things will go bad
- * fast if we open the live system in two handles, or salvage,
- * truncate or verify the live/running file.
- */
+ WT_BLOCK_CKPT *ci, _ci;
+ WT_DECL_RET;
+ uint8_t *endp;
+
+ /*
+ * Sometimes we don't find a root page (we weren't given a checkpoint, or the checkpoint was
+ * empty). In that case we return an empty root address, set that up now.
+ */
+ *root_addr_sizep = 0;
+
+ ci = NULL;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ __wt_ckpt_verbose(session, block, "load", NULL, addr);
+
+ /*
+ * There's a single checkpoint in the file that can be written, all of the others are read-only.
+ * We use the same initialization calls for readonly checkpoints, but the information doesn't
+ * persist.
+ */
+ if (checkpoint) {
+ ci = &_ci;
+ WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
+ } else {
+/*
+ * We depend on the btree level for locking: things will go bad fast if we open the live system in
+ * two handles, or salvage, truncate or verify the live/running file.
+ */
#ifdef HAVE_DIAGNOSTIC
- __wt_spin_lock(session, &block->live_lock);
- WT_ASSERT(session, block->live_open == false);
- block->live_open = true;
- __wt_spin_unlock(session, &block->live_lock);
+ __wt_spin_lock(session, &block->live_lock);
+ WT_ASSERT(session, block->live_open == false);
+ block->live_open = true;
+ __wt_spin_unlock(session, &block->live_lock);
#endif
- ci = &block->live;
- WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
- }
-
- /*
- * If the checkpoint has an on-disk root page, load it. Otherwise, size
- * the file past the description information.
- */
- if (addr == NULL || addr_size == 0)
- ci->file_size = block->allocsize;
- else {
- /* Crack the checkpoint cookie. */
- WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
-
- /* Verify sets up next. */
- if (block->verify)
- WT_ERR(__wt_verify_ckpt_load(session, block, ci));
-
- /* Read any root page. */
- if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
- endp = root_addr;
- WT_ERR(__wt_block_addr_to_buffer(block, &endp,
- ci->root_offset, ci->root_size, ci->root_checksum));
- *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
- }
-
- /*
- * Rolling a checkpoint forward requires the avail list, the
- * blocks from which we can allocate.
- */
- if (!checkpoint)
- WT_ERR(__wt_block_extlist_read_avail(
- session, block, &ci->avail, ci->file_size));
- }
-
- /*
- * If the checkpoint can be written, that means anything written after
- * the checkpoint is no longer interesting, truncate the file. Don't
- * bother checking the avail list for a block at the end of the file,
- * that was done when the checkpoint was first written (re-writing the
- * checkpoint might possibly make it relevant here, but it's unlikely
- * enough I don't bother).
- */
- if (!checkpoint)
- WT_ERR(__wt_block_truncate(session, block, ci->file_size));
-
- if (0) {
+ ci = &block->live;
+ WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
+ }
+
+ /*
+ * If the checkpoint has an on-disk root page, load it. Otherwise, size the file past the
+ * description information.
+ */
+ if (addr == NULL || addr_size == 0)
+ ci->file_size = block->allocsize;
+ else {
+ /* Crack the checkpoint cookie. */
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ /* Verify sets up next. */
+ if (block->verify)
+ WT_ERR(__wt_verify_ckpt_load(session, block, ci));
+
+ /* Read any root page. */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
+ endp = root_addr;
+ WT_ERR(__wt_block_addr_to_buffer(
+ block, &endp, ci->root_offset, ci->root_size, ci->root_checksum));
+ *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
+ }
+
+ /*
+ * Rolling a checkpoint forward requires the avail list, the blocks from which we can
+ * allocate.
+ */
+ if (!checkpoint)
+ WT_ERR(__wt_block_extlist_read_avail(session, block, &ci->avail, ci->file_size));
+ }
+
+ /*
+ * If the checkpoint can be written, that means anything written after the checkpoint is no
+ * longer interesting, truncate the file. Don't bother checking the avail list for a block at
+ * the end of the file, that was done when the checkpoint was first written (re-writing the
+ * checkpoint might possibly make it relevant here, but it's unlikely enough I don't bother).
+ */
+ if (!checkpoint)
+ WT_ERR(__wt_block_truncate(session, block, ci->file_size));
+
+ if (0) {
err:
- /*
- * Don't call checkpoint-unload: unload does real work including
- * file truncation. If we fail early enough that the checkpoint
- * information isn't correct, bad things would happen. The only
- * allocated memory was in the service of verify, clean that up.
- */
- if (block->verify)
- WT_TRET(__wt_verify_ckpt_unload(session, block));
- }
-
- /* Checkpoints don't need the original information, discard it. */
- if (checkpoint)
- __wt_block_ckpt_destroy(session, ci);
-
- return (ret);
+ /*
+ * Don't call checkpoint-unload: unload does real work including file truncation. If we fail
+ * early enough that the checkpoint information isn't correct, bad things would happen. The
+ * only allocated memory was in the service of verify, clean that up.
+ */
+ if (block->verify)
+ WT_TRET(__wt_verify_ckpt_unload(session, block));
+ }
+
+ /* Checkpoints don't need the original information, discard it. */
+ if (checkpoint)
+ __wt_block_ckpt_destroy(session, ci);
+
+ return (ret);
}
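The load path above leans on WiredTiger's goto-based cleanup idiom: WT_ERR-style macros jump to the err label on failure, and the "if (0) { err: ... }" block runs error-only cleanup while letting the success path fall through to the shared teardown. Below is a minimal standalone sketch of that idiom; EXAMPLE_ERR is a simplified stand-in relying on a local ret variable, not WiredTiger's actual WT_ERR definition.

#include <stdio.h>

/* Simplified stand-in for WT_ERR: capture the error and jump to cleanup. */
#define EXAMPLE_ERR(a)        \
    do {                      \
        if ((ret = (a)) != 0) \
            goto err;         \
    } while (0)

static int
example_step(int fail)
{
    return (fail ? -1 : 0);
}

static int
example_load(int fail_early)
{
    int ret = 0;

    EXAMPLE_ERR(example_step(fail_early)); /* Jumps to err on failure. */
    EXAMPLE_ERR(example_step(0));

    if (0) {
err:
        /* Error-only cleanup: the success path never enters this block. */
        printf("cleanup after failure: %d\n", ret);
    }

    /* Shared teardown runs on both success and failure. */
    return (ret);
}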
/*
* __wt_block_checkpoint_unload --
- * Unload a checkpoint.
+ * Unload a checkpoint.
*/
int
-__wt_block_checkpoint_unload(
- WT_SESSION_IMPL *session, WT_BLOCK *block, bool checkpoint)
+__wt_block_checkpoint_unload(WT_SESSION_IMPL *session, WT_BLOCK *block, bool checkpoint)
{
- WT_DECL_RET;
-
- /* Verify cleanup. */
- if (block->verify)
- WT_TRET(__wt_verify_ckpt_unload(session, block));
-
- /*
- * If it's the live system, truncate to discard any extended blocks and
- * discard the active extent lists. Hold the lock even though we're
- * unloading the live checkpoint, there could be readers active in other
- * checkpoints.
- */
- if (!checkpoint) {
- WT_TRET(__wt_block_truncate(session, block, block->size));
-
- __wt_spin_lock(session, &block->live_lock);
- __wt_block_ckpt_destroy(session, &block->live);
+ WT_DECL_RET;
+
+ /* Verify cleanup. */
+ if (block->verify)
+ WT_TRET(__wt_verify_ckpt_unload(session, block));
+
+ /*
+ * If it's the live system, truncate to discard any extended blocks and discard the active
+ * extent lists. Hold the lock even though we're unloading the live checkpoint, there could be
+ * readers active in other checkpoints.
+ */
+ if (!checkpoint) {
+ WT_TRET(__wt_block_truncate(session, block, block->size));
+
+ __wt_spin_lock(session, &block->live_lock);
+ __wt_block_ckpt_destroy(session, &block->live);
#ifdef HAVE_DIAGNOSTIC
- block->live_open = false;
+ block->live_open = false;
#endif
- __wt_spin_unlock(session, &block->live_lock);
- }
+ __wt_spin_unlock(session, &block->live_lock);
+ }
- return (ret);
+ return (ret);
}
/*
* __wt_block_ckpt_destroy --
- * Clear a checkpoint structure.
+ * Clear a checkpoint structure.
*/
void
__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
{
- /* Discard the extent lists. */
- __wt_block_extlist_free(session, &ci->alloc);
- __wt_block_extlist_free(session, &ci->avail);
- __wt_block_extlist_free(session, &ci->discard);
- __wt_block_extlist_free(session, &ci->ckpt_alloc);
- __wt_block_extlist_free(session, &ci->ckpt_avail);
- __wt_block_extlist_free(session, &ci->ckpt_discard);
+ /* Discard the extent lists. */
+ __wt_block_extlist_free(session, &ci->alloc);
+ __wt_block_extlist_free(session, &ci->avail);
+ __wt_block_extlist_free(session, &ci->discard);
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
}
/*
* __wt_block_checkpoint_start --
- * Start a checkpoint.
+ * Start a checkpoint.
*/
int
__wt_block_checkpoint_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
- WT_DECL_RET;
-
- __wt_spin_lock(session, &block->live_lock);
- switch (block->ckpt_state) {
- case WT_CKPT_INPROGRESS:
- case WT_CKPT_PANIC_ON_FAILURE:
- case WT_CKPT_SALVAGE:
- __wt_err(session, EINVAL,
- "%s: an unexpected checkpoint start: the checkpoint "
- "has already started or was configured for salvage",
- block->name);
- ret = __wt_block_panic(session);
- break;
- case WT_CKPT_NONE:
- block->ckpt_state = WT_CKPT_INPROGRESS;
- break;
- }
- __wt_spin_unlock(session, &block->live_lock);
- return (ret);
+ WT_DECL_RET;
+
+ __wt_spin_lock(session, &block->live_lock);
+ switch (block->ckpt_state) {
+ case WT_CKPT_INPROGRESS:
+ case WT_CKPT_PANIC_ON_FAILURE:
+ case WT_CKPT_SALVAGE:
+ __wt_err(session, EINVAL,
+ "%s: an unexpected checkpoint start: the checkpoint "
+ "has already started or was configured for salvage",
+ block->name);
+ ret = __wt_block_panic(session);
+ break;
+ case WT_CKPT_NONE:
+ block->ckpt_state = WT_CKPT_INPROGRESS;
+ break;
+ }
+ __wt_spin_unlock(session, &block->live_lock);
+ return (ret);
}
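The start/process/resolve functions gate re-entry with a small state machine kept in block->ckpt_state and protected by block->live_lock. The sketch below models those transitions with stand-in enum values and return codes; the salvage state, panic path and locking are omitted, and these are assumptions for illustration, not the real WT_CKPT_* constants.

/* Stand-in states; the real constants are the WT_CKPT_* values used above. */
typedef enum {
    EX_CKPT_NONE,            /* No checkpoint running. */
    EX_CKPT_INPROGRESS,      /* Set by checkpoint start. */
    EX_CKPT_PANIC_ON_FAILURE /* Set once processing begins: later failures are fatal. */
} ex_ckpt_state;

static int
ex_ckpt_start(ex_ckpt_state *state)
{
    if (*state != EX_CKPT_NONE)
        return (-1); /* Already started: caller error, mirrors the EINVAL case. */
    *state = EX_CKPT_INPROGRESS;
    return (0);
}

static int
ex_ckpt_process(ex_ckpt_state *state)
{
    if (*state != EX_CKPT_INPROGRESS)
        return (-1); /* Never started or already completed. */
    *state = EX_CKPT_PANIC_ON_FAILURE;
    return (0);
}

static int
ex_ckpt_resolve(ex_ckpt_state *state)
{
    if (*state == EX_CKPT_NONE)
        return (-1); /* Never started: unexpected resolution. */
    *state = EX_CKPT_NONE;
    return (0);
}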
/*
* __wt_block_checkpoint --
- * Create a new checkpoint.
+ * Create a new checkpoint.
*/
int
-__wt_block_checkpoint(WT_SESSION_IMPL *session,
- WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum)
+__wt_block_checkpoint(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum)
{
- WT_BLOCK_CKPT *ci;
- WT_DECL_RET;
-
- ci = &block->live;
-
- /* Switch to first-fit allocation. */
- __wt_block_configure_first_fit(block, true);
-
- /*
- * Write the root page: it's possible for there to be a checkpoint of
- * an empty tree, in which case, we store an illegal root offset.
- *
- * !!!
- * We happen to know that checkpoints are single-threaded above us in
- * the btree engine. That's probably something we want to guarantee
- * for any WiredTiger block manager.
- */
- if (buf == NULL) {
- ci->root_offset = WT_BLOCK_INVALID_OFFSET;
- ci->root_size = ci->root_checksum = 0;
- } else
- WT_ERR(__wt_block_write_off(session, block, buf,
- &ci->root_offset, &ci->root_size, &ci->root_checksum,
- data_checksum, true, false));
-
- /*
- * Checkpoints are potentially reading/writing/merging lots of blocks,
- * pre-allocate structures for this thread's use.
- */
- WT_ERR(__wt_block_ext_prealloc(session, 250));
-
- /* Process the checkpoint list, deleting and updating as required. */
- ret = __ckpt_process(session, block, ckptbase);
-
- /* Discard any excessive memory we've allocated. */
- WT_TRET(__wt_block_ext_discard(session, 250));
-
- /* Restore the original allocation plan. */
-err: __wt_block_configure_first_fit(block, false);
-
- return (ret);
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /* Switch to first-fit allocation. */
+ __wt_block_configure_first_fit(block, true);
+
+ /*
+ * Write the root page: it's possible for there to be a checkpoint of
+ * an empty tree, in which case, we store an illegal root offset.
+ *
+ * !!!
+ * We happen to know that checkpoints are single-threaded above us in
+ * the btree engine. That's probably something we want to guarantee
+ * for any WiredTiger block manager.
+ */
+ if (buf == NULL) {
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+ ci->root_size = ci->root_checksum = 0;
+ } else
+ WT_ERR(__wt_block_write_off(session, block, buf, &ci->root_offset, &ci->root_size,
+ &ci->root_checksum, data_checksum, true, false));
+
+ /*
+ * Checkpoints are potentially reading/writing/merging lots of blocks, pre-allocate structures
+ * for this thread's use.
+ */
+ WT_ERR(__wt_block_ext_prealloc(session, 250));
+
+ /* Process the checkpoint list, deleting and updating as required. */
+ ret = __ckpt_process(session, block, ckptbase);
+
+ /* Discard any excessive memory we've allocated. */
+ WT_TRET(__wt_block_ext_discard(session, 250));
+
+/* Restore the original allocation plan. */
+err:
+ __wt_block_configure_first_fit(block, false);
+
+ return (ret);
}
/*
* __ckpt_extlist_read --
- * Read a checkpoints extent lists and copy
+ *     Read a checkpoint's extent lists and copy them.
*/
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
- WT_BLOCK_CKPT *ci;
-
- /*
- * Allocate a checkpoint structure, crack the cookie and read the
- * checkpoint's extent lists.
- *
- * Ignore the avail list: checkpoint avail lists are only useful if we
- * are rolling forward from the particular checkpoint and they represent
- * our best understanding of what blocks can be allocated. If we are
- * not operating on the live checkpoint, subsequent checkpoints might
- * have allocated those blocks, and the avail list is useless. We don't
- * discard it, because it is useful as part of verification, but we
- * don't re-write it either.
- */
- WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
-
- ci = ckpt->bpriv;
- WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
- WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
- WT_RET(__wt_block_extlist_read(
- session, block, &ci->alloc, ci->file_size));
- WT_RET(__wt_block_extlist_read(
- session, block, &ci->discard, ci->file_size));
-
- return (0);
+ WT_BLOCK_CKPT *ci;
+
+ /*
+ * Allocate a checkpoint structure, crack the cookie and read the
+ * checkpoint's extent lists.
+ *
+ * Ignore the avail list: checkpoint avail lists are only useful if we
+ * are rolling forward from the particular checkpoint and they represent
+ * our best understanding of what blocks can be allocated. If we are
+ * not operating on the live checkpoint, subsequent checkpoints might
+ * have allocated those blocks, and the avail list is useless. We don't
+ * discard it, because it is useful as part of verification, but we
+ * don't re-write it either.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
+
+ ci = ckpt->bpriv;
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+ WT_RET(__wt_block_extlist_read(session, block, &ci->alloc, ci->file_size));
+ WT_RET(__wt_block_extlist_read(session, block, &ci->discard, ci->file_size));
+
+ return (0);
}
/*
* __ckpt_extlist_fblocks --
- * If a checkpoint's extent list is going away, free its blocks.
+ * If a checkpoint's extent list is going away, free its blocks.
*/
static int
-__ckpt_extlist_fblocks(
- WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+__ckpt_extlist_fblocks(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
{
- if (el->offset == WT_BLOCK_INVALID_OFFSET)
- return (0);
-
- /*
- * Free blocks used to write checkpoint extents into the live system's
- * checkpoint avail list (they were never on any alloc list). Do not
- * use the live system's avail list because that list is used to decide
- * if the file can be truncated, and we can't truncate any part of the
- * file that contains a previous checkpoint's extents.
- */
- return (__wt_block_insert_ext(
- session, block, &block->live.ckpt_avail, el->offset, el->size));
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+ /*
+ * Free blocks used to write checkpoint extents into the live system's checkpoint avail list
+ * (they were never on any alloc list). Do not use the live system's avail list because that
+ * list is used to decide if the file can be truncated, and we can't truncate any part of the
+ * file that contains a previous checkpoint's extents.
+ */
+ return (__wt_block_insert_ext(session, block, &block->live.ckpt_avail, el->offset, el->size));
}
#ifdef HAVE_DIAGNOSTIC
/*
* __ckpt_verify --
- * Diagnostic code, confirm we get what we expect in the checkpoint array.
+ * Diagnostic code, confirm we get what we expect in the checkpoint array.
*/
static int
__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
- WT_CKPT *ckpt;
-
- /*
- * Fast check that we're seeing what we expect to see: some number of
- * checkpoints to add, delete or ignore, terminated by a new checkpoint.
- */
- WT_CKPT_FOREACH(ckptbase, ckpt)
- switch (ckpt->flags) {
- case 0:
- case WT_CKPT_DELETE:
- case WT_CKPT_DELETE | WT_CKPT_FAKE:
- case WT_CKPT_FAKE:
- break;
- case WT_CKPT_ADD:
- if (ckpt[1].name == NULL)
- break;
- /* FALLTHROUGH */
- default:
- return (__wt_illegal_value(session, ckpt->flags));
- }
- return (0);
+ WT_CKPT *ckpt;
+
+ /*
+ * Fast check that we're seeing what we expect to see: some number of checkpoints to add, delete
+ * or ignore, terminated by a new checkpoint.
+ */
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ switch (ckpt->flags) {
+ case 0:
+ case WT_CKPT_DELETE:
+ case WT_CKPT_DELETE | WT_CKPT_FAKE:
+ case WT_CKPT_FAKE:
+ break;
+ case WT_CKPT_ADD:
+ if (ckpt[1].name == NULL)
+ break;
+ /* FALLTHROUGH */
+ default:
+ return (__wt_illegal_value(session, ckpt->flags));
+ }
+ return (0);
}
#endif
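The diagnostic check above walks a checkpoint array terminated by an entry with a NULL name and validates each entry's flag combination, with the "add" entry required to be last. The standalone sketch below mirrors that shape; the struct, flag values and error code are simplified stand-ins, not WiredTiger's definitions.

#include <stddef.h>

#define EX_CKPT_ADD 0x1u
#define EX_CKPT_DELETE 0x2u
#define EX_CKPT_FAKE 0x4u

struct ex_ckpt {
    const char *name; /* A NULL name terminates the array. */
    unsigned flags;
};

static int
ex_ckpt_verify(const struct ex_ckpt *base)
{
    const struct ex_ckpt *ckpt;

    for (ckpt = base; ckpt->name != NULL; ++ckpt)
        switch (ckpt->flags) {
        case 0:
        case EX_CKPT_DELETE:
        case EX_CKPT_DELETE | EX_CKPT_FAKE:
        case EX_CKPT_FAKE:
            break;
        case EX_CKPT_ADD:
            if (ckpt[1].name == NULL)
                break; /* The added checkpoint must be the last entry. */
            /* FALLTHROUGH */
        default:
            return (-1); /* Unexpected flag combination. */
        }
    return (0);
}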
/*
* __ckpt_process --
- * Process the list of checkpoints.
+ * Process the list of checkpoints.
*/
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
- WT_BLOCK_CKPT *a, *b, *ci;
- WT_CKPT *ckpt, *next_ckpt;
- WT_DECL_RET;
- uint64_t ckpt_size;
- bool deleting, fatal, locked;
+ WT_BLOCK_CKPT *a, *b, *ci;
+ WT_CKPT *ckpt, *next_ckpt;
+ WT_DECL_RET;
+ uint64_t ckpt_size;
+ bool deleting, fatal, locked;
- ci = &block->live;
- fatal = locked = false;
+ ci = &block->live;
+ fatal = locked = false;
#ifdef HAVE_DIAGNOSTIC
- WT_RET(__ckpt_verify(session, ckptbase));
+ WT_RET(__ckpt_verify(session, ckptbase));
#endif
- /*
- * Checkpoints are a two-step process: first, write a new checkpoint to
- * disk (including all the new extent lists for modified checkpoints
- * and the live system). As part of this, create a list of file blocks
- * newly available for reallocation, based on checkpoints being deleted.
- * We then return the locations of the new checkpoint information to our
- * caller. Our caller has to write that information into some kind of
- * stable storage, and once that's done, we can actually allocate from
- * that list of newly available file blocks. (We can't allocate from
- * that list immediately because the allocation might happen before our
- * caller saves the new checkpoint information, and if we crashed before
- * the new checkpoint location was saved, we'd have overwritten blocks
- * still referenced by checkpoints in the system.) In summary, there is
- * a second step: after our caller saves the checkpoint information, we
- * are called to add the newly available blocks into the live system's
- * available list.
- *
- * This function is the first step, the second step is in the resolve
- * function.
- *
- * If we're called to checkpoint the same file twice (without the second
- * resolution step), or re-entered for any reason, it's an error in our
- * caller, and our choices are all bad: leak blocks or potentially crash
- * with our caller not yet having saved previous checkpoint information
- * to stable storage.
- */
- __wt_spin_lock(session, &block->live_lock);
- switch (block->ckpt_state) {
- case WT_CKPT_INPROGRESS:
- block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE;
- break;
- case WT_CKPT_NONE:
- case WT_CKPT_PANIC_ON_FAILURE:
- __wt_err(session, EINVAL,
- "%s: an unexpected checkpoint attempt: the checkpoint "
- "was never started or has already completed",
- block->name);
- ret = __wt_block_panic(session);
- break;
- case WT_CKPT_SALVAGE:
- /* Salvage doesn't use the standard checkpoint APIs. */
- break;
- }
- __wt_spin_unlock(session, &block->live_lock);
- WT_RET(ret);
-
- /*
- * Extents newly available as a result of deleting previous checkpoints
- * are added to a list of extents. The list should be empty, but as
- * described above, there is no "free the checkpoint information" call
- * into the block manager; if there was an error in an upper level that
- * resulted in some previous checkpoint never being resolved, the list
- * may not be empty. We should have caught that with the "checkpoint
- * in progress" test, but it doesn't cost us anything to be cautious.
- *
- * We free the checkpoint's allocation and discard extent lists as part
- * of the resolution step, not because they're needed at that time, but
- * because it's potentially a lot of work, and waiting allows the btree
- * layer to continue eviction sooner. As for the checkpoint-available
- * list, make sure they get cleaned out.
- */
- __wt_block_extlist_free(session, &ci->ckpt_avail);
- WT_RET(__wt_block_extlist_init(
- session, &ci->ckpt_avail, "live", "ckpt_avail", true));
- __wt_block_extlist_free(session, &ci->ckpt_alloc);
- __wt_block_extlist_free(session, &ci->ckpt_discard);
-
- /*
- * To delete a checkpoint, we'll need checkpoint information for it and
- * the subsequent checkpoint into which it gets rolled; read them from
- * disk before we lock things down.
- */
- deleting = false;
- WT_CKPT_FOREACH(ckptbase, ckpt) {
- if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
- !F_ISSET(ckpt, WT_CKPT_DELETE))
- continue;
- deleting = true;
-
- /*
- * Read the checkpoint and next checkpoint extent lists if we
- * haven't already read them (we may have already read these
- * extent blocks if there is more than one deleted checkpoint).
- */
- if (ckpt->bpriv == NULL)
- WT_ERR(__ckpt_extlist_read(session, block, ckpt));
-
- for (next_ckpt = ckpt + 1;; ++next_ckpt)
- if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
- break;
-
- /*
- * The "next" checkpoint may be the live tree which has no
- * extent blocks to read.
- */
- if (next_ckpt->bpriv == NULL &&
- !F_ISSET(next_ckpt, WT_CKPT_ADD))
- WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
- }
-
- /*
- * Failures are now fatal: we can't currently back out the merge of any
- * deleted checkpoint extent lists into the live system's extent lists,
- * so continuing after error would leave the live system's extent lists
- * corrupted for any subsequent checkpoint (and potentially, should a
- * subsequent checkpoint succeed, for recovery).
- */
- fatal = true;
-
- /*
- * Hold a lock so the live extent lists and the file size can't change
- * underneath us. I suspect we'll tighten this if checkpoints take too
- * much time away from real work: we read the historic checkpoint
- * information without a lock, but we could also merge and re-write the
- * deleted and merged checkpoint information without a lock, except for
- * the final merge of ranges into the live tree.
- */
- __wt_spin_lock(session, &block->live_lock);
- locked = true;
-
- /*
- * We've allocated our last page, update the checkpoint size. We need
- * to calculate the live system's checkpoint size before merging
- * checkpoint allocation and discard information from the checkpoints
- * we're deleting, those operations change the underlying byte counts.
- */
- ckpt_size = ci->ckpt_size;
- ckpt_size += ci->alloc.bytes;
- ckpt_size -= ci->discard.bytes;
-
- /* Skip the additional processing if we aren't deleting checkpoints. */
- if (!deleting)
- goto live_update;
-
- /*
- * Delete any no-longer-needed checkpoints: we do this first as it frees
- * blocks to the live lists, and the freed blocks will then be included
- * when writing the live extent lists.
- */
- WT_CKPT_FOREACH(ckptbase, ckpt) {
- if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
- !F_ISSET(ckpt, WT_CKPT_DELETE))
- continue;
-
- if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
- __wt_ckpt_verbose(session,
- block, "delete", ckpt->name, ckpt->raw.data);
-
- /*
- * Find the checkpoint into which we'll roll this checkpoint's
- * blocks: it's the next real checkpoint in the list, and it
- * better have been read in (if it's not the add slot).
- */
- for (next_ckpt = ckpt + 1;; ++next_ckpt)
- if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
- break;
-
- /*
- * Set the from/to checkpoint structures, where the "to" value
- * may be the live tree.
- */
- a = ckpt->bpriv;
- if (F_ISSET(next_ckpt, WT_CKPT_ADD))
- b = &block->live;
- else
- b = next_ckpt->bpriv;
-
- /*
- * Free the root page: there's nothing special about this free,
- * the root page is allocated using normal rules, that is, it
- * may have been taken from the avail list, and was entered on
- * the live system's alloc list at that time. We free it into
- * the checkpoint's discard list, however, not the live system's
- * list because it appears on the checkpoint's alloc list and so
- * must be paired in the checkpoint.
- */
- if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
- WT_ERR(__wt_block_insert_ext(session, block,
- &a->discard, a->root_offset, a->root_size));
-
- /*
- * Free the blocks used to hold the "from" checkpoint's extent
- * lists, including the avail list.
- */
- WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
- WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
- WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
-
- /*
- * Roll the "from" alloc and discard extent lists into the "to"
- * checkpoint's lists.
- */
- if (a->alloc.entries != 0)
- WT_ERR(__wt_block_extlist_merge(
- session, block, &a->alloc, &b->alloc));
- if (a->discard.entries != 0)
- WT_ERR(__wt_block_extlist_merge(
- session, block, &a->discard, &b->discard));
-
- /*
- * If the "to" checkpoint is also being deleted, we're done with
- * it, it's merged into some other checkpoint in the next loop.
- * This means the extent lists may aggregate over a number of
- * checkpoints, but that's OK, they're disjoint sets of ranges.
- */
- if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
- continue;
-
- /*
- * Find blocks for re-use: wherever the "to" checkpoint's
- * allocate and discard lists overlap, move the range to
- * the live system's checkpoint available list.
- */
- WT_ERR(__wt_block_extlist_overlap(session, block, b));
-
- /*
- * If we're updating the live system's information, we're done.
- */
- if (F_ISSET(next_ckpt, WT_CKPT_ADD))
- continue;
-
- /*
- * We have to write the "to" checkpoint's extent lists out in
- * new blocks, and update its cookie.
- *
- * Free the blocks used to hold the "to" checkpoint's extent
- * lists; don't include the avail list, it's not changing.
- */
- WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
- WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
-
- F_SET(next_ckpt, WT_CKPT_UPDATE);
- }
-
- /* Update checkpoints marked for update. */
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (F_ISSET(ckpt, WT_CKPT_UPDATE))
- WT_ERR(__ckpt_update(session,
- block, ckptbase, ckpt, ckpt->bpriv, false));
+ /*
+ * Checkpoints are a two-step process: first, write a new checkpoint to
+ * disk (including all the new extent lists for modified checkpoints
+ * and the live system). As part of this, create a list of file blocks
+ * newly available for reallocation, based on checkpoints being deleted.
+ * We then return the locations of the new checkpoint information to our
+ * caller. Our caller has to write that information into some kind of
+ * stable storage, and once that's done, we can actually allocate from
+ * that list of newly available file blocks. (We can't allocate from
+ * that list immediately because the allocation might happen before our
+ * caller saves the new checkpoint information, and if we crashed before
+ * the new checkpoint location was saved, we'd have overwritten blocks
+ * still referenced by checkpoints in the system.) In summary, there is
+ * a second step: after our caller saves the checkpoint information, we
+ * are called to add the newly available blocks into the live system's
+ * available list.
+ *
+ * This function is the first step, the second step is in the resolve
+ * function.
+ *
+ * If we're called to checkpoint the same file twice (without the second
+ * resolution step), or re-entered for any reason, it's an error in our
+ * caller, and our choices are all bad: leak blocks or potentially crash
+ * with our caller not yet having saved previous checkpoint information
+ * to stable storage.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ switch (block->ckpt_state) {
+ case WT_CKPT_INPROGRESS:
+ block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE;
+ break;
+ case WT_CKPT_NONE:
+ case WT_CKPT_PANIC_ON_FAILURE:
+ __wt_err(session, EINVAL,
+ "%s: an unexpected checkpoint attempt: the checkpoint "
+ "was never started or has already completed",
+ block->name);
+ ret = __wt_block_panic(session);
+ break;
+ case WT_CKPT_SALVAGE:
+ /* Salvage doesn't use the standard checkpoint APIs. */
+ break;
+ }
+ __wt_spin_unlock(session, &block->live_lock);
+ WT_RET(ret);
+
+ /*
+ * Extents newly available as a result of deleting previous checkpoints
+ * are added to a list of extents. The list should be empty, but as
+ * described above, there is no "free the checkpoint information" call
+ * into the block manager; if there was an error in an upper level that
+ * resulted in some previous checkpoint never being resolved, the list
+ * may not be empty. We should have caught that with the "checkpoint
+ * in progress" test, but it doesn't cost us anything to be cautious.
+ *
+ * We free the checkpoint's allocation and discard extent lists as part
+ * of the resolution step, not because they're needed at that time, but
+ * because it's potentially a lot of work, and waiting allows the btree
+ * layer to continue eviction sooner. As for the checkpoint-available
+ * list, make sure they get cleaned out.
+ */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ WT_RET(__wt_block_extlist_init(session, &ci->ckpt_avail, "live", "ckpt_avail", true));
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+
+ /*
+ * To delete a checkpoint, we'll need checkpoint information for it and the subsequent
+ * checkpoint into which it gets rolled; read them from disk before we lock things down.
+ */
+ deleting = false;
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ deleting = true;
+
+ /*
+ * Read the checkpoint and next checkpoint extent lists if we haven't already read them (we
+ * may have already read these extent blocks if there is more than one deleted checkpoint).
+ */
+ if (ckpt->bpriv == NULL)
+ WT_ERR(__ckpt_extlist_read(session, block, ckpt));
+
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
+
+ /*
+ * The "next" checkpoint may be the live tree which has no extent blocks to read.
+ */
+ if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD))
+ WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
+ }
+
+ /*
+ * Failures are now fatal: we can't currently back out the merge of any deleted checkpoint
+ * extent lists into the live system's extent lists, so continuing after error would leave the
+ * live system's extent lists corrupted for any subsequent checkpoint (and potentially, should a
+ * subsequent checkpoint succeed, for recovery).
+ */
+ fatal = true;
+
+ /*
+ * Hold a lock so the live extent lists and the file size can't change underneath us. I suspect
+ * we'll tighten this if checkpoints take too much time away from real work: we read the
+ * historic checkpoint information without a lock, but we could also merge and re-write the
+ * deleted and merged checkpoint information without a lock, except for the final merge of
+ * ranges into the live tree.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ locked = true;
+
+ /*
+ * We've allocated our last page, update the checkpoint size. We need to calculate the live
+ * system's checkpoint size before merging checkpoint allocation and discard information from
+ * the checkpoints we're deleting, those operations change the underlying byte counts.
+ */
+ ckpt_size = ci->ckpt_size;
+ ckpt_size += ci->alloc.bytes;
+ ckpt_size -= ci->discard.bytes;
+
+ /* Skip the additional processing if we aren't deleting checkpoints. */
+ if (!deleting)
+ goto live_update;
+
+ /*
+ * Delete any no-longer-needed checkpoints: we do this first as it frees blocks to the live
+ * lists, and the freed blocks will then be included when writing the live extent lists.
+ */
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ __wt_ckpt_verbose(session, block, "delete", ckpt->name, ckpt->raw.data);
+
+ /*
+ * Find the checkpoint into which we'll roll this checkpoint's blocks: it's the next real
+ * checkpoint in the list, and it better have been read in (if it's not the add slot).
+ */
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
+
+ /*
+ * Set the from/to checkpoint structures, where the "to" value may be the live tree.
+ */
+ a = ckpt->bpriv;
+ if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+ b = &block->live;
+ else
+ b = next_ckpt->bpriv;
+
+ /*
+ * Free the root page: there's nothing special about this free, the root page is allocated
+ * using normal rules, that is, it may have been taken from the avail list, and was entered
+ * on the live system's alloc list at that time. We free it into the checkpoint's discard
+ * list, however, not the live system's list because it appears on the checkpoint's alloc
+ * list and so must be paired in the checkpoint.
+ */
+ if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_ERR(
+ __wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size));
+
+ /*
+ * Free the blocks used to hold the "from" checkpoint's extent lists, including the avail
+ * list.
+ */
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
+
+ /*
+ * Roll the "from" alloc and discard extent lists into the "to" checkpoint's lists.
+ */
+ if (a->alloc.entries != 0)
+ WT_ERR(__wt_block_extlist_merge(session, block, &a->alloc, &b->alloc));
+ if (a->discard.entries != 0)
+ WT_ERR(__wt_block_extlist_merge(session, block, &a->discard, &b->discard));
+
+ /*
+ * If the "to" checkpoint is also being deleted, we're done with it, it's merged into some
+ * other checkpoint in the next loop. This means the extent lists may aggregate over a
+ * number of checkpoints, but that's OK, they're disjoint sets of ranges.
+ */
+ if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
+ continue;
+
+ /*
+ * Find blocks for re-use: wherever the "to" checkpoint's allocate and discard lists
+ * overlap, move the range to the live system's checkpoint available list.
+ */
+ WT_ERR(__wt_block_extlist_overlap(session, block, b));
+
+ /*
+ * If we're updating the live system's information, we're done.
+ */
+ if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+ continue;
+
+ /*
+ * We have to write the "to" checkpoint's extent lists out in
+ * new blocks, and update its cookie.
+ *
+ * Free the blocks used to hold the "to" checkpoint's extent
+ * lists; don't include the avail list, it's not changing.
+ */
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
+
+ F_SET(next_ckpt, WT_CKPT_UPDATE);
+ }
+
+ /* Update checkpoints marked for update. */
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_UPDATE))
+ WT_ERR(__ckpt_update(session, block, ckptbase, ckpt, ckpt->bpriv, false));
live_update:
- /* Truncate the file if that's possible. */
- WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));
-
- /* Update the final, added checkpoint based on the live system. */
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (F_ISSET(ckpt, WT_CKPT_ADD)) {
- /*
- * !!!
- * Our caller wants the final checkpoint size. Setting
- * the size here violates layering, but the alternative
- * is a call for the btree layer to crack the checkpoint
- * cookie into its components, and that's a fair amount
- * of work.
- */
- ckpt->size = ckpt_size;
-
- /*
- * Set the rolling checkpoint size for the live system.
- * The current size includes the current checkpoint's
- * root page size (root pages are on the checkpoint's
- * block allocation list as root pages are allocated
- * with the usual block allocation functions). That's
- * correct, but we don't want to include it in the size
- * for the next checkpoint.
- */
- ckpt_size -= ci->root_size;
-
- /*
- * Additionally, we had a bug for awhile where the live
- * checkpoint size grew without bound. We can't sanity
- * check the value, that would require walking the tree
- * as part of the checkpoint. Bound any bug at the size
- * of the file.
- * It isn't practical to assert that the value is within
- * bounds since databases created with older versions
- * of WiredTiger (2.8.0) would likely see an error.
- */
- ci->ckpt_size =
- WT_MIN(ckpt_size, (uint64_t)block->size);
-
- WT_ERR(__ckpt_update(
- session, block, ckptbase, ckpt, ci, true));
- }
-
- /*
- * Reset the live system's alloc and discard extent lists, leave the
- * avail list alone. This includes freeing a lot of extents, so do it
- * outside of the system's lock by copying and resetting the original,
- * then doing the work later.
- */
- ci->ckpt_alloc = ci->alloc;
- WT_ERR(__wt_block_extlist_init(
- session, &ci->alloc, "live", "alloc", false));
- ci->ckpt_discard = ci->discard;
- WT_ERR(__wt_block_extlist_init(
- session, &ci->discard, "live", "discard", false));
+ /* Truncate the file if that's possible. */
+ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));
+
+ /* Update the final, added checkpoint based on the live system. */
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD)) {
+ /*
+ * !!!
+ * Our caller wants the final checkpoint size. Setting
+ * the size here violates layering, but the alternative
+ * is a call for the btree layer to crack the checkpoint
+ * cookie into its components, and that's a fair amount
+ * of work.
+ */
+ ckpt->size = ckpt_size;
+
+ /*
+ * Set the rolling checkpoint size for the live system. The current size includes the
+ * current checkpoint's root page size (root pages are on the checkpoint's block
+ * allocation list as root pages are allocated with the usual block allocation
+ * functions). That's correct, but we don't want to include it in the size for the next
+ * checkpoint.
+ */
+ ckpt_size -= ci->root_size;
+
+ /*
+ * Additionally, we had a bug for a while where the live checkpoint size grew without
+ * bound. We can't sanity check the value, that would require walking the tree as part
+ * of the checkpoint. Bound any bug at the size of the file. It isn't practical to
+ * assert that the value is within bounds since databases created with older versions of
+ * WiredTiger (2.8.0) would likely see an error.
+ */
+ ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size);
+
+ WT_ERR(__ckpt_update(session, block, ckptbase, ckpt, ci, true));
+ }
+
+ /*
+ * Reset the live system's alloc and discard extent lists, leave the avail list alone. This
+ * includes freeing a lot of extents, so do it outside of the system's lock by copying and
+ * resetting the original, then doing the work later.
+ */
+ ci->ckpt_alloc = ci->alloc;
+ WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc", false));
+ ci->ckpt_discard = ci->discard;
+ WT_ERR(__wt_block_extlist_init(session, &ci->discard, "live", "discard", false));
#ifdef HAVE_DIAGNOSTIC
- /*
- * The first checkpoint in the system should always have an empty
- * discard list. If we've read that checkpoint and/or created it,
- * check.
- */
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (!F_ISSET(ckpt, WT_CKPT_DELETE))
- break;
- if ((a = ckpt->bpriv) == NULL)
- a = &block->live;
- if (a->discard.entries != 0)
- WT_ERR_MSG(session, WT_ERROR,
- "first checkpoint incorrectly has blocks on the discard "
- "list");
+ /*
+ * The first checkpoint in the system should always have an empty discard list. If we've read
+ * that checkpoint and/or created it, check.
+ */
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ break;
+ if ((a = ckpt->bpriv) == NULL)
+ a = &block->live;
+ if (a->discard.entries != 0)
+ WT_ERR_MSG(session, WT_ERROR,
+ "first checkpoint incorrectly has blocks on the discard "
+ "list");
#endif
-err: if (ret != 0 && fatal) {
- __wt_err(session, ret,
- "%s: fatal checkpoint failure", block->name);
- ret = __wt_block_panic(session);
- }
+err:
+ if (ret != 0 && fatal) {
+ __wt_err(session, ret, "%s: fatal checkpoint failure", block->name);
+ ret = __wt_block_panic(session);
+ }
- if (locked)
- __wt_spin_unlock(session, &block->live_lock);
+ if (locked)
+ __wt_spin_unlock(session, &block->live_lock);
- /* Discard any checkpoint information we loaded. */
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if ((ci = ckpt->bpriv) != NULL)
- __wt_block_ckpt_destroy(session, ci);
+ /* Discard any checkpoint information we loaded. */
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if ((ci = ckpt->bpriv) != NULL)
+ __wt_block_ckpt_destroy(session, ci);
- return (ret);
+ return (ret);
}
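The long comment at the top of __ckpt_process describes the two-step protocol: first write the checkpoint and collect the blocks freed by deleted checkpoints, then, only after the caller has made the new checkpoint's location durable, resolve so those blocks become allocatable. A caller-side sketch of that ordering follows; the function names are hypothetical stubs, not the real btree- or metadata-layer callers.

/* Hypothetical stubs standing in for the block manager and metadata layer. */
static int ex_block_checkpoint(void) { return (0); }
static int ex_save_ckpt_metadata(void) { return (0); }
static int ex_block_checkpoint_resolve(int failed) { return (failed ? -1 : 0); }

static int
ex_checkpoint_driver(void)
{
    int ret;

    /* Step 1: write the checkpoint, its extent lists and root page. */
    if ((ret = ex_block_checkpoint()) != 0)
        return (ret);

    /*
     * The new checkpoint's address must reach stable storage before any
     * block freed by a deleted checkpoint can be reused; otherwise a crash
     * could leave older checkpoints pointing at overwritten blocks.
     */
    if ((ret = ex_save_ckpt_metadata()) != 0)
        return (ex_block_checkpoint_resolve(1 /* failed */));

    /* Step 2: merge the checkpoint-available list into the live avail list. */
    return (ex_block_checkpoint_resolve(0 /* failed */));
}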
/*
* __ckpt_update --
- * Update a checkpoint.
+ * Update a checkpoint.
*/
static int
-__ckpt_update(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_CKPT *ckptbase, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, bool is_live)
+__ckpt_update(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, WT_CKPT *ckpt,
+ WT_BLOCK_CKPT *ci, bool is_live)
{
- WT_DECL_ITEM(a);
- WT_DECL_RET;
- uint8_t *endp;
+ WT_DECL_ITEM(a);
+ WT_DECL_RET;
+ uint8_t *endp;
#ifdef HAVE_DIAGNOSTIC
- /* Check the extent list combinations for overlaps. */
- WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
- WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
- WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
+ /* Check the extent list combinations for overlaps. */
+ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
+ WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
+ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif
- /*
- * Write the checkpoint's alloc and discard extent lists. After each
- * write, remove any allocated blocks from the system's allocation
- * list, checkpoint extent blocks don't appear on any extent lists.
- */
- WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
- WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
-
- /*
- * If this is the final block, we append an incomplete copy of the
- * checkpoint information to the avail list for standalone retrieval.
- */
- if (is_live) {
- /*
- * Copy the INCOMPLETE checkpoint information into the
- * checkpoint.
- */
- WT_RET(__wt_buf_init(
- session, &ckpt->raw, WT_BLOCK_CHECKPOINT_BUFFER));
- endp = ckpt->raw.mem;
- WT_RET(__wt_block_ckpt_to_buffer(
- session, block, &endp, ci, true));
- ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
-
- /*
- * Convert the INCOMPLETE checkpoint array into its metadata
- * representation. This must match what is eventually written
- * into the metadata file, in other words, everything must be
- * initialized before the block manager does the checkpoint.
- */
- WT_RET(__wt_scr_alloc(session, 8 * 1024, &a));
- ret = __wt_meta_ckptlist_to_meta(session, ckptbase, a);
- if (ret == 0)
- ret = __wt_strndup(
- session, a->data, a->size, &ckpt->block_checkpoint);
- __wt_scr_free(session, &a);
- WT_RET(ret);
- }
-
- /*
- * We only write an avail list for the live system, other checkpoint's
- * avail lists are static and never change.
- *
- * Write the avail list last so it reflects changes due to allocating
- * blocks for the alloc and discard lists. Second, when we write the
- * live system's avail list, it's two lists: the current avail list
- * plus the list of blocks to be made available when the new checkpoint
- * completes. We can't merge that second list into the real list yet,
- * it's not truly available until the new checkpoint locations have been
- * saved to the metadata.
- */
- if (is_live) {
- block->final_ckpt = ckpt;
- ret = __wt_block_extlist_write(
- session, block, &ci->avail, &ci->ckpt_avail);
- block->final_ckpt = NULL;
- WT_RET(ret);
- }
-
- /*
- * Set the file size for the live system.
- *
- * !!!
- * We do NOT set the file size when re-writing checkpoints because we
- * want to test the checkpoint's blocks against a reasonable maximum
- * file size during verification. This is bad: imagine a checkpoint
- * appearing early in the file, re-written, and then the checkpoint
- * requires blocks at the end of the file, blocks after the listed file
- * size. If the application opens that checkpoint for writing
- * (discarding subsequent checkpoints), we would truncate the file to
- * the early chunk, discarding the re-written checkpoint information.
- * The alternative, updating the file size has its own problems, in
- * that case we'd work correctly, but we'd lose all of the blocks
- * between the original checkpoint and the re-written checkpoint.
- * Currently, there's no API to roll-forward intermediate checkpoints,
- * if there ever is, this will need to be fixed.
- */
- if (is_live)
- ci->file_size = block->size;
-
- /* Copy the COMPLETE checkpoint information into the checkpoint. */
- WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BLOCK_CHECKPOINT_BUFFER));
- endp = ckpt->raw.mem;
- WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci, false));
- ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
-
- if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
- __wt_ckpt_verbose(
- session, block, "create", ckpt->name, ckpt->raw.data);
-
- return (0);
+ /*
+ * Write the checkpoint's alloc and discard extent lists. After each write, remove any allocated
+ * blocks from the system's allocation list, checkpoint extent blocks don't appear on any extent
+ * lists.
+ */
+ WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
+ WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
+
+ /*
+ * If this is the final block, we append an incomplete copy of the checkpoint information to the
+ * avail list for standalone retrieval.
+ */
+ if (is_live) {
+ /*
+ * Copy the INCOMPLETE checkpoint information into the checkpoint.
+ */
+ WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BLOCK_CHECKPOINT_BUFFER));
+ endp = ckpt->raw.mem;
+ WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci, true));
+ ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
+
+ /*
+ * Convert the INCOMPLETE checkpoint array into its metadata representation. This must match
+ * what is eventually written into the metadata file, in other words, everything must be
+ * initialized before the block manager does the checkpoint.
+ */
+ WT_RET(__wt_scr_alloc(session, 8 * 1024, &a));
+ ret = __wt_meta_ckptlist_to_meta(session, ckptbase, a);
+ if (ret == 0)
+ ret = __wt_strndup(session, a->data, a->size, &ckpt->block_checkpoint);
+ __wt_scr_free(session, &a);
+ WT_RET(ret);
+ }
+
+ /*
+ * We only write an avail list for the live system, other checkpoints'
+ * avail lists are static and never change.
+ *
+ * Write the avail list last so it reflects changes due to allocating
+ * blocks for the alloc and discard lists. Second, when we write the
+ * live system's avail list, it's two lists: the current avail list
+ * plus the list of blocks to be made available when the new checkpoint
+ * completes. We can't merge that second list into the real list yet,
+ * it's not truly available until the new checkpoint locations have been
+ * saved to the metadata.
+ */
+ if (is_live) {
+ block->final_ckpt = ckpt;
+ ret = __wt_block_extlist_write(session, block, &ci->avail, &ci->ckpt_avail);
+ block->final_ckpt = NULL;
+ WT_RET(ret);
+ }
+
+ /*
+ * Set the file size for the live system.
+ *
+ * !!!
+ * We do NOT set the file size when re-writing checkpoints because we
+ * want to test the checkpoint's blocks against a reasonable maximum
+ * file size during verification. This is bad: imagine a checkpoint
+ * appearing early in the file, re-written, and then the checkpoint
+ * requires blocks at the end of the file, blocks after the listed file
+ * size. If the application opens that checkpoint for writing
+ * (discarding subsequent checkpoints), we would truncate the file to
+ * the early chunk, discarding the re-written checkpoint information.
+ * The alternative, updating the file size has its own problems, in
+ * that case we'd work correctly, but we'd lose all of the blocks
+ * between the original checkpoint and the re-written checkpoint.
+ * Currently, there's no API to roll-forward intermediate checkpoints,
+ * if there ever is, this will need to be fixed.
+ */
+ if (is_live)
+ ci->file_size = block->size;
+
+ /* Copy the COMPLETE checkpoint information into the checkpoint. */
+ WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BLOCK_CHECKPOINT_BUFFER));
+ endp = ckpt->raw.mem;
+ WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci, false));
+ ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ __wt_ckpt_verbose(session, block, "create", ckpt->name, ckpt->raw.data);
+
+ return (0);
}
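Both the incomplete and complete cookies above are built by handing __wt_block_ckpt_to_buffer an end pointer into ckpt->raw.mem and then recording how far it advanced with WT_PTRDIFF. Below is a simplified, self-contained sketch of that moving-end-pointer pattern; the fixed-width packing and names are assumptions for brevity, the real code uses variable-length integer packing.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Toy encoder: packs a raw 64-bit value and advances the end pointer. */
static void
ex_pack_u64(uint8_t **endp, uint64_t value)
{
    memcpy(*endp, &value, sizeof(value));
    *endp += sizeof(value);
}

/* Caller supplies a buffer large enough for three packed values. */
static size_t
ex_build_cookie(uint8_t *buf, uint64_t root_offset, uint64_t root_size, uint64_t checksum)
{
    uint8_t *endp;

    endp = buf;
    ex_pack_u64(&endp, root_offset);
    ex_pack_u64(&endp, root_size);
    ex_pack_u64(&endp, checksum);

    /* Recorded as the cookie size, like WT_PTRDIFF(endp, ckpt->raw.mem). */
    return ((size_t)(endp - buf));
}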
/*
* __wt_block_checkpoint_resolve --
- * Resolve a checkpoint.
+ * Resolve a checkpoint.
*/
int
-__wt_block_checkpoint_resolve(
- WT_SESSION_IMPL *session, WT_BLOCK *block, bool failed)
+__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block, bool failed)
{
- WT_BLOCK_CKPT *ci;
- WT_DECL_RET;
-
- ci = &block->live;
-
- /*
- * Resolve the checkpoint after our caller has written the checkpoint
- * information to stable storage.
- */
- __wt_spin_lock(session, &block->live_lock);
- switch (block->ckpt_state) {
- case WT_CKPT_INPROGRESS:
- /* Something went wrong, but it's recoverable at our level. */
- goto done;
- case WT_CKPT_NONE:
- case WT_CKPT_SALVAGE:
- __wt_err(session, EINVAL,
- "%s: an unexpected checkpoint resolution: the checkpoint "
- "was never started or completed, or configured for salvage",
- block->name);
- ret = __wt_block_panic(session);
- break;
- case WT_CKPT_PANIC_ON_FAILURE:
- if (!failed)
- break;
- __wt_err(session, EINVAL,
- "%s: the checkpoint failed, the system must restart",
- block->name);
- ret = __wt_block_panic(session);
- break;
- }
- WT_ERR(ret);
-
- if ((ret = __wt_block_extlist_merge(
- session, block, &ci->ckpt_avail, &ci->avail)) != 0) {
- __wt_err(session, ret,
- "%s: fatal checkpoint failure during extent list merge",
- block->name);
- ret = __wt_block_panic(session);
- }
- __wt_spin_unlock(session, &block->live_lock);
-
- /* Discard the lists remaining after the checkpoint call. */
- __wt_block_extlist_free(session, &ci->ckpt_avail);
- __wt_block_extlist_free(session, &ci->ckpt_alloc);
- __wt_block_extlist_free(session, &ci->ckpt_discard);
-
- __wt_spin_lock(session, &block->live_lock);
-done: block->ckpt_state = WT_CKPT_NONE;
-err: __wt_spin_unlock(session, &block->live_lock);
-
- return (ret);
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /*
+ * Resolve the checkpoint after our caller has written the checkpoint information to stable
+ * storage.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ switch (block->ckpt_state) {
+ case WT_CKPT_INPROGRESS:
+ /* Something went wrong, but it's recoverable at our level. */
+ goto done;
+ case WT_CKPT_NONE:
+ case WT_CKPT_SALVAGE:
+ __wt_err(session, EINVAL,
+ "%s: an unexpected checkpoint resolution: the checkpoint "
+ "was never started or completed, or configured for salvage",
+ block->name);
+ ret = __wt_block_panic(session);
+ break;
+ case WT_CKPT_PANIC_ON_FAILURE:
+ if (!failed)
+ break;
+ __wt_err(
+ session, EINVAL, "%s: the checkpoint failed, the system must restart", block->name);
+ ret = __wt_block_panic(session);
+ break;
+ }
+ WT_ERR(ret);
+
+ if ((ret = __wt_block_extlist_merge(session, block, &ci->ckpt_avail, &ci->avail)) != 0) {
+ __wt_err(
+ session, ret, "%s: fatal checkpoint failure during extent list merge", block->name);
+ ret = __wt_block_panic(session);
+ }
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Discard the lists remaining after the checkpoint call. */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+
+ __wt_spin_lock(session, &block->live_lock);
+done:
+ block->ckpt_state = WT_CKPT_NONE;
+err:
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
}