summary refs log tree commit diff
path: root/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2019-08-21 05:23:37 +0000
committer evergreen <evergreen@mongodb.com> 2019-08-21 05:23:37 +0000
commit ac41c65f6355f83aac70136324c98561ac79daa1 (patch)
tree a7c3f7ef090b59c6a06838a02c96bd1d49e1c729 /src/third_party/wiredtiger/src/block/block_ckpt_scan.c
parent f54709196711c63a429b71f47c584661286d675f (diff)
download mongo-ac41c65f6355f83aac70136324c98561ac79daa1.tar.gz
Import wiredtiger: 7dfd9391862bc9a6d84868c4dc51689c45a3aacf from branch mongodb-4.4
ref: c809757d8b..7dfd939186
for: 4.3.1
WT-4658 Apply Clang Format
WT-4810 Adding WT_ERR_ASSERT and WT_RET_ASSERT macros
WT-5046 Prepared transactions aren't properly cleared from global table with WT_CONN_LOG_DEBUG_MODE enabled
Diffstat (limited to 'src/third_party/wiredtiger/src/block/block_ckpt_scan.c')
-rw-r--r-- src/third_party/wiredtiger/src/block/block_ckpt_scan.c 664
1 files changed, 322 insertions, 342 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
index 91c82d122f9..b7fda0d73b2 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
@@ -47,373 +47,353 @@
/*
* __wt_block_checkpoint_final --
- * Append metadata and checkpoint information to a buffer.
+ * Append metadata and checkpoint information to a buffer.
*/
int
-__wt_block_checkpoint_final(WT_SESSION_IMPL *session,
- WT_BLOCK *block, WT_ITEM *buf, uint8_t **file_sizep)
+__wt_block_checkpoint_final(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_t **file_sizep)
{
- WT_CKPT *ckpt;
- size_t align_size, file_size_offset, len, size;
- uint8_t *p;
-
- *file_sizep = 0;
-
- ckpt = block->final_ckpt;
- p = (uint8_t *)buf->mem + buf->size;
-
- /*
- * First, add in a counter to uniquely order checkpoints at our level.
- * There's order and time information in the checkpoint itself, but the
- * order isn't written and the time is only at second granularity.
- * I'm using the Btree write generation for this purpose. That's
- * safe and guaranteed correct because everything is locked down for the
- * checkpoint, we're the only writer. Plus, because we use the write
- * generation as a database connection generation, it's guaranteed to
- * move forward and never repeat.
- * It's a layering violation though, this is the only place the
- * block manager uses the write generation. The alternative would be to
- * add our own write-generation scheme in the block manager, storing a
- * value and recovering it when we open the file. We could do that, as
- * reading the final avail list when a file is opened is unavoidable,
- * so we can retrieve the value written here when we open the file, but
- * this approach is simpler.
- */
- size = buf->size + WT_INTPACK64_MAXSIZE;
- WT_RET(__wt_buf_extend(session, buf, size));
- p = (uint8_t *)buf->mem + buf->size;
- WT_RET(__wt_vpack_uint(&p, 0, ++S2BT(session)->write_gen));
- buf->size = WT_PTRDIFF(p, buf->mem);
-
- /*
- * Second, add space for the final file size as a packed value. We don't
- * know how large it will be so skip the maximum required space.
- */
- size = buf->size + WT_INTPACK64_MAXSIZE;
- WT_RET(__wt_buf_extend(session, buf, size));
- p = (uint8_t *)buf->mem + buf->size;
- memset(p, 0, WT_INTPACK64_MAXSIZE);
- file_size_offset = buf->size;
- buf->size = size;
-
- /* 3a, copy the metadata length into the buffer. */
- len = strlen(ckpt->block_metadata);
- size = buf->size + WT_INTPACK64_MAXSIZE;
- WT_RET(__wt_buf_extend(session, buf, size));
- p = (uint8_t *)buf->mem + buf->size;
- WT_RET(__wt_vpack_uint(&p, 0, (uint64_t)len));
- buf->size = WT_PTRDIFF(p, buf->mem);
-
- /* 3b, copy the metadata into the buffer. */
- size = buf->size + len;
- WT_RET(__wt_buf_extend(session, buf, size));
- p = (uint8_t *)buf->mem + buf->size;
- memcpy(p, ckpt->block_metadata, len);
- buf->size = size;
-
- /* 4a, copy the checkpoint list length into the buffer. */
- len = strlen(ckpt->block_checkpoint);
- size = buf->size + WT_INTPACK64_MAXSIZE;
- WT_RET(__wt_buf_extend(session, buf, size));
- p = (uint8_t *)buf->mem + buf->size;
- WT_RET(__wt_vpack_uint(&p, 0, (uint64_t)len));
- buf->size = WT_PTRDIFF(p, buf->mem);
-
- /* 4b, copy the checkpoint list into the buffer. */
- size = buf->size + len;
- WT_RET(__wt_buf_extend(session, buf, size));
- p = (uint8_t *)buf->mem + buf->size;
- memcpy(p, ckpt->block_checkpoint, len);
- buf->size = size;
-
- /*
- * 5a, copy the not-quite-right checkpoint information length into the
- * buffer.
- */
- len = ckpt->raw.size;
- size = buf->size + WT_INTPACK64_MAXSIZE;
- WT_RET(__wt_buf_extend(session, buf, size));
- p = (uint8_t *)buf->mem + buf->size;
- WT_RET(__wt_vpack_uint(&p, 0, (uint64_t)len));
- buf->size = WT_PTRDIFF(p, buf->mem);
-
- /*
- * 5b, copy the not-quite-right checkpoint information into the buffer.
- */
- size = buf->size + len;
- WT_RET(__wt_buf_extend(session, buf, size));
- p = (uint8_t *)buf->mem + buf->size;
- memcpy(p, ckpt->raw.data, len);
- buf->size = size;
-
- /*
- * We might have grown the buffer beyond the original allocation size,
- * make sure that we're still in compliance.
- */
- align_size = WT_ALIGN(buf->size, block->allocsize);
- if (align_size > buf->memsize)
- WT_RET(__wt_buf_extend(session, buf, align_size));
-
- *file_sizep = (uint8_t *)buf->mem + file_size_offset;
-
- return (0);
+ WT_CKPT *ckpt;
+ size_t align_size, file_size_offset, len, size;
+ uint8_t *p;
+
+ *file_sizep = 0;
+
+ ckpt = block->final_ckpt;
+ p = (uint8_t *)buf->mem + buf->size;
+
+ /*
+ * First, add in a counter to uniquely order checkpoints at our level.
+ * There's order and time information in the checkpoint itself, but the
+ * order isn't written and the time is only at second granularity.
+ * I'm using the Btree write generation for this purpose. That's
+ * safe and guaranteed correct because everything is locked down for the
+ * checkpoint, we're the only writer. Plus, because we use the write
+ * generation as a database connection generation, it's guaranteed to
+ * move forward and never repeat.
+ * It's a layering violation though, this is the only place the
+ * block manager uses the write generation. The alternative would be to
+ * add our own write-generation scheme in the block manager, storing a
+ * value and recovering it when we open the file. We could do that, as
+ * reading the final avail list when a file is opened is unavoidable,
+ * so we can retrieve the value written here when we open the file, but
+ * this approach is simpler.
+ */
+ size = buf->size + WT_INTPACK64_MAXSIZE;
+ WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ WT_RET(__wt_vpack_uint(&p, 0, ++S2BT(session)->write_gen));
+ buf->size = WT_PTRDIFF(p, buf->mem);
+
+ /*
+ * Second, add space for the final file size as a packed value. We don't know how large it will
+ * be so skip the maximum required space.
+ */
+ size = buf->size + WT_INTPACK64_MAXSIZE;
+ WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ memset(p, 0, WT_INTPACK64_MAXSIZE);
+ file_size_offset = buf->size;
+ buf->size = size;
+
+ /* 3a, copy the metadata length into the buffer. */
+ len = strlen(ckpt->block_metadata);
+ size = buf->size + WT_INTPACK64_MAXSIZE;
+ WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ WT_RET(__wt_vpack_uint(&p, 0, (uint64_t)len));
+ buf->size = WT_PTRDIFF(p, buf->mem);
+
+ /* 3b, copy the metadata into the buffer. */
+ size = buf->size + len;
+ WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ memcpy(p, ckpt->block_metadata, len);
+ buf->size = size;
+
+ /* 4a, copy the checkpoint list length into the buffer. */
+ len = strlen(ckpt->block_checkpoint);
+ size = buf->size + WT_INTPACK64_MAXSIZE;
+ WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ WT_RET(__wt_vpack_uint(&p, 0, (uint64_t)len));
+ buf->size = WT_PTRDIFF(p, buf->mem);
+
+ /* 4b, copy the checkpoint list into the buffer. */
+ size = buf->size + len;
+ WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ memcpy(p, ckpt->block_checkpoint, len);
+ buf->size = size;
+
+ /*
+ * 5a, copy the not-quite-right checkpoint information length into the
+ * buffer.
+ */
+ len = ckpt->raw.size;
+ size = buf->size + WT_INTPACK64_MAXSIZE;
+ WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ WT_RET(__wt_vpack_uint(&p, 0, (uint64_t)len));
+ buf->size = WT_PTRDIFF(p, buf->mem);
+
+ /*
+ * 5b, copy the not-quite-right checkpoint information into the buffer.
+ */
+ size = buf->size + len;
+ WT_RET(__wt_buf_extend(session, buf, size));
+ p = (uint8_t *)buf->mem + buf->size;
+ memcpy(p, ckpt->raw.data, len);
+ buf->size = size;
+
+ /*
+ * We might have grown the buffer beyond the original allocation size, make sure that we're
+ * still in compliance.
+ */
+ align_size = WT_ALIGN(buf->size, block->allocsize);
+ if (align_size > buf->memsize)
+ WT_RET(__wt_buf_extend(session, buf, align_size));
+
+ *file_sizep = (uint8_t *)buf->mem + file_size_offset;
+
+ return (0);
}
struct saved_block_info {
- uint64_t write_gen;
- wt_off_t offset;
- uint32_t size;
- uint32_t checksum;
- uint64_t file_size;
+ uint64_t write_gen;
+ wt_off_t offset;
+ uint32_t size;
+ uint32_t checksum;
+ uint64_t file_size;
- char *metadata;
- char *checkpoint_list;
+ char *metadata;
+ char *checkpoint_list;
- WT_ITEM *checkpoint;
+ WT_ITEM *checkpoint;
};
/*
* __block_checkpoint_update --
- * Update the checkpoint information for the file.
+ * Update the checkpoint information for the file.
*/
static int
-__block_checkpoint_update(
- WT_SESSION_IMPL *session, WT_BLOCK *block, struct saved_block_info *info)
+__block_checkpoint_update(WT_SESSION_IMPL *session, WT_BLOCK *block, struct saved_block_info *info)
{
- WT_BLOCK_CKPT ci;
- WT_ITEM *checkpoint;
- uint8_t *endp;
-
- memset(&ci, 0, sizeof(ci));
- checkpoint = info->checkpoint;
-
- if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
- __wt_ckpt_verbose(
- session, block, "import original", NULL, checkpoint->mem);
-
- /*
- * Convert the final checkpoint data blob to a WT_BLOCK_CKPT structure,
- * update it with the avail list information, and convert it back to a
- * data blob.
- */
- WT_RET(__wt_block_buffer_to_ckpt(
- session, block, checkpoint->data, &ci));
- ci.avail.offset = info->offset;
- ci.avail.size = info->size;
- ci.avail.checksum = info->checksum;
- ci.file_size = (wt_off_t)info->file_size;
- WT_RET(__wt_buf_extend(
- session, checkpoint, WT_BLOCK_CHECKPOINT_BUFFER));
- endp = checkpoint->mem;
- WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, &ci, false));
- checkpoint->size = WT_PTRDIFF(endp, checkpoint->mem);
-
- if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
- __wt_ckpt_verbose(
- session, block, "import replace", NULL, checkpoint->mem);
-
- return (0);
+ WT_BLOCK_CKPT ci;
+ WT_ITEM *checkpoint;
+ uint8_t *endp;
+
+ memset(&ci, 0, sizeof(ci));
+ checkpoint = info->checkpoint;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ __wt_ckpt_verbose(session, block, "import original", NULL, checkpoint->mem);
+
+ /*
+ * Convert the final checkpoint data blob to a WT_BLOCK_CKPT structure, update it with the avail
+ * list information, and convert it back to a data blob.
+ */
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, checkpoint->data, &ci));
+ ci.avail.offset = info->offset;
+ ci.avail.size = info->size;
+ ci.avail.checksum = info->checksum;
+ ci.file_size = (wt_off_t)info->file_size;
+ WT_RET(__wt_buf_extend(session, checkpoint, WT_BLOCK_CHECKPOINT_BUFFER));
+ endp = checkpoint->mem;
+ WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, &ci, false));
+ checkpoint->size = WT_PTRDIFF(endp, checkpoint->mem);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ __wt_ckpt_verbose(session, block, "import replace", NULL, checkpoint->mem);
+
+ return (0);
}
-#define WT_BLOCK_SKIP(a) do { \
- if ((a) != 0) \
- continue; \
-} while (0)
+#define WT_BLOCK_SKIP(a) \
+ do { \
+ if ((a) != 0) \
+ continue; \
+ } while (0)
/*
* __wt_block_checkpoint_last --
- * Scan a file for checkpoints, returning the last one we find.
+ * Scan a file for checkpoints, returning the last one we find.
*/
int
-__wt_block_checkpoint_last(WT_SESSION_IMPL *session, WT_BLOCK *block,
- char **metadatap, char **checkpoint_listp, WT_ITEM *checkpoint)
+__wt_block_checkpoint_last(WT_SESSION_IMPL *session, WT_BLOCK *block, char **metadatap,
+ char **checkpoint_listp, WT_ITEM *checkpoint)
{
- struct saved_block_info *best, _best, *current, _current, *saved_tmp;
- WT_BLOCK_HEADER *blk;
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- WT_FH *fh;
- const WT_PAGE_HEADER *dsk;
- wt_off_t ext_off, ext_size, offset;
- uint64_t len, nblocks, write_gen;
- uint32_t checksum, size;
- const uint8_t *p, *t;
- bool found;
-
- *metadatap = *checkpoint_listp = NULL;
- WT_RET(__wt_buf_init(session, checkpoint, WT_BLOCK_CHECKPOINT_BUFFER));
-
- /*
- * Initialize a pair of structures that track the best and current
- * checkpoints found so far. This is a little trickier than normal
- * because we don't want to start saving a checkpoint only to find
- * out it's not one we can use. I doubt that can happen and it
- * suggests corruption, but half-a-checkpoint isn't a good place to
- * be. Only swap to a new "best" checkpoint if we read the whole
- * thing successfully.
- *
- * Don't re-order these lines: it's done this way so the WT_ITEMs
- * are always initialized and error handling works.
- */
- memset((best = &_best), 0, sizeof(_best));
- memset((current = &_current), 0, sizeof(_current));
- WT_ERR(__wt_scr_alloc(session, 0, &best->checkpoint));
- WT_ERR(__wt_scr_alloc(session, 0, &current->checkpoint));
-
- found = false;
- ext_off = 0; /* [-Werror=maybe-uninitialized] */
- ext_size = 0;
- len = write_gen = 0;
-
- WT_ERR(__wt_scr_alloc(session, 64 * 1024, &tmp));
-
- F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
-
- /*
- * Scan the file for pages, using the minimum possible WiredTiger
- * allocation size.
- */
- fh = block->fh;
- for (nblocks = 0, offset = 0; offset < block->size; offset += size) {
- /* Report progress occasionally. */
-#define WT_CHECKPOINT_LIST_PROGRESS_INTERVAL 100
- if (++nblocks % WT_CHECKPOINT_LIST_PROGRESS_INTERVAL == 0)
- WT_ERR(__wt_progress(session, NULL, nblocks));
-
- /*
- * Read the start of a possible page and get a block length from
- * it. Move to the next allocation sized boundary, we'll never
- * consider this one again.
- */
- if ((ret = __wt_read(session, fh,
- offset, (size_t)WT_BTREE_MIN_ALLOC_SIZE, tmp->mem)) != 0)
- break;
- blk = WT_BLOCK_HEADER_REF(tmp->mem);
- __wt_block_header_byteswap(blk);
- size = blk->disk_size;
- checksum = blk->checksum;
-
- /*
- * Check the block size: if it's not insane, read the block.
- * Reading the block validates any checksum. The file might
- * reasonably have garbage at the end, and we're not here to
- * detect that. Ignore problems, subsequent file verification
- * can deal with any corruption. If the block isn't valid,
- * skip to the next possible block.
- */
- if (__wt_block_offset_invalid(block, offset, size) ||
- __wt_block_read_off(
- session, block, tmp, offset, size, checksum) != 0) {
- size = WT_BTREE_MIN_ALLOC_SIZE;
- continue;
- }
-
- dsk = tmp->mem;
- if (dsk->type != WT_PAGE_BLOCK_MANAGER)
- continue;
-
- p = WT_BLOCK_HEADER_BYTE(tmp->mem);
- WT_BLOCK_SKIP(__wt_extlist_read_pair(&p, &ext_off, &ext_size));
- if (ext_off != WT_BLOCK_EXTLIST_MAGIC || ext_size != 0)
- continue;
- for (;;) {
- if ((ret = __wt_extlist_read_pair(
- &p, &ext_off, &ext_size)) != 0)
- break;
- if (ext_off == WT_BLOCK_INVALID_OFFSET)
- break;
- }
- if (ret != 0) {
- WT_NOT_READ(ret, 0);
- continue;
- }
- /*
- * Note the less-than check of WT_BLOCK_EXTLIST_VERSION_CKPT,
- * that way we can extend this with additional values in the
- * future.
- */
- if (ext_size < WT_BLOCK_EXTLIST_VERSION_CKPT)
- continue;
-
- /*
- * Skip any entries that aren't the most recent we've seen so
- * far.
- */
- WT_BLOCK_SKIP(__wt_vunpack_uint(&p, 0, &write_gen));
- if (write_gen < best->write_gen)
- continue;
-
- __wt_verbose(session, WT_VERB_CHECKPOINT,
- "scan: checkpoint block at offset %" PRIuMAX
- ", generation #%" PRIu64,
- (uintmax_t)offset, write_gen);
-
- current->write_gen = write_gen;
- current->offset = offset;
- current->size = size;
- current->checksum = checksum;
-
- /*
- * The file size is in a fixed-size chunk of data, although it's
- * packed (for portability).
- */
- t = p;
- WT_BLOCK_SKIP(__wt_vunpack_uint(&t, 0, &current->file_size));
- p += WT_INTPACK64_MAXSIZE;
-
- /* Save a copy of the metadata. */
- __wt_free(session, current->metadata);
- WT_BLOCK_SKIP(__wt_vunpack_uint(&p, 0, &len));
- WT_ERR(__wt_strndup(session, p, len, &current->metadata));
- p += len;
-
- /* Save a copy of the checkpoint list. */
- __wt_free(session, current->checkpoint_list);
- WT_BLOCK_SKIP(__wt_vunpack_uint(&p, 0, &len));
- WT_ERR(__wt_strndup(
- session, p, len, &current->checkpoint_list));
- p += len;
-
- /* Save a copy of the checkpoint information. */
- WT_BLOCK_SKIP(__wt_vunpack_uint(&p, 0, &len));
- WT_ERR(__wt_buf_set(session, current->checkpoint, p, len));
-
- /* A new winner, swap the "best" and "current" information. */
- saved_tmp = best;
- best = current;
- current = saved_tmp;
- found = true;
- }
-
- if (!found)
- WT_ERR_MSG(session, WT_NOTFOUND,
- "%s: no final checkpoint found in file scan",
- block->name);
-
- /* Correct the checkpoint. */
- WT_ERR(__block_checkpoint_update(session, block, best));
-
- /*
- * Copy the information out to our caller. Do the WT_ITEM first, it's
- * the only thing left that can fail and simplifies error handling.
- */
- WT_ERR(__wt_buf_set(session,
- checkpoint, best->checkpoint->data, best->checkpoint->size));
- *metadatap = best->metadata;
- best->metadata = NULL;
- *checkpoint_listp = best->checkpoint_list;
- best->checkpoint_list = NULL;
+ struct saved_block_info *best, _best, *current, _current, *saved_tmp;
+ WT_BLOCK_HEADER *blk;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_FH *fh;
+ const WT_PAGE_HEADER *dsk;
+ wt_off_t ext_off, ext_size, offset;
+ uint64_t len, nblocks, write_gen;
+ uint32_t checksum, size;
+ const uint8_t *p, *t;
+ bool found;
+
+ *metadatap = *checkpoint_listp = NULL;
+ WT_RET(__wt_buf_init(session, checkpoint, WT_BLOCK_CHECKPOINT_BUFFER));
+
+ /*
+ * Initialize a pair of structures that track the best and current
+ * checkpoints found so far. This is a little trickier than normal
+ * because we don't want to start saving a checkpoint only to find
+ * out it's not one we can use. I doubt that can happen and it
+ * suggests corruption, but half-a-checkpoint isn't a good place to
+ * be. Only swap to a new "best" checkpoint if we read the whole
+ * thing successfully.
+ *
+ * Don't re-order these lines: it's done this way so the WT_ITEMs
+ * are always initialized and error handling works.
+ */
+ memset((best = &_best), 0, sizeof(_best));
+ memset((current = &_current), 0, sizeof(_current));
+ WT_ERR(__wt_scr_alloc(session, 0, &best->checkpoint));
+ WT_ERR(__wt_scr_alloc(session, 0, &current->checkpoint));
+
+ found = false;
+ ext_off = 0; /* [-Werror=maybe-uninitialized] */
+ ext_size = 0;
+ len = write_gen = 0;
+
+ WT_ERR(__wt_scr_alloc(session, 64 * 1024, &tmp));
+
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
+
+ /*
+ * Scan the file for pages, using the minimum possible WiredTiger allocation size.
+ */
+ fh = block->fh;
+ for (nblocks = 0, offset = 0; offset < block->size; offset += size) {
+/* Report progress occasionally. */
+#define WT_CHECKPOINT_LIST_PROGRESS_INTERVAL 100
+ if (++nblocks % WT_CHECKPOINT_LIST_PROGRESS_INTERVAL == 0)
+ WT_ERR(__wt_progress(session, NULL, nblocks));
+
+ /*
+ * Read the start of a possible page and get a block length from it. Move to the next
+ * allocation sized boundary, we'll never consider this one again.
+ */
+ if ((ret = __wt_read(session, fh, offset, (size_t)WT_BTREE_MIN_ALLOC_SIZE, tmp->mem)) != 0)
+ break;
+ blk = WT_BLOCK_HEADER_REF(tmp->mem);
+ __wt_block_header_byteswap(blk);
+ size = blk->disk_size;
+ checksum = blk->checksum;
+
+ /*
+ * Check the block size: if it's not insane, read the block. Reading the block validates any
+ * checksum. The file might reasonably have garbage at the end, and we're not here to detect
+ * that. Ignore problems, subsequent file verification can deal with any corruption. If the
+ * block isn't valid, skip to the next possible block.
+ */
+ if (__wt_block_offset_invalid(block, offset, size) ||
+ __wt_block_read_off(session, block, tmp, offset, size, checksum) != 0) {
+ size = WT_BTREE_MIN_ALLOC_SIZE;
+ continue;
+ }
+
+ dsk = tmp->mem;
+ if (dsk->type != WT_PAGE_BLOCK_MANAGER)
+ continue;
+
+ p = WT_BLOCK_HEADER_BYTE(tmp->mem);
+ WT_BLOCK_SKIP(__wt_extlist_read_pair(&p, &ext_off, &ext_size));
+ if (ext_off != WT_BLOCK_EXTLIST_MAGIC || ext_size != 0)
+ continue;
+ for (;;) {
+ if ((ret = __wt_extlist_read_pair(&p, &ext_off, &ext_size)) != 0)
+ break;
+ if (ext_off == WT_BLOCK_INVALID_OFFSET)
+ break;
+ }
+ if (ret != 0) {
+ WT_NOT_READ(ret, 0);
+ continue;
+ }
+ /*
+ * Note the less-than check of WT_BLOCK_EXTLIST_VERSION_CKPT, that way we can extend this
+ * with additional values in the future.
+ */
+ if (ext_size < WT_BLOCK_EXTLIST_VERSION_CKPT)
+ continue;
+
+ /*
+ * Skip any entries that aren't the most recent we've seen so far.
+ */
+ WT_BLOCK_SKIP(__wt_vunpack_uint(&p, 0, &write_gen));
+ if (write_gen < best->write_gen)
+ continue;
+
+ __wt_verbose(session, WT_VERB_CHECKPOINT,
+ "scan: checkpoint block at offset %" PRIuMAX ", generation #%" PRIu64, (uintmax_t)offset,
+ write_gen);
+
+ current->write_gen = write_gen;
+ current->offset = offset;
+ current->size = size;
+ current->checksum = checksum;
+
+ /*
+ * The file size is in a fixed-size chunk of data, although it's packed (for portability).
+ */
+ t = p;
+ WT_BLOCK_SKIP(__wt_vunpack_uint(&t, 0, &current->file_size));
+ p += WT_INTPACK64_MAXSIZE;
+
+ /* Save a copy of the metadata. */
+ __wt_free(session, current->metadata);
+ WT_BLOCK_SKIP(__wt_vunpack_uint(&p, 0, &len));
+ WT_ERR(__wt_strndup(session, p, len, &current->metadata));
+ p += len;
+
+ /* Save a copy of the checkpoint list. */
+ __wt_free(session, current->checkpoint_list);
+ WT_BLOCK_SKIP(__wt_vunpack_uint(&p, 0, &len));
+ WT_ERR(__wt_strndup(session, p, len, &current->checkpoint_list));
+ p += len;
+
+ /* Save a copy of the checkpoint information. */
+ WT_BLOCK_SKIP(__wt_vunpack_uint(&p, 0, &len));
+ WT_ERR(__wt_buf_set(session, current->checkpoint, p, len));
+
+ /* A new winner, swap the "best" and "current" information. */
+ saved_tmp = best;
+ best = current;
+ current = saved_tmp;
+ found = true;
+ }
+
+ if (!found)
+ WT_ERR_MSG(session, WT_NOTFOUND, "%s: no final checkpoint found in file scan", block->name);
+
+ /* Correct the checkpoint. */
+ WT_ERR(__block_checkpoint_update(session, block, best));
+
+ /*
+ * Copy the information out to our caller. Do the WT_ITEM first, it's the only thing left that
+ * can fail and simplifies error handling.
+ */
+ WT_ERR(__wt_buf_set(session, checkpoint, best->checkpoint->data, best->checkpoint->size));
+ *metadatap = best->metadata;
+ best->metadata = NULL;
+ *checkpoint_listp = best->checkpoint_list;
+ best->checkpoint_list = NULL;
err:
- __wt_free(session, best->metadata);
- __wt_free(session, best->checkpoint_list);
- __wt_scr_free(session, &best->checkpoint);
- __wt_free(session, current->metadata);
- __wt_free(session, current->checkpoint_list);
- __wt_scr_free(session, &current->checkpoint);
+ __wt_free(session, best->metadata);
+ __wt_free(session, best->checkpoint_list);
+ __wt_scr_free(session, &best->checkpoint);
+ __wt_free(session, current->metadata);
+ __wt_free(session, current->checkpoint_list);
+ __wt_scr_free(session, &current->checkpoint);
- __wt_scr_free(session, &tmp);
+ __wt_scr_free(session, &tmp);
- F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
- return (ret);
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
+ return (ret);
}