summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/block
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2021-01-27 13:28:13 +1100
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-01-27 03:07:52 +0000
commit 77d134c7a2bab846aaaa056b0883888a8219da2a (patch)
tree 8caac27a3573139d221fdf93ba9c99b3e042bc78 /src/third_party/wiredtiger/src/block
parent 5fad1f69662696c5b789392622aa34d370fb4825 (diff)
download mongo-77d134c7a2bab846aaaa056b0883888a8219da2a.tar.gz
Import wiredtiger: a52cd5a47a7e9af9e2c341e66f0ffdd9bc977930 from branch mongodb-4.4
ref: ef1f2937c3..a52cd5a47a
for: 4.4.4

WT-6309 Add support for start/stop arguments to wt printlog command
WT-6866 Refactor python backup tests initial base class
WT-6924 Queue history store pages for urgent eviction when cache pressure is high
WT-6946 Adding test tags to an initial set of test programs
WT-7068 Add column store support to test_hs03
WT-7084 Fix assert in test code and a comment error
WT-7109 Retain no longer supported configuration options for backward compatibility
WT-7113 Integrate prototype tiered storage code into WT
WT-7114 Revert Makefile code to always run the prototype script
Diffstat (limited to 'src/third_party/wiredtiger/src/block')
-rw-r--r-- src/third_party/wiredtiger/src/block/block_addr.c 81
-rw-r--r-- src/third_party/wiredtiger/src/block/block_ckpt.c 51
-rw-r--r-- src/third_party/wiredtiger/src/block/block_ckpt_scan.c 7
-rw-r--r-- src/third_party/wiredtiger/src/block/block_compact.c 4
-rw-r--r-- src/third_party/wiredtiger/src/block/block_ext.c 36
-rw-r--r-- src/third_party/wiredtiger/src/block/block_open.c 15
-rw-r--r-- src/third_party/wiredtiger/src/block/block_read.c 91
-rw-r--r-- src/third_party/wiredtiger/src/block/block_slvg.c 17
-rw-r--r-- src/third_party/wiredtiger/src/block/block_vrfy.c 4
-rw-r--r-- src/third_party/wiredtiger/src/block/block_write.c 26
10 files changed, 232 insertions, 100 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c
index f9bd4248642..312db74a24c 100644
--- a/src/third_party/wiredtiger/src/block/block_addr.c
+++ b/src/third_party/wiredtiger/src/block/block_addr.c
@@ -14,11 +14,15 @@
* reference so it can be called repeatedly to load a buffer.
*/
static int
-__block_buffer_to_addr(
- uint32_t allocsize, const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump)
+__block_buffer_to_addr(WT_BLOCK *block, const uint8_t **pp, uint32_t *logidp, wt_off_t *offsetp,
+ uint32_t *sizep, uint32_t *checksump)
{
- uint64_t o, s, c;
+ uint64_t l, o, s, c;
+ if (block->log_structured)
+ WT_RET(__wt_vunpack_uint(pp, 0, &l));
+ else
+ l = 0;
WT_RET(__wt_vunpack_uint(pp, 0, &o));
WT_RET(__wt_vunpack_uint(pp, 0, &s));
WT_RET(__wt_vunpack_uint(pp, 0, &c));
@@ -37,10 +41,11 @@ __block_buffer_to_addr(
*/
if (s == 0) {
*offsetp = 0;
- *sizep = *checksump = 0;
+ *logidp = *sizep = *checksump = 0;
} else {
- *offsetp = (wt_off_t)(o + 1) * allocsize;
- *sizep = (uint32_t)s * allocsize;
+ *logidp = (uint32_t)l;
+ *offsetp = (wt_off_t)(o + 1) * block->allocsize;
+ *sizep = (uint32_t)s * block->allocsize;
*checksump = (uint32_t)c;
}
return (0);
@@ -52,19 +57,22 @@ __block_buffer_to_addr(
*/
int
__wt_block_addr_to_buffer(
- WT_BLOCK *block, uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t checksum)
+ WT_BLOCK *block, uint8_t **pp, uint32_t logid, wt_off_t offset, uint32_t size, uint32_t checksum)
{
- uint64_t o, s, c;
+ uint64_t l, o, s, c;
/* See the comment above: this is the reverse operation. */
if (size == 0) {
o = WT_BLOCK_INVALID_OFFSET;
- s = c = 0;
+ l = s = c = 0;
} else {
+ l = logid;
o = (uint64_t)offset / block->allocsize - 1;
s = size / block->allocsize;
c = checksum;
}
+ if (block->log_structured)
+ WT_RET(__wt_vpack_uint(pp, 0, l));
WT_RET(__wt_vpack_uint(pp, 0, o));
WT_RET(__wt_vpack_uint(pp, 0, s));
WT_RET(__wt_vpack_uint(pp, 0, c));
@@ -77,10 +85,10 @@ __wt_block_addr_to_buffer(
* reference.
*/
int
-__wt_block_buffer_to_addr(
- WT_BLOCK *block, const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump)
+__wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, uint32_t *logidp, wt_off_t *offsetp,
+ uint32_t *sizep, uint32_t *checksump)
{
- return (__block_buffer_to_addr(block->allocsize, &p, offsetp, sizep, checksump));
+ return (__block_buffer_to_addr(block, &p, logidp, offsetp, sizep, checksump));
}
/*
@@ -92,14 +100,14 @@ __wt_block_addr_invalid(
WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool live)
{
wt_off_t offset;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
WT_UNUSED(session);
WT_UNUSED(addr_size);
WT_UNUSED(live);
/* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum));
#ifdef HAVE_DIAGNOSTIC
/*
@@ -111,7 +119,7 @@ __wt_block_addr_invalid(
#endif
/* Check if the address is past the end of the file. */
- return (offset + size > block->size ? EINVAL : 0);
+ return (logid == block->logid && offset + size > block->size ? EINVAL : 0);
}
/*
@@ -123,15 +131,16 @@ __wt_block_addr_string(
WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
wt_off_t offset;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
WT_UNUSED(addr_size);
/* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum));
/* Printable representation. */
- WT_RET(__wt_buf_fmt(session, buf, "[%" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ WT_RET(__wt_buf_fmt(session, buf,
+ "[%" PRIu32 ": %" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", logid,
(uintmax_t)offset, (uintmax_t)offset + size, size, checksum));
return (0);
@@ -143,7 +152,7 @@ __wt_block_addr_string(
*/
static int
__block_buffer_to_ckpt(
- WT_SESSION_IMPL *session, uint32_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci)
+ WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
{
uint64_t a;
const uint8_t **pp;
@@ -153,14 +162,14 @@ __block_buffer_to_ckpt(
WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
pp = &p;
- WT_RET(
- __block_buffer_to_addr(allocsize, pp, &ci->root_offset, &ci->root_size, &ci->root_checksum));
WT_RET(__block_buffer_to_addr(
- allocsize, pp, &ci->alloc.offset, &ci->alloc.size, &ci->alloc.checksum));
+ block, pp, &ci->root_logid, &ci->root_offset, &ci->root_size, &ci->root_checksum));
WT_RET(__block_buffer_to_addr(
- allocsize, pp, &ci->avail.offset, &ci->avail.size, &ci->avail.checksum));
+ block, pp, &ci->alloc.logid, &ci->alloc.offset, &ci->alloc.size, &ci->alloc.checksum));
WT_RET(__block_buffer_to_addr(
- allocsize, pp, &ci->discard.offset, &ci->discard.size, &ci->discard.checksum));
+ block, pp, &ci->avail.logid, &ci->avail.offset, &ci->avail.size, &ci->avail.checksum));
+ WT_RET(__block_buffer_to_addr(block, pp, &ci->discard.logid, &ci->discard.offset,
+ &ci->discard.size, &ci->discard.checksum));
WT_RET(__wt_vunpack_uint(pp, 0, &a));
ci->file_size = (wt_off_t)a;
WT_RET(__wt_vunpack_uint(pp, 0, &a));
@@ -177,7 +186,7 @@ int
__wt_block_buffer_to_ckpt(
WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
{
- return (__block_buffer_to_ckpt(session, block->allocsize, p, ci));
+ return (__block_buffer_to_ckpt(session, block, p, ci));
}
/*
@@ -185,13 +194,13 @@ __wt_block_buffer_to_ckpt(
* Convert a checkpoint cookie into its components, external utility version.
*/
int
-__wt_block_ckpt_decode(WT_SESSION *wt_session, size_t allocsize, const uint8_t *p,
- WT_BLOCK_CKPT *ci) WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+__wt_block_ckpt_decode(WT_SESSION *wt_session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)wt_session;
- return (__block_buffer_to_ckpt(session, (uint32_t)allocsize, p, ci));
+ return (__block_buffer_to_ckpt(session, block, p, ci));
}
/*
@@ -203,6 +212,9 @@ __wt_block_ckpt_to_buffer(
WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci, bool skip_avail)
{
uint64_t a;
+ uint32_t logid;
+
+ logid = block->logid;
if (ci->version != WT_BM_CHECKPOINT_VERSION)
WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
@@ -210,16 +222,17 @@ __wt_block_ckpt_to_buffer(
(*pp)[0] = ci->version;
(*pp)++;
- WT_RET(__wt_block_addr_to_buffer(block, pp, ci->root_offset, ci->root_size, ci->root_checksum));
- WT_RET(
- __wt_block_addr_to_buffer(block, pp, ci->alloc.offset, ci->alloc.size, ci->alloc.checksum));
+ WT_RET(__wt_block_addr_to_buffer(
+ block, pp, logid, ci->root_offset, ci->root_size, ci->root_checksum));
+ WT_RET(__wt_block_addr_to_buffer(
+ block, pp, logid, ci->alloc.offset, ci->alloc.size, ci->alloc.checksum));
if (skip_avail)
- WT_RET(__wt_block_addr_to_buffer(block, pp, 0, 0, 0));
+ WT_RET(__wt_block_addr_to_buffer(block, pp, 0, 0, 0, 0));
else
WT_RET(__wt_block_addr_to_buffer(
- block, pp, ci->avail.offset, ci->avail.size, ci->avail.checksum));
+ block, pp, logid, ci->avail.offset, ci->avail.size, ci->avail.checksum));
WT_RET(__wt_block_addr_to_buffer(
- block, pp, ci->discard.offset, ci->discard.size, ci->discard.checksum));
+ block, pp, logid, ci->discard.offset, ci->discard.size, ci->discard.checksum));
a = (uint64_t)ci->file_size;
WT_RET(__wt_vpack_uint(pp, 0, a));
a = ci->ckpt_size;
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
index cdabd131e40..ceb9cf39262 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -95,8 +95,13 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint
if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
endp = root_addr;
WT_ERR(__wt_block_addr_to_buffer(
- block, &endp, ci->root_offset, ci->root_size, ci->root_checksum));
+ block, &endp, ci->root_logid, ci->root_offset, ci->root_size, ci->root_checksum));
*root_addr_sizep = WT_PTRDIFF(endp, root_addr);
+
+ if (block->log_structured) {
+ block->logid = ci->root_logid;
+ WT_ERR(__wt_block_newfile(session, block));
+ }
}
/*
@@ -113,7 +118,7 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint
* the end of the file, that was done when the checkpoint was first written (re-writing the
* checkpoint might possibly make it relevant here, but it's unlikely enough I don't bother).
*/
- if (!checkpoint)
+ if (!checkpoint && !block->log_structured)
WT_ERR(__wt_block_truncate(session, block, ci->file_size));
if (0) {
@@ -237,10 +242,10 @@ __wt_block_checkpoint(
*/
if (buf == NULL) {
ci->root_offset = WT_BLOCK_INVALID_OFFSET;
- ci->root_size = ci->root_checksum = 0;
+ ci->root_logid = ci->root_size = ci->root_checksum = 0;
} else
- WT_ERR(__wt_block_write_off(session, block, buf, &ci->root_offset, &ci->root_size,
- &ci->root_checksum, data_checksum, true, false));
+ WT_ERR(__wt_block_write_off(session, block, buf, &ci->root_logid, &ci->root_offset,
+ &ci->root_size, &ci->root_checksum, data_checksum, true, false));
/*
* Checkpoints are potentially reading/writing/merging lots of blocks, pre-allocate structures
@@ -463,6 +468,37 @@ __ckpt_add_blk_mods_ext(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_BLOCK_CK
}
/*
+ * __wt_block_newfile --
+ * Switch a log-structured block object to a new file. Increments block->logid, closes the
+ * current handle in block->fh and opens (with the create flag) a file named
+ * "<block->name>.<logid>", the log ID zero-padded to eight digits, using block->file_flags.
+ * It then calls __wt_desc_write on the new handle, sets block->size to block->allocsize and
+ * destroys and re-initializes the "live" checkpoint structure. Returns 0 or the first error.
+ *
+ * NOTE(review): block->logid is incremented before any fallible call, and block->fh is closed
+ * before the replacement is opened, so an error return leaves the bumped ID in place and
+ * presumably no usable handle -- confirm callers treat a failure here as fatal.
+ */
+int
+__wt_block_newfile(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ const char *filename;
+
+ /* Bump to a new file ID; the new ID becomes the suffix of the file name built below. */
+ ++block->logid;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "%s.%08" PRIu32, block->name, block->logid));
+ filename = tmp->data;
+ WT_ERR(__wt_close(session, &block->fh));
+ WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA,
+ WT_FS_OPEN_CREATE | block->file_flags, &block->fh));
+ WT_ERR(__wt_desc_write(session, block->fh, block->allocsize));
+
+ block->size = block->allocsize;
+ __wt_block_ckpt_destroy(session, &block->live);
+ WT_ERR(__wt_block_ckpt_init(session, &block->live, "live"));
+
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
+}
+
+/*
* __ckpt_process --
* Process the list of checkpoints.
*/
@@ -610,7 +646,7 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
* lists, and the freed blocks will then be included when writing the live extent lists.
*/
WT_CKPT_FOREACH (ckptbase, ckpt) {
- if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE) || block->log_structured)
continue;
if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
@@ -744,6 +780,9 @@ live_update:
ci->ckpt_discard = ci->discard;
WT_ERR(__wt_block_extlist_init(session, &ci->discard, "live", "discard", false));
+ if (block->log_structured)
+ WT_ERR(__wt_block_newfile(session, block));
+
#ifdef HAVE_DIAGNOSTIC
/*
* The first checkpoint in the system should always have an empty discard list. If we've read
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
index 4d47c4301b2..2320076752a 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
@@ -220,13 +220,16 @@ __wt_block_checkpoint_last(WT_SESSION_IMPL *session, WT_BLOCK *block, char **met
const WT_PAGE_HEADER *dsk;
wt_off_t ext_off, ext_size, offset;
uint64_t len, nblocks, write_gen;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
const uint8_t *p, *t;
bool found;
*metadatap = *checkpoint_listp = NULL;
WT_RET(__wt_buf_init(session, checkpoint, WT_BLOCK_CHECKPOINT_BUFFER));
+ /* TODO: scan all log IDs. */
+ logid = 0;
+
/*
* Initialize a pair of structures that track the best and current checkpoints found so far.
* This is a little trickier than normal because we don't want to start saving a checkpoint only
@@ -279,7 +282,7 @@ __wt_block_checkpoint_last(WT_SESSION_IMPL *session, WT_BLOCK *block, char **met
* block isn't valid, skip to the next possible block.
*/
if (__wt_block_offset_invalid(block, offset, size) ||
- __wt_block_read_off(session, block, tmp, offset, size, checksum) != 0) {
+ __wt_block_read_off(session, block, tmp, logid, offset, size, checksum) != 0) {
size = WT_BTREE_MIN_ALLOC_SIZE;
continue;
}
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
index 19ee0b664f6..6615d4e192d 100644
--- a/src/third_party/wiredtiger/src/block/block_compact.c
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -136,13 +136,13 @@ __wt_block_compact_page_skip(
WT_EXT *ext;
WT_EXTLIST *el;
wt_off_t limit, offset;
- uint32_t size, checksum;
+ uint32_t checksum, logid, size;
WT_UNUSED(addr_size);
*skipp = true; /* Return a default skip. */
/* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum));
/*
* If this block is in the chosen percentage of the file and there's a block on the available
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
index 06c44680f1a..d0dbfa97646 100644
--- a/src/third_party/wiredtiger/src/block/block_ext.c
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -380,7 +380,7 @@ corrupt:
/*
* __wt_block_off_remove_overlap --
- * Remove a range from an extent list, where the range may be part of a overlapping entry.
+ * Remove a range from an extent list, where the range may be part of an overlapping entry.
*/
int
__wt_block_off_remove_overlap(
@@ -565,24 +565,28 @@ __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr,
{
WT_DECL_RET;
wt_off_t offset;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
WT_UNUSED(addr_size);
WT_STAT_DATA_INCR(session, block_free);
/* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum));
- __wt_verbose(
- session, WT_VERB_BLOCK, "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)offset, (intmax_t)size);
+ __wt_verbose(session, WT_VERB_BLOCK, "free %" PRIu32 ": %" PRIdMAX "/%" PRIdMAX, logid,
+ (intmax_t)offset, (intmax_t)size);
#ifdef HAVE_DIAGNOSTIC
WT_RET(__wt_block_misplaced(session, block, "free", offset, size, true, __func__, __LINE__));
#endif
- WT_RET(__wt_block_ext_prealloc(session, 5));
- __wt_spin_lock(session, &block->live_lock);
- ret = __wt_block_off_free(session, block, offset, (wt_off_t)size);
- __wt_spin_unlock(session, &block->live_lock);
+ if (logid == block->logid) {
+ WT_RET(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_off_free(session, block, logid, offset, (wt_off_t)size);
+ __wt_spin_unlock(session, &block->live_lock);
+ } else {
+ /* TODO: update stats about older files to drive garbage collection. */
+ }
return (ret);
}
@@ -592,13 +596,18 @@ __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr,
* Free a file range to the underlying file.
*/
int
-__wt_block_off_free(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
+__wt_block_off_free(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t logid, wt_off_t offset, wt_off_t size)
{
WT_DECL_RET;
/* If a sync is running, no other sessions can free blocks. */
WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
+ /* TODO: track stats for old files to drive garbage collection. */
+ if (logid != block->logid)
+ return (0);
+
/*
* Callers of this function are expected to have already acquired any locks required to
* manipulate the extent lists.
@@ -1096,7 +1105,7 @@ __wt_block_extlist_read(
return (0);
WT_RET(__wt_scr_alloc(session, el->size, &tmp));
- WT_ERR(__wt_block_read_off(session, block, tmp, el->offset, el->size, el->checksum));
+ WT_ERR(__wt_block_read_off(session, block, tmp, el->logid, el->offset, el->size, el->checksum));
p = WT_BLOCK_HEADER_BYTE(tmp->mem);
WT_ERR(__wt_extlist_read_pair(&p, &off, &size));
@@ -1156,7 +1165,7 @@ __wt_block_extlist_write(
WT_EXT *ext;
WT_PAGE_HEADER *dsk;
size_t size;
- uint32_t entries;
+ uint32_t logid, entries;
uint8_t *p;
WT_RET(__block_extlist_dump(session, block, el, "write"));
@@ -1214,7 +1223,8 @@ __wt_block_extlist_write(
/* Write the extent list to disk. */
WT_ERR(__wt_block_write_off(
- session, block, tmp, &el->offset, &el->size, &el->checksum, true, true, true));
+ session, block, tmp, &logid, &el->offset, &el->size, &el->checksum, true, true, true));
+ WT_UNUSED(logid); /* TODO check */
/*
* Remove the allocated blocks from the system's allocation list, extent blocks never appear on
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
index d97f3a86f7d..5eba665ea8d 100644
--- a/src/third_party/wiredtiger/src/block/block_open.c
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -33,6 +33,8 @@ __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename, uint32
int suffix;
bool exists;
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+
/*
* Create the underlying file and open a handle.
*
@@ -46,8 +48,6 @@ __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename, uint32
break;
WT_ERR_TEST(ret != EEXIST, ret, false);
- if (tmp == NULL)
- WT_ERR(__wt_scr_alloc(session, 0, &tmp));
for (suffix = 1;; ++suffix) {
WT_ERR(__wt_buf_fmt(session, tmp, "%s.%d", filename, suffix));
WT_ERR(__wt_fs_exist(session, tmp->data, &exists));
@@ -91,6 +91,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
uint64_t bucket;
+ u_int i;
conn = S2C(session);
bucket = block->name_hash & (conn->hash_size - 1);
@@ -98,6 +99,12 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
__wt_free(session, block->name);
+ if (block->log_structured && block->lfh != NULL) {
+ for (i = 0; i < block->max_logid; i++)
+ WT_TRET(__wt_close(session, &block->lfh[i]));
+ __wt_free(session, block->lfh);
+ }
+
if (block->fh != NULL)
WT_TRET(__wt_close(session, &block->fh));
@@ -175,6 +182,7 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[
WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval));
block->allocfirst = WT_STRING_MATCH("first", cval.str, cval.len);
+ block->log_structured = WT_STRING_MATCH("log-structured", cval.str, cval.len);
/* Configuration: optional OS buffer cache maximum size. */
WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval));
@@ -203,7 +211,8 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[
LF_SET(WT_FS_OPEN_DIRECTIO);
if (!readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_DATA))
LF_SET(WT_FS_OPEN_DIRECTIO);
- WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, flags, &block->fh));
+ block->file_flags = flags;
+ WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, block->file_flags, &block->fh));
/* Set the file's size. */
WT_ERR(__wt_filesize(session, block->fh, &block->size));
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c
index 8680f1f90f8..c5e3a1c193c 100644
--- a/src/third_party/wiredtiger/src/block/block_read.c
+++ b/src/third_party/wiredtiger/src/block/block_read.c
@@ -18,9 +18,10 @@ __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t
WT_BLOCK *block;
WT_DECL_ITEM(tmp);
WT_DECL_RET;
+ WT_FH *fh;
WT_FILE_HANDLE *handle;
wt_off_t offset;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
bool mapped;
block = bm->block;
@@ -28,9 +29,10 @@ __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t
WT_STAT_CONN_INCR(session, block_preload);
/* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum));
- handle = block->fh->handle;
+ WT_RET(__wt_block_fh(session, block, logid, &fh));
+ handle = fh->handle;
mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
if (mapped && handle->fh_map_preload != NULL)
ret = handle->fh_map_preload(
@@ -59,21 +61,23 @@ __wt_bm_read(
{
WT_BLOCK *block;
WT_DECL_RET;
+ WT_FH *fh;
WT_FILE_HANDLE *handle;
wt_off_t offset;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
bool mapped;
WT_UNUSED(addr_size);
block = bm->block;
/* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum));
/*
* Map the block if it's possible.
*/
- handle = block->fh->handle;
+ WT_RET(__wt_block_fh(session, block, logid, &fh));
+ handle = fh->handle;
mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
if (mapped && handle->fh_map_preload != NULL) {
buf->data = (uint8_t *)bm->map + offset;
@@ -96,7 +100,7 @@ __wt_bm_read(
#endif
/* Read the block. */
__wt_capacity_throttle(session, size, WT_THROTTLE_READ);
- WT_RET(__wt_block_read_off(session, block, buf, offset, size, checksum));
+ WT_RET(__wt_block_read_off(session, block, buf, logid, offset, size, checksum));
/* Optionally discard blocks from the system's buffer cache. */
WT_RET(__wt_block_discard(session, block, (size_t)size));
@@ -109,17 +113,17 @@ __wt_bm_read(
* Dump a block into the log in 1KB chunks.
*/
static int
-__wt_bm_corrupt_dump(WT_SESSION_IMPL *session, WT_ITEM *buf, wt_off_t offset, uint32_t size,
- uint32_t checksum) WT_GCC_FUNC_ATTRIBUTE((cold))
+__wt_bm_corrupt_dump(WT_SESSION_IMPL *session, WT_ITEM *buf, uint32_t logid, wt_off_t offset,
+ uint32_t size, uint32_t checksum) WT_GCC_FUNC_ATTRIBUTE((cold))
{
WT_DECL_ITEM(tmp);
WT_DECL_RET;
size_t chunk, i, nchunks;
-#define WT_CORRUPT_FMT "{%" PRIuMAX ", %" PRIu32 ", %#" PRIx32 "}"
+#define WT_CORRUPT_FMT "{%" PRIu32 ": %" PRIuMAX ", %" PRIu32 ", %#" PRIx32 "}"
if (buf->size == 0) {
- __wt_errx(session, WT_CORRUPT_FMT ": empty buffer, no dump available", (uintmax_t)offset,
- size, checksum);
+ __wt_errx(session, WT_CORRUPT_FMT ": empty buffer, no dump available", logid,
+ (uintmax_t)offset, size, checksum);
return (0);
}
@@ -130,7 +134,7 @@ __wt_bm_corrupt_dump(WT_SESSION_IMPL *session, WT_ITEM *buf, wt_off_t offset, ui
WT_ERR(__wt_buf_catfmt(session, tmp, "%02x ", ((uint8_t *)buf->data)[i]));
if (++i == buf->size || i % 1024 == 0) {
__wt_errx(session,
- WT_CORRUPT_FMT ": (chunk %" WT_SIZET_FMT " of %" WT_SIZET_FMT "): %.*s",
+ WT_CORRUPT_FMT ": (chunk %" WT_SIZET_FMT " of %" WT_SIZET_FMT "): %.*s", logid,
(uintmax_t)offset, size, checksum, ++chunk, nchunks, (int)tmp->size,
(char *)tmp->data);
if (i == buf->size)
@@ -154,15 +158,15 @@ __wt_bm_corrupt(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t
WT_DECL_ITEM(tmp);
WT_DECL_RET;
wt_off_t offset;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
/* Read the block. */
WT_RET(__wt_scr_alloc(session, 0, &tmp));
WT_ERR(__wt_bm_read(bm, session, tmp, addr, addr_size));
/* Crack the cookie, dump the block. */
- WT_ERR(__wt_block_buffer_to_addr(bm->block, addr, &offset, &size, &checksum));
- WT_ERR(__wt_bm_corrupt_dump(session, tmp, offset, size, checksum));
+ WT_ERR(__wt_block_buffer_to_addr(bm->block, addr, &logid, &offset, &size, &checksum));
+ WT_ERR(__wt_bm_corrupt_dump(session, tmp, logid, offset, size, checksum));
err:
__wt_scr_free(session, &tmp);
@@ -203,14 +207,60 @@ err:
#endif
/*
+ * __wt_block_fh --
+ * Get the file handle to use for a given log ID. Returns block->fh when the block is not
+ * log-structured or when logid is the block's current ID. Otherwise returns the handle cached
+ * in block->lfh[logid], growing the array as needed and, on first use, opening the file with
+ * the read-only flag and caching the handle. Log ID 0 maps to block->name itself; any other ID
+ * maps to "<block->name>.<logid>" with the ID zero-padded to eight digits. On success *fhp is
+ * set and 0 is returned; otherwise an error is returned.
+ *
+ * NOTE(review): the handle array is read and grown without locking (see the TODOs in the
+ * body) -- confirm callers serialize access until the locks are added.
+ */
+int
+__wt_block_fh(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t logid, WT_FH **fhp)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ const char *filename;
+
+ if (!block->log_structured || logid == block->logid) {
+ *fhp = block->fh;
+ return (0);
+ }
+
+ /* TODO: fh readlock. Fast path: the slot lies within the allocated array and is populated. */
+ if (logid * sizeof(WT_FILE_HANDLE *) < block->lfh_alloc && (*fhp = block->lfh[logid]) != NULL)
+ return (0);
+
+ /* TODO: fh writelock */
+ /* Ensure the array goes far enough to hold a slot for this log ID. */
+ WT_RET(__wt_realloc_def(session, &block->lfh_alloc, logid + 1, &block->lfh));
+ if (logid >= block->max_logid)
+ block->max_logid = logid + 1;
+ if ((*fhp = block->lfh[logid]) != NULL)
+ return (0);
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ if (logid == 0)
+ filename = block->name;
+ else {
+ WT_ERR(__wt_buf_fmt(session, tmp, "%s.%08" PRIu32, block->name, logid));
+ filename = tmp->data;
+ }
+ WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA,
+ WT_FS_OPEN_READONLY | block->file_flags, &block->lfh[logid]));
+ *fhp = block->lfh[logid];
+ WT_ASSERT(session, *fhp != NULL);
+
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
+}
+
+/*
* __wt_block_read_off --
* Read an addr/size pair referenced block into a buffer.
*/
int
-__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset,
- uint32_t size, uint32_t checksum)
+__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t logid,
+ wt_off_t offset, uint32_t size, uint32_t checksum)
{
WT_BLOCK_HEADER *blk, swap;
+ WT_FH *fh;
size_t bufsize;
__wt_verbose(session, WT_VERB_READ, "off %" PRIuMAX ", size %" PRIu32 ", checksum %#" PRIx32,
@@ -243,7 +293,8 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_
block->name, size, block->allocsize);
WT_RET(__wt_buf_init(session, buf, bufsize));
- WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
+ WT_RET(__wt_block_fh(session, block, logid, &fh));
+ WT_RET(__wt_read(session, fh, offset, size, buf->mem));
buf->size = size;
/*
@@ -276,7 +327,7 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_
block->name, size, (uintmax_t)offset, swap.checksum, checksum);
if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
- WT_IGNORE_RET(__wt_bm_corrupt_dump(session, buf, offset, size, checksum));
+ WT_IGNORE_RET(__wt_bm_corrupt_dump(session, buf, logid, offset, size, checksum));
/* Panic if a checksum fails during an ordinary read. */
F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c
index 8a632e892b4..857f15d7848 100644
--- a/src/third_party/wiredtiger/src/block/block_slvg.c
+++ b/src/third_party/wiredtiger/src/block/block_slvg.c
@@ -104,11 +104,14 @@ __wt_block_salvage_next(
WT_DECL_RET;
WT_FH *fh;
wt_off_t max, offset;
- uint32_t allocsize, checksum, size;
+ uint32_t allocsize, checksum, logid, size;
uint8_t *endp;
*eofp = 0;
+ /* FIXME: salvage across all chunks in a log-structured tree. */
+ logid = 0;
+
fh = block->fh;
allocsize = block->allocsize;
WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));
@@ -137,19 +140,19 @@ __wt_block_salvage_next(
* otherwise, move past it.
*/
if (!__wt_block_offset_invalid(block, offset, size) &&
- __wt_block_read_off(session, block, tmp, offset, size, checksum) == 0)
+ __wt_block_read_off(session, block, tmp, logid, offset, size, checksum) == 0)
break;
/* Free the allocation-size block. */
__wt_verbose(session, WT_VERB_SALVAGE, "skipping %" PRIu32 "B at file offset %" PRIuMAX,
allocsize, (uintmax_t)offset);
- WT_ERR(__wt_block_off_free(session, block, offset, (wt_off_t)allocsize));
+ WT_ERR(__wt_block_off_free(session, block, logid, offset, (wt_off_t)allocsize));
block->slvg_off += allocsize;
}
/* Re-create the address cookie that should reference this block. */
endp = addr;
- WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
+ WT_ERR(__wt_block_addr_to_buffer(block, &endp, logid, offset, size, checksum));
*addr_sizep = WT_PTRDIFF(endp, addr);
done:
@@ -167,7 +170,7 @@ __wt_block_salvage_valid(
WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, bool valid)
{
wt_off_t offset;
- uint32_t size, checksum;
+ uint32_t size, logid, checksum;
WT_UNUSED(addr_size);
@@ -175,11 +178,11 @@ __wt_block_salvage_valid(
* Crack the cookie. If the upper layer took the block, move past it; if the upper layer
* rejected the block, move past an allocation size chunk and free it.
*/
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum));
if (valid)
block->slvg_off = offset + size;
else {
- WT_RET(__wt_block_off_free(session, block, offset, (wt_off_t)block->allocsize));
+ WT_RET(__wt_block_off_free(session, block, logid, offset, (wt_off_t)block->allocsize));
block->slvg_off = offset + block->allocsize;
}
diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c
index 57bba04e5da..0a434aebfce 100644
--- a/src/third_party/wiredtiger/src/block/block_vrfy.c
+++ b/src/third_party/wiredtiger/src/block/block_vrfy.c
@@ -316,12 +316,12 @@ __wt_block_verify_addr(
WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
{
wt_off_t offset;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
WT_UNUSED(addr_size);
/* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum));
/* Add to the per-file list. */
WT_RET(__verify_filefrag_add(session, block, NULL, offset, size, false));
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
index c5890efd038..bafee786c5b 100644
--- a/src/third_party/wiredtiger/src/block/block_write.c
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -189,14 +189,14 @@ __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_
size_t *addr_sizep, bool data_checksum, bool checkpoint_io)
{
wt_off_t offset;
- uint32_t checksum, size;
+ uint32_t checksum, logid, size;
uint8_t *endp;
WT_RET(__wt_block_write_off(
- session, block, buf, &offset, &size, &checksum, data_checksum, checkpoint_io, false));
+ session, block, buf, &logid, &offset, &size, &checksum, data_checksum, checkpoint_io, false));
endp = addr;
- WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
+ WT_RET(__wt_block_addr_to_buffer(block, &endp, logid, offset, size, checksum));
*addr_sizep = WT_PTRDIFF(endp, addr);
return (0);
@@ -207,15 +207,16 @@ __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_
* Write a buffer into a block, returning the block's offset, size and checksum.
*/
static int
-__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp,
- uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io, bool caller_locked)
+__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t *logidp,
+ wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io,
+ bool caller_locked)
{
WT_BLOCK_HEADER *blk;
WT_DECL_RET;
WT_FH *fh;
wt_off_t offset;
size_t align_size;
- uint32_t checksum;
+ uint32_t checksum, logid;
uint8_t *file_sizep;
bool local_locked;
@@ -224,6 +225,7 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of
*checksump = 0; /* -Werror=maybe-uninitialized */
fh = block->fh;
+ logid = block->logid;
/* Buffers should be aligned for writing. */
if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
@@ -325,7 +327,7 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of
if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
if (!caller_locked)
__wt_spin_lock(session, &block->live_lock);
- WT_TRET(__wt_block_off_free(session, block, offset, (wt_off_t)align_size));
+ WT_TRET(__wt_block_off_free(session, block, logid, offset, (wt_off_t)align_size));
if (!caller_locked)
__wt_spin_unlock(session, &block->live_lock);
WT_RET(ret);
@@ -359,6 +361,7 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of
__wt_verbose(session, WT_VERB_WRITE, "off %" PRIuMAX ", size %" PRIuMAX ", checksum %#" PRIx32,
(uintmax_t)offset, (uintmax_t)align_size, checksum);
+ *logidp = logid;
*offsetp = offset;
*sizep = WT_STORE_SIZE(align_size);
*checksump = checksum;
@@ -371,8 +374,9 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of
* Write a buffer into a block, returning the block's offset, size and checksum.
*/
int
-__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp,
- uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io, bool caller_locked)
+__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t *logidp,
+ wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io,
+ bool caller_locked)
{
WT_DECL_RET;
@@ -382,8 +386,8 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt
* never see anything other than their original content.
*/
__wt_page_header_byteswap(buf->mem);
- ret = __block_write_off(
- session, block, buf, offsetp, sizep, checksump, data_checksum, checkpoint_io, caller_locked);
+ ret = __block_write_off(session, block, buf, logidp, offsetp, sizep, checksump, data_checksum,
+ checkpoint_io, caller_locked);
__wt_page_header_byteswap(buf->mem);
return (ret);
}