diff options
author | Luke Chen <luke.chen@mongodb.com> | 2021-01-27 13:28:13 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-01-27 03:07:52 +0000 |
commit | 77d134c7a2bab846aaaa056b0883888a8219da2a (patch) | |
tree | 8caac27a3573139d221fdf93ba9c99b3e042bc78 /src/third_party/wiredtiger/src/block | |
parent | 5fad1f69662696c5b789392622aa34d370fb4825 (diff) | |
download | mongo-77d134c7a2bab846aaaa056b0883888a8219da2a.tar.gz |
Import wiredtiger: a52cd5a47a7e9af9e2c341e66f0ffdd9bc977930 from branch mongodb-4.4
ref: ef1f2937c3..a52cd5a47a
for: 4.4.4
WT-6309 Add support for start/stop arguments to wt printlog command
WT-6866 Refactor python backup tests initial base class
WT-6924 Queue history store pages for urgent eviction when cache pressure is high
WT-6946 Adding test tags to an initial set of test programs
WT-7068 Add column store support to test_hs03
WT-7084 Fix assert in test code and a comment error
WT-7109 Retain no longer supported configuration options for backward compatibility
WT-7113 Integrate prototype tiered storage code into WT
WT-7114 Revert Makefile code to always run the prototype script
Diffstat (limited to 'src/third_party/wiredtiger/src/block')
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_addr.c | 81 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_ckpt.c | 51 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_ckpt_scan.c | 7 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_compact.c | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_ext.c | 36 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_open.c | 15 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_read.c | 91 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_slvg.c | 17 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_vrfy.c | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_write.c | 26 |
10 files changed, 232 insertions, 100 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c index f9bd4248642..312db74a24c 100644 --- a/src/third_party/wiredtiger/src/block/block_addr.c +++ b/src/third_party/wiredtiger/src/block/block_addr.c @@ -14,11 +14,15 @@ * reference so it can be called repeatedly to load a buffer. */ static int -__block_buffer_to_addr( - uint32_t allocsize, const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump) +__block_buffer_to_addr(WT_BLOCK *block, const uint8_t **pp, uint32_t *logidp, wt_off_t *offsetp, + uint32_t *sizep, uint32_t *checksump) { - uint64_t o, s, c; + uint64_t l, o, s, c; + if (block->log_structured) + WT_RET(__wt_vunpack_uint(pp, 0, &l)); + else + l = 0; WT_RET(__wt_vunpack_uint(pp, 0, &o)); WT_RET(__wt_vunpack_uint(pp, 0, &s)); WT_RET(__wt_vunpack_uint(pp, 0, &c)); @@ -37,10 +41,11 @@ __block_buffer_to_addr( */ if (s == 0) { *offsetp = 0; - *sizep = *checksump = 0; + *logidp = *sizep = *checksump = 0; } else { - *offsetp = (wt_off_t)(o + 1) * allocsize; - *sizep = (uint32_t)s * allocsize; + *logidp = (uint32_t)l; + *offsetp = (wt_off_t)(o + 1) * block->allocsize; + *sizep = (uint32_t)s * block->allocsize; *checksump = (uint32_t)c; } return (0); @@ -52,19 +57,22 @@ __block_buffer_to_addr( */ int __wt_block_addr_to_buffer( - WT_BLOCK *block, uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t checksum) + WT_BLOCK *block, uint8_t **pp, uint32_t logid, wt_off_t offset, uint32_t size, uint32_t checksum) { - uint64_t o, s, c; + uint64_t l, o, s, c; /* See the comment above: this is the reverse operation. */ if (size == 0) { o = WT_BLOCK_INVALID_OFFSET; - s = c = 0; + l = s = c = 0; } else { + l = logid; o = (uint64_t)offset / block->allocsize - 1; s = size / block->allocsize; c = checksum; } + if (block->log_structured) + WT_RET(__wt_vpack_uint(pp, 0, l)); WT_RET(__wt_vpack_uint(pp, 0, o)); WT_RET(__wt_vpack_uint(pp, 0, s)); WT_RET(__wt_vpack_uint(pp, 0, c)); @@ -77,10 +85,10 @@ __wt_block_addr_to_buffer( * reference. */ int -__wt_block_buffer_to_addr( - WT_BLOCK *block, const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump) +__wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, uint32_t *logidp, wt_off_t *offsetp, + uint32_t *sizep, uint32_t *checksump) { - return (__block_buffer_to_addr(block->allocsize, &p, offsetp, sizep, checksump)); + return (__block_buffer_to_addr(block, &p, logidp, offsetp, sizep, checksump)); } /* @@ -92,14 +100,14 @@ __wt_block_addr_invalid( WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool live) { wt_off_t offset; - uint32_t checksum, size; + uint32_t checksum, logid, size; WT_UNUSED(session); WT_UNUSED(addr_size); WT_UNUSED(live); /* Crack the cookie. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum)); #ifdef HAVE_DIAGNOSTIC /* @@ -111,7 +119,7 @@ __wt_block_addr_invalid( #endif /* Check if the address is past the end of the file. */ - return (offset + size > block->size ? EINVAL : 0); + return (logid == block->logid && offset + size > block->size ? EINVAL : 0); } /* @@ -123,15 +131,16 @@ __wt_block_addr_string( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) { wt_off_t offset; - uint32_t checksum, size; + uint32_t checksum, logid, size; WT_UNUSED(addr_size); /* Crack the cookie. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum)); /* Printable representation. */ - WT_RET(__wt_buf_fmt(session, buf, "[%" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + WT_RET(__wt_buf_fmt(session, buf, + "[%" PRIu32 ": %" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", logid, (uintmax_t)offset, (uintmax_t)offset + size, size, checksum)); return (0); @@ -143,7 +152,7 @@ __wt_block_addr_string( */ static int __block_buffer_to_ckpt( - WT_SESSION_IMPL *session, uint32_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci) + WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci) { uint64_t a; const uint8_t **pp; @@ -153,14 +162,14 @@ __block_buffer_to_ckpt( WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version"); pp = &p; - WT_RET( - __block_buffer_to_addr(allocsize, pp, &ci->root_offset, &ci->root_size, &ci->root_checksum)); WT_RET(__block_buffer_to_addr( - allocsize, pp, &ci->alloc.offset, &ci->alloc.size, &ci->alloc.checksum)); + block, pp, &ci->root_logid, &ci->root_offset, &ci->root_size, &ci->root_checksum)); WT_RET(__block_buffer_to_addr( - allocsize, pp, &ci->avail.offset, &ci->avail.size, &ci->avail.checksum)); + block, pp, &ci->alloc.logid, &ci->alloc.offset, &ci->alloc.size, &ci->alloc.checksum)); WT_RET(__block_buffer_to_addr( - allocsize, pp, &ci->discard.offset, &ci->discard.size, &ci->discard.checksum)); + block, pp, &ci->avail.logid, &ci->avail.offset, &ci->avail.size, &ci->avail.checksum)); + WT_RET(__block_buffer_to_addr(block, pp, &ci->discard.logid, &ci->discard.offset, + &ci->discard.size, &ci->discard.checksum)); WT_RET(__wt_vunpack_uint(pp, 0, &a)); ci->file_size = (wt_off_t)a; WT_RET(__wt_vunpack_uint(pp, 0, &a)); @@ -177,7 +186,7 @@ int __wt_block_buffer_to_ckpt( WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci) { - return (__block_buffer_to_ckpt(session, block->allocsize, p, ci)); + return (__block_buffer_to_ckpt(session, block, p, ci)); } /* @@ -185,13 +194,13 @@ __wt_block_buffer_to_ckpt( * Convert a checkpoint cookie into its components, external utility version. */ int -__wt_block_ckpt_decode(WT_SESSION *wt_session, size_t allocsize, const uint8_t *p, - WT_BLOCK_CKPT *ci) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +__wt_block_ckpt_decode(WT_SESSION *wt_session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; - return (__block_buffer_to_ckpt(session, (uint32_t)allocsize, p, ci)); + return (__block_buffer_to_ckpt(session, block, p, ci)); } /* @@ -203,6 +212,9 @@ __wt_block_ckpt_to_buffer( WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci, bool skip_avail) { uint64_t a; + uint32_t logid; + + logid = block->logid; if (ci->version != WT_BM_CHECKPOINT_VERSION) WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version"); @@ -210,16 +222,17 @@ __wt_block_ckpt_to_buffer( (*pp)[0] = ci->version; (*pp)++; - WT_RET(__wt_block_addr_to_buffer(block, pp, ci->root_offset, ci->root_size, ci->root_checksum)); - WT_RET( - __wt_block_addr_to_buffer(block, pp, ci->alloc.offset, ci->alloc.size, ci->alloc.checksum)); + WT_RET(__wt_block_addr_to_buffer( + block, pp, logid, ci->root_offset, ci->root_size, ci->root_checksum)); + WT_RET(__wt_block_addr_to_buffer( + block, pp, logid, ci->alloc.offset, ci->alloc.size, ci->alloc.checksum)); if (skip_avail) - WT_RET(__wt_block_addr_to_buffer(block, pp, 0, 0, 0)); + WT_RET(__wt_block_addr_to_buffer(block, pp, 0, 0, 0, 0)); else WT_RET(__wt_block_addr_to_buffer( - block, pp, ci->avail.offset, ci->avail.size, ci->avail.checksum)); + block, pp, logid, ci->avail.offset, ci->avail.size, ci->avail.checksum)); WT_RET(__wt_block_addr_to_buffer( - block, pp, ci->discard.offset, ci->discard.size, ci->discard.checksum)); + block, pp, logid, ci->discard.offset, ci->discard.size, ci->discard.checksum)); a = (uint64_t)ci->file_size; WT_RET(__wt_vpack_uint(pp, 0, a)); a = ci->ckpt_size; diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index cdabd131e40..ceb9cf39262 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -95,8 +95,13 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) { endp = root_addr; WT_ERR(__wt_block_addr_to_buffer( - block, &endp, ci->root_offset, ci->root_size, ci->root_checksum)); + block, &endp, ci->root_logid, ci->root_offset, ci->root_size, ci->root_checksum)); *root_addr_sizep = WT_PTRDIFF(endp, root_addr); + + if (block->log_structured) { + block->logid = ci->root_logid; + WT_ERR(__wt_block_newfile(session, block)); + } } /* @@ -113,7 +118,7 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint * the end of the file, that was done when the checkpoint was first written (re-writing the * checkpoint might possibly make it relevant here, but it's unlikely enough I don't bother). */ - if (!checkpoint) + if (!checkpoint && !block->log_structured) WT_ERR(__wt_block_truncate(session, block, ci->file_size)); if (0) { @@ -237,10 +242,10 @@ __wt_block_checkpoint( */ if (buf == NULL) { ci->root_offset = WT_BLOCK_INVALID_OFFSET; - ci->root_size = ci->root_checksum = 0; + ci->root_logid = ci->root_size = ci->root_checksum = 0; } else - WT_ERR(__wt_block_write_off(session, block, buf, &ci->root_offset, &ci->root_size, - &ci->root_checksum, data_checksum, true, false)); + WT_ERR(__wt_block_write_off(session, block, buf, &ci->root_logid, &ci->root_offset, + &ci->root_size, &ci->root_checksum, data_checksum, true, false)); /* * Checkpoints are potentially reading/writing/merging lots of blocks, pre-allocate structures @@ -463,6 +468,37 @@ __ckpt_add_blk_mods_ext(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_BLOCK_CK } /* + * __wt_block_newfile -- + * Switch a log-structured block object to a new file. + */ +int +__wt_block_newfile(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + const char *filename; + + /* Bump to a new file ID. */ + ++block->logid; + + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_buf_fmt(session, tmp, "%s.%08" PRIu32, block->name, block->logid)); + filename = tmp->data; + WT_ERR(__wt_close(session, &block->fh)); + WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, + WT_FS_OPEN_CREATE | block->file_flags, &block->fh)); + WT_ERR(__wt_desc_write(session, block->fh, block->allocsize)); + + block->size = block->allocsize; + __wt_block_ckpt_destroy(session, &block->live); + WT_ERR(__wt_block_ckpt_init(session, &block->live, "live")); + +err: + __wt_scr_free(session, &tmp); + return (ret); +} + +/* * __ckpt_process -- * Process the list of checkpoints. */ @@ -610,7 +646,7 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) * lists, and the freed blocks will then be included when writing the live extent lists. */ WT_CKPT_FOREACH (ckptbase, ckpt) { - if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) + if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE) || block->log_structured) continue; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) @@ -744,6 +780,9 @@ live_update: ci->ckpt_discard = ci->discard; WT_ERR(__wt_block_extlist_init(session, &ci->discard, "live", "discard", false)); + if (block->log_structured) + WT_ERR(__wt_block_newfile(session, block)); + #ifdef HAVE_DIAGNOSTIC /* * The first checkpoint in the system should always have an empty discard list. If we've read diff --git a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c index 4d47c4301b2..2320076752a 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c @@ -220,13 +220,16 @@ __wt_block_checkpoint_last(WT_SESSION_IMPL *session, WT_BLOCK *block, char **met const WT_PAGE_HEADER *dsk; wt_off_t ext_off, ext_size, offset; uint64_t len, nblocks, write_gen; - uint32_t checksum, size; + uint32_t checksum, logid, size; const uint8_t *p, *t; bool found; *metadatap = *checkpoint_listp = NULL; WT_RET(__wt_buf_init(session, checkpoint, WT_BLOCK_CHECKPOINT_BUFFER)); + /* TODO: scan all log IDs. */ + logid = 0; + /* * Initialize a pair of structures that track the best and current checkpoints found so far. * This is a little trickier than normal because we don't want to start saving a checkpoint only @@ -279,7 +282,7 @@ __wt_block_checkpoint_last(WT_SESSION_IMPL *session, WT_BLOCK *block, char **met * block isn't valid, skip to the next possible block. */ if (__wt_block_offset_invalid(block, offset, size) || - __wt_block_read_off(session, block, tmp, offset, size, checksum) != 0) { + __wt_block_read_off(session, block, tmp, logid, offset, size, checksum) != 0) { size = WT_BTREE_MIN_ALLOC_SIZE; continue; } diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c index 19ee0b664f6..6615d4e192d 100644 --- a/src/third_party/wiredtiger/src/block/block_compact.c +++ b/src/third_party/wiredtiger/src/block/block_compact.c @@ -136,13 +136,13 @@ __wt_block_compact_page_skip( WT_EXT *ext; WT_EXTLIST *el; wt_off_t limit, offset; - uint32_t size, checksum; + uint32_t checksum, logid, size; WT_UNUSED(addr_size); *skipp = true; /* Return a default skip. */ /* Crack the cookie. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum)); /* * If this block is in the chosen percentage of the file and there's a block on the available diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c index 06c44680f1a..d0dbfa97646 100644 --- a/src/third_party/wiredtiger/src/block/block_ext.c +++ b/src/third_party/wiredtiger/src/block/block_ext.c @@ -380,7 +380,7 @@ corrupt: /* * __wt_block_off_remove_overlap -- - * Remove a range from an extent list, where the range may be part of a overlapping entry. + * Remove a range from an extent list, where the range may be part of an overlapping entry. */ int __wt_block_off_remove_overlap( @@ -565,24 +565,28 @@ __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, { WT_DECL_RET; wt_off_t offset; - uint32_t checksum, size; + uint32_t checksum, logid, size; WT_UNUSED(addr_size); WT_STAT_DATA_INCR(session, block_free); /* Crack the cookie. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum)); - __wt_verbose( - session, WT_VERB_BLOCK, "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)offset, (intmax_t)size); + __wt_verbose(session, WT_VERB_BLOCK, "free %" PRIu32 ": %" PRIdMAX "/%" PRIdMAX, logid, + (intmax_t)offset, (intmax_t)size); #ifdef HAVE_DIAGNOSTIC WT_RET(__wt_block_misplaced(session, block, "free", offset, size, true, __func__, __LINE__)); #endif - WT_RET(__wt_block_ext_prealloc(session, 5)); - __wt_spin_lock(session, &block->live_lock); - ret = __wt_block_off_free(session, block, offset, (wt_off_t)size); - __wt_spin_unlock(session, &block->live_lock); + if (logid == block->logid) { + WT_RET(__wt_block_ext_prealloc(session, 5)); + __wt_spin_lock(session, &block->live_lock); + ret = __wt_block_off_free(session, block, logid, offset, (wt_off_t)size); + __wt_spin_unlock(session, &block->live_lock); + } else { + /* TODO: update stats about older files to drive garbage collection. */ + } return (ret); } @@ -592,13 +596,18 @@ __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, * Free a file range to the underlying file. */ int -__wt_block_off_free(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size) +__wt_block_off_free( + WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t logid, wt_off_t offset, wt_off_t size) { WT_DECL_RET; /* If a sync is running, no other sessions can free blocks. */ WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session))); + /* TODO: track stats for old files to drive garbage collection. */ + if (logid != block->logid) + return (0); + /* * Callers of this function are expected to have already acquired any locks required to * manipulate the extent lists. @@ -1096,7 +1105,7 @@ __wt_block_extlist_read( return (0); WT_RET(__wt_scr_alloc(session, el->size, &tmp)); - WT_ERR(__wt_block_read_off(session, block, tmp, el->offset, el->size, el->checksum)); + WT_ERR(__wt_block_read_off(session, block, tmp, el->logid, el->offset, el->size, el->checksum)); p = WT_BLOCK_HEADER_BYTE(tmp->mem); WT_ERR(__wt_extlist_read_pair(&p, &off, &size)); @@ -1156,7 +1165,7 @@ __wt_block_extlist_write( WT_EXT *ext; WT_PAGE_HEADER *dsk; size_t size; - uint32_t entries; + uint32_t logid, entries; uint8_t *p; WT_RET(__block_extlist_dump(session, block, el, "write")); @@ -1214,7 +1223,8 @@ __wt_block_extlist_write( /* Write the extent list to disk. */ WT_ERR(__wt_block_write_off( - session, block, tmp, &el->offset, &el->size, &el->checksum, true, true, true)); + session, block, tmp, &logid, &el->offset, &el->size, &el->checksum, true, true, true)); + WT_UNUSED(logid); /* TODO check */ /* * Remove the allocated blocks from the system's allocation list, extent blocks never appear on diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index d97f3a86f7d..5eba665ea8d 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -33,6 +33,8 @@ __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename, uint32 int suffix; bool exists; + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + /* * Create the underlying file and open a handle. * @@ -46,8 +48,6 @@ __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename, uint32 break; WT_ERR_TEST(ret != EEXIST, ret, false); - if (tmp == NULL) - WT_ERR(__wt_scr_alloc(session, 0, &tmp)); for (suffix = 1;; ++suffix) { WT_ERR(__wt_buf_fmt(session, tmp, "%s.%d", filename, suffix)); WT_ERR(__wt_fs_exist(session, tmp->data, &exists)); @@ -91,6 +91,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_CONNECTION_IMPL *conn; WT_DECL_RET; uint64_t bucket; + u_int i; conn = S2C(session); bucket = block->name_hash & (conn->hash_size - 1); @@ -98,6 +99,12 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) __wt_free(session, block->name); + if (block->log_structured && block->lfh != NULL) { + for (i = 0; i < block->max_logid; i++) + WT_TRET(__wt_close(session, &block->lfh[i])); + __wt_free(session, block->lfh); + } + if (block->fh != NULL) WT_TRET(__wt_close(session, &block->fh)); @@ -175,6 +182,7 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[ WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval)); block->allocfirst = WT_STRING_MATCH("first", cval.str, cval.len); + block->log_structured = WT_STRING_MATCH("log-structured", cval.str, cval.len); /* Configuration: optional OS buffer cache maximum size. */ WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval)); @@ -203,7 +211,8 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[ LF_SET(WT_FS_OPEN_DIRECTIO); if (!readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_DATA)) LF_SET(WT_FS_OPEN_DIRECTIO); - WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, flags, &block->fh)); + block->file_flags = flags; + WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, block->file_flags, &block->fh)); /* Set the file's size. */ WT_ERR(__wt_filesize(session, block->fh, &block->size)); diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c index 8680f1f90f8..c5e3a1c193c 100644 --- a/src/third_party/wiredtiger/src/block/block_read.c +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -18,9 +18,10 @@ __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t WT_BLOCK *block; WT_DECL_ITEM(tmp); WT_DECL_RET; + WT_FH *fh; WT_FILE_HANDLE *handle; wt_off_t offset; - uint32_t checksum, size; + uint32_t checksum, logid, size; bool mapped; block = bm->block; @@ -28,9 +29,10 @@ __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t WT_STAT_CONN_INCR(session, block_preload); /* Crack the cookie. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum)); - handle = block->fh->handle; + WT_RET(__wt_block_fh(session, block, logid, &fh)); + handle = fh->handle; mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen; if (mapped && handle->fh_map_preload != NULL) ret = handle->fh_map_preload( @@ -59,21 +61,23 @@ __wt_bm_read( { WT_BLOCK *block; WT_DECL_RET; + WT_FH *fh; WT_FILE_HANDLE *handle; wt_off_t offset; - uint32_t checksum, size; + uint32_t checksum, logid, size; bool mapped; WT_UNUSED(addr_size); block = bm->block; /* Crack the cookie. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum)); /* * Map the block if it's possible. */ - handle = block->fh->handle; + WT_RET(__wt_block_fh(session, block, logid, &fh)); + handle = fh->handle; mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen; if (mapped && handle->fh_map_preload != NULL) { buf->data = (uint8_t *)bm->map + offset; @@ -96,7 +100,7 @@ __wt_bm_read( #endif /* Read the block. */ __wt_capacity_throttle(session, size, WT_THROTTLE_READ); - WT_RET(__wt_block_read_off(session, block, buf, offset, size, checksum)); + WT_RET(__wt_block_read_off(session, block, buf, logid, offset, size, checksum)); /* Optionally discard blocks from the system's buffer cache. */ WT_RET(__wt_block_discard(session, block, (size_t)size)); @@ -109,17 +113,17 @@ __wt_bm_read( * Dump a block into the log in 1KB chunks. */ static int -__wt_bm_corrupt_dump(WT_SESSION_IMPL *session, WT_ITEM *buf, wt_off_t offset, uint32_t size, - uint32_t checksum) WT_GCC_FUNC_ATTRIBUTE((cold)) +__wt_bm_corrupt_dump(WT_SESSION_IMPL *session, WT_ITEM *buf, uint32_t logid, wt_off_t offset, + uint32_t size, uint32_t checksum) WT_GCC_FUNC_ATTRIBUTE((cold)) { WT_DECL_ITEM(tmp); WT_DECL_RET; size_t chunk, i, nchunks; -#define WT_CORRUPT_FMT "{%" PRIuMAX ", %" PRIu32 ", %#" PRIx32 "}" +#define WT_CORRUPT_FMT "{%" PRIu32 ": %" PRIuMAX ", %" PRIu32 ", %#" PRIx32 "}" if (buf->size == 0) { - __wt_errx(session, WT_CORRUPT_FMT ": empty buffer, no dump available", (uintmax_t)offset, - size, checksum); + __wt_errx(session, WT_CORRUPT_FMT ": empty buffer, no dump available", logid, + (uintmax_t)offset, size, checksum); return (0); } @@ -130,7 +134,7 @@ __wt_bm_corrupt_dump(WT_SESSION_IMPL *session, WT_ITEM *buf, wt_off_t offset, ui WT_ERR(__wt_buf_catfmt(session, tmp, "%02x ", ((uint8_t *)buf->data)[i])); if (++i == buf->size || i % 1024 == 0) { __wt_errx(session, - WT_CORRUPT_FMT ": (chunk %" WT_SIZET_FMT " of %" WT_SIZET_FMT "): %.*s", + WT_CORRUPT_FMT ": (chunk %" WT_SIZET_FMT " of %" WT_SIZET_FMT "): %.*s", logid, (uintmax_t)offset, size, checksum, ++chunk, nchunks, (int)tmp->size, (char *)tmp->data); if (i == buf->size) @@ -154,15 +158,15 @@ __wt_bm_corrupt(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t WT_DECL_ITEM(tmp); WT_DECL_RET; wt_off_t offset; - uint32_t checksum, size; + uint32_t checksum, logid, size; /* Read the block. */ WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__wt_bm_read(bm, session, tmp, addr, addr_size)); /* Crack the cookie, dump the block. */ - WT_ERR(__wt_block_buffer_to_addr(bm->block, addr, &offset, &size, &checksum)); - WT_ERR(__wt_bm_corrupt_dump(session, tmp, offset, size, checksum)); + WT_ERR(__wt_block_buffer_to_addr(bm->block, addr, &logid, &offset, &size, &checksum)); + WT_ERR(__wt_bm_corrupt_dump(session, tmp, logid, offset, size, checksum)); err: __wt_scr_free(session, &tmp); @@ -203,14 +207,60 @@ err: #endif /* + * __wt_block_fh -- + * Get a block file handle. + */ +int +__wt_block_fh(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t logid, WT_FH **fhp) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + const char *filename; + + if (!block->log_structured || logid == block->logid) { + *fhp = block->fh; + return (0); + } + + /* TODO: fh readlock */ + if (logid * sizeof(WT_FILE_HANDLE *) < block->lfh_alloc && (*fhp = block->lfh[logid]) != NULL) + return (0); + + /* TODO: fh writelock */ + /* Ensure the array goes far enough. */ + WT_RET(__wt_realloc_def(session, &block->lfh_alloc, logid + 1, &block->lfh)); + if (logid >= block->max_logid) + block->max_logid = logid + 1; + if ((*fhp = block->lfh[logid]) != NULL) + return (0); + + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + if (logid == 0) + filename = block->name; + else { + WT_ERR(__wt_buf_fmt(session, tmp, "%s.%08" PRIu32, block->name, logid)); + filename = tmp->data; + } + WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, + WT_FS_OPEN_READONLY | block->file_flags, &block->lfh[logid])); + *fhp = block->lfh[logid]; + WT_ASSERT(session, *fhp != NULL); + +err: + __wt_scr_free(session, &tmp); + return (ret); +} + +/* * __wt_block_read_off -- * Read an addr/size pair referenced block into a buffer. */ int -__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, - uint32_t size, uint32_t checksum) +__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t logid, + wt_off_t offset, uint32_t size, uint32_t checksum) { WT_BLOCK_HEADER *blk, swap; + WT_FH *fh; size_t bufsize; __wt_verbose(session, WT_VERB_READ, "off %" PRIuMAX ", size %" PRIu32 ", checksum %#" PRIx32, @@ -243,7 +293,8 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_ block->name, size, block->allocsize); WT_RET(__wt_buf_init(session, buf, bufsize)); - WT_RET(__wt_read(session, block->fh, offset, size, buf->mem)); + WT_RET(__wt_block_fh(session, block, logid, &fh)); + WT_RET(__wt_read(session, fh, offset, size, buf->mem)); buf->size = size; /* @@ -276,7 +327,7 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_ block->name, size, (uintmax_t)offset, swap.checksum, checksum); if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) - WT_IGNORE_RET(__wt_bm_corrupt_dump(session, buf, offset, size, checksum)); + WT_IGNORE_RET(__wt_bm_corrupt_dump(session, buf, logid, offset, size, checksum)); /* Panic if a checksum fails during an ordinary read. */ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c index 8a632e892b4..857f15d7848 100644 --- a/src/third_party/wiredtiger/src/block/block_slvg.c +++ b/src/third_party/wiredtiger/src/block/block_slvg.c @@ -104,11 +104,14 @@ __wt_block_salvage_next( WT_DECL_RET; WT_FH *fh; wt_off_t max, offset; - uint32_t allocsize, checksum, size; + uint32_t allocsize, checksum, logid, size; uint8_t *endp; *eofp = 0; + /* FIXME: salvage across all chunks in a log-structured tree. */ + logid = 0; + fh = block->fh; allocsize = block->allocsize; WT_ERR(__wt_scr_alloc(session, allocsize, &tmp)); @@ -137,19 +140,19 @@ __wt_block_salvage_next( * otherwise, move past it. */ if (!__wt_block_offset_invalid(block, offset, size) && - __wt_block_read_off(session, block, tmp, offset, size, checksum) == 0) + __wt_block_read_off(session, block, tmp, logid, offset, size, checksum) == 0) break; /* Free the allocation-size block. */ __wt_verbose(session, WT_VERB_SALVAGE, "skipping %" PRIu32 "B at file offset %" PRIuMAX, allocsize, (uintmax_t)offset); - WT_ERR(__wt_block_off_free(session, block, offset, (wt_off_t)allocsize)); + WT_ERR(__wt_block_off_free(session, block, logid, offset, (wt_off_t)allocsize)); block->slvg_off += allocsize; } /* Re-create the address cookie that should reference this block. */ endp = addr; - WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum)); + WT_ERR(__wt_block_addr_to_buffer(block, &endp, logid, offset, size, checksum)); *addr_sizep = WT_PTRDIFF(endp, addr); done: @@ -167,7 +170,7 @@ __wt_block_salvage_valid( WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, bool valid) { wt_off_t offset; - uint32_t size, checksum; + uint32_t size, logid, checksum; WT_UNUSED(addr_size); @@ -175,11 +178,11 @@ __wt_block_salvage_valid( * Crack the cookie. If the upper layer took the block, move past it; if the upper layer * rejected the block, move past an allocation size chunk and free it. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum)); if (valid) block->slvg_off = offset + size; else { - WT_RET(__wt_block_off_free(session, block, offset, (wt_off_t)block->allocsize)); + WT_RET(__wt_block_off_free(session, block, logid, offset, (wt_off_t)block->allocsize)); block->slvg_off = offset + block->allocsize; } diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c index 57bba04e5da..0a434aebfce 100644 --- a/src/third_party/wiredtiger/src/block/block_vrfy.c +++ b/src/third_party/wiredtiger/src/block/block_vrfy.c @@ -316,12 +316,12 @@ __wt_block_verify_addr( WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size) { wt_off_t offset; - uint32_t checksum, size; + uint32_t checksum, logid, size; WT_UNUSED(addr_size); /* Crack the cookie. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &logid, &offset, &size, &checksum)); /* Add to the per-file list. */ WT_RET(__verify_filefrag_add(session, block, NULL, offset, size, false)); diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c index c5890efd038..bafee786c5b 100644 --- a/src/third_party/wiredtiger/src/block/block_write.c +++ b/src/third_party/wiredtiger/src/block/block_write.c @@ -189,14 +189,14 @@ __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_ size_t *addr_sizep, bool data_checksum, bool checkpoint_io) { wt_off_t offset; - uint32_t checksum, size; + uint32_t checksum, logid, size; uint8_t *endp; WT_RET(__wt_block_write_off( - session, block, buf, &offset, &size, &checksum, data_checksum, checkpoint_io, false)); + session, block, buf, &logid, &offset, &size, &checksum, data_checksum, checkpoint_io, false)); endp = addr; - WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum)); + WT_RET(__wt_block_addr_to_buffer(block, &endp, logid, offset, size, checksum)); *addr_sizep = WT_PTRDIFF(endp, addr); return (0); @@ -207,15 +207,16 @@ __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_ * Write a buffer into a block, returning the block's offset, size and checksum. */ static int -__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, - uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io, bool caller_locked) +__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t *logidp, + wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io, + bool caller_locked) { WT_BLOCK_HEADER *blk; WT_DECL_RET; WT_FH *fh; wt_off_t offset; size_t align_size; - uint32_t checksum; + uint32_t checksum, logid; uint8_t *file_sizep; bool local_locked; @@ -224,6 +225,7 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of *checksump = 0; /* -Werror=maybe-uninitialized */ fh = block->fh; + logid = block->logid; /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { @@ -325,7 +327,7 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!caller_locked) __wt_spin_lock(session, &block->live_lock); - WT_TRET(__wt_block_off_free(session, block, offset, (wt_off_t)align_size)); + WT_TRET(__wt_block_off_free(session, block, logid, offset, (wt_off_t)align_size)); if (!caller_locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); @@ -359,6 +361,7 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of __wt_verbose(session, WT_VERB_WRITE, "off %" PRIuMAX ", size %" PRIuMAX ", checksum %#" PRIx32, (uintmax_t)offset, (uintmax_t)align_size, checksum); + *logidp = logid; *offsetp = offset; *sizep = WT_STORE_SIZE(align_size); *checksump = checksum; @@ -371,8 +374,9 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of * Write a buffer into a block, returning the block's offset, size and checksum. */ int -__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, - uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io, bool caller_locked) +__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t *logidp, + wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io, + bool caller_locked) { WT_DECL_RET; @@ -382,8 +386,8 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt * never see anything other than their original content. */ __wt_page_header_byteswap(buf->mem); - ret = __block_write_off( - session, block, buf, offsetp, sizep, checksump, data_checksum, checkpoint_io, caller_locked); + ret = __block_write_off(session, block, buf, logidp, offsetp, sizep, checksump, data_checksum, + checkpoint_io, caller_locked); __wt_page_header_byteswap(buf->mem); return (ret); } |