diff options
author | Luke Chen <luke.chen@mongodb.com> | 2022-01-28 14:46:51 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-01-28 04:15:17 +0000 |
commit | cf62ec3e57c8be293de4ed04898f31b38fb2dd38 (patch) | |
tree | f5d191f5ced7f73a31c3555099ed72630eb8ddb0 | |
parent | 72b2405abfb1fd74f7f0fa715dc401d5caefdd0d (diff) | |
download | mongo-cf62ec3e57c8be293de4ed04898f31b38fb2dd38.tar.gz |
Import wiredtiger: b9e7765c161ebeaf277842933a5e4909a9f84ac1 from branch mongodb-master
ref: 4f8ac804b8..b9e7765c16
for: 5.3.0
WT-7872 Replace tiered storage use of WT_FH handles with WT_BLOCK handles
27 files changed, 377 insertions, 465 deletions
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 91ca9a079e9..730ddca0893 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -10,13 +10,13 @@ src/block/block_open.c src/block/block_read.c src/block/block_session.c src/block/block_slvg.c -src/block/block_tiered.c src/block/block_vrfy.c src/block/block_write.c src/block_cache/block_cache.c src/block_cache/block_io.c src/block_cache/block_map.c src/block_cache/block_mgr.c +src/block_cache/block_tier.c src/bloom/bloom.c src/btree/bt_compact.c src/btree/bt_curnext.c @@ -215,7 +215,6 @@ src/support/timestamp.c src/support/update_vector.c src/tiered/tiered_config.c src/tiered/tiered_handle.c -src/tiered/tiered_opener.c src/tiered/tiered_work.c src/txn/txn.c src/txn/txn_ckpt.c diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index a5b7fe63777..97a94770b68 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "4f8ac804b8ad14c223331c399810959f6c922ec1" + "commit": "b9e7765c161ebeaf277842933a5e4909a9f84ac1" } diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c index 54e2eb71d4e..8fef633de2a 100644 --- a/src/third_party/wiredtiger/src/block/block_addr.c +++ b/src/third_party/wiredtiger/src/block/block_addr.c @@ -113,7 +113,7 @@ __wt_block_addr_pack(WT_BLOCK *block, uint8_t **pp, uint32_t objectid, wt_off_t * starting with a single object with no object IDs, where all future objects in the stack know * a missing object ID is a reference to the base object. */ - if (i != 0 && block->has_objects) { + if (i != WT_TIERED_OBJECTID_NONE) { **pp = WT_BLOCK_COOKIE_FILEID; ++(*pp); WT_RET(__wt_vpack_uint(pp, 0, i)); diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index addcd9b447f..c6c4f391dec 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -63,11 +63,11 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint ci = &_ci; WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint")); } else { -/* - * We depend on the btree level for locking: things will go bad fast if we open the live system in - * two handles, or salvage, truncate or verify the live/running file. - */ #ifdef HAVE_DIAGNOSTIC + /* + * We depend on the btree level for locking: things will go bad fast if we open the live + * system in two handles, or salvage, truncate or verify the live/running file. + */ __wt_spin_lock(session, &block->live_lock); WT_ASSERT(session, block->live_open == false); block->live_open = true; @@ -113,7 +113,7 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint * the end of the file, that was done when the checkpoint was first written (re-writing the * checkpoint might possibly make it relevant here, but it's unlikely enough I don't bother). */ - if (!checkpoint && !block->has_objects) + if (!checkpoint && WT_BLOCK_ISLOCAL(block)) WT_ERR(__wt_block_truncate(session, block, ci->file_size)); if (0) { @@ -200,7 +200,7 @@ __wt_block_checkpoint_start(WT_SESSION_IMPL *session, WT_BLOCK *block) "%s: an unexpected checkpoint start: the checkpoint has already started or was " "configured for salvage", block->name); - __wt_block_set_readonly(session); + __wt_blkcache_set_readonly(session); break; case WT_CKPT_NONE: block->ckpt_state = WT_CKPT_INPROGRESS; @@ -513,7 +513,7 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) "%s: an unexpected checkpoint attempt: the checkpoint was never started or has already " "completed", block->name); - __wt_block_set_readonly(session); + __wt_blkcache_set_readonly(session); break; case WT_CKPT_SALVAGE: /* Salvage doesn't use the standard checkpoint APIs. */ @@ -610,7 +610,8 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) * lists, and the freed blocks will then be included when writing the live extent lists. */ WT_CKPT_FOREACH (ckptbase, ckpt) { - if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE) || block->has_objects) + if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE) || + !WT_BLOCK_ISLOCAL(block)) continue; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) @@ -762,7 +763,7 @@ live_update: err: if (ret != 0 && fatal) { ret = __wt_panic(session, ret, "%s: fatal checkpoint failure", block->name); - __wt_block_set_readonly(session); + __wt_blkcache_set_readonly(session); } if (locked) @@ -919,14 +920,14 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block, bool fa "%s: an unexpected checkpoint resolution: the checkpoint was never started or completed, " "or configured for salvage", block->name); - __wt_block_set_readonly(session); + __wt_blkcache_set_readonly(session); break; case WT_CKPT_PANIC_ON_FAILURE: if (!failed) break; ret = __wt_panic( session, EINVAL, "%s: the checkpoint failed, the system must restart", block->name); - __wt_block_set_readonly(session); + __wt_blkcache_set_readonly(session); break; } WT_ERR(ret); @@ -934,7 +935,7 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block, bool fa if ((ret = __wt_block_extlist_merge(session, block, &ci->ckpt_avail, &ci->avail)) != 0) { ret = __wt_panic( session, ret, "%s: fatal checkpoint failure during extent list merge", block->name); - __wt_block_set_readonly(session); + __wt_blkcache_set_readonly(session); } __wt_spin_unlock(session, &block->live_lock); diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 8ba9d559695..2ea6f806d84 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -82,33 +82,31 @@ err: } /* - * __block_destroy -- - * Destroy a block handle. + * __wt_block_close -- + * Close a block handle. */ -static int -__block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) +int +__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; - uint64_t bucket; - u_int i; + uint64_t bucket, hash; conn = S2C(session); - bucket = block->name_hash & (conn->hash_size - 1); + + __wt_verbose(session, WT_VERB_BLOCK, "close: %s", block->name == NULL ? "" : block->name); + + hash = __wt_hash_city64(block->name, strlen(block->name)); + bucket = hash & (conn->hash_size - 1); WT_CONN_BLOCK_REMOVE(conn, block, bucket); __wt_free(session, block->name); + __wt_free(session, block->related); - if (block->has_objects && block->ofh != NULL) { - for (i = 0; i < block->max_objectid; i++) - WT_TRET(__wt_close(session, &block->ofh[i])); - __wt_free(session, block->ofh); - } - - if (block->fh != NULL) - WT_TRET(__wt_close(session, &block->fh)); + WT_TRET(__wt_close(session, &block->fh)); __wt_spin_destroy(session, &block->live_lock); + __wt_block_ckpt_destroy(session, &block->live); __wt_overwrite_and_free(session, block); @@ -138,8 +136,9 @@ __wt_block_configure_first_fit(WT_BLOCK *block, bool on) * Open a block handle. */ int -__wt_block_open(WT_SESSION_IMPL *session, const char *filename, WT_BLOCK_FILE_OPENER *opener, - const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BLOCK **blockp) +__wt_block_open(WT_SESSION_IMPL *session, const char *filename, uint32_t objectid, + const char *cfg[], bool forced_salvage, bool readonly, bool fixed, uint32_t allocsize, + WT_BLOCK **blockp) { WT_BLOCK *block; WT_CONFIG_ITEM cval; @@ -148,22 +147,23 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, WT_BLOCK_FILE_OP uint64_t bucket, hash; uint32_t flags; - *blockp = block = NULL; + *blockp = NULL; __wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename); conn = S2C(session); + + /* Block objects can be shared (although there can be only one writer). */ hash = __wt_hash_city64(filename, strlen(filename)); bucket = hash & (conn->hash_size - 1); __wt_spin_lock(session, &conn->block_lock); - TAILQ_FOREACH (block, &conn->blockhash[bucket], hashq) { - if (strcmp(filename, block->name) == 0) { + TAILQ_FOREACH (block, &conn->blockhash[bucket], hashq) + if (block->objectid == objectid && strcmp(filename, block->name) == 0) { ++block->ref; *blockp = block; __wt_spin_unlock(session, &conn->block_lock); return (0); } - } /* * Basic structure allocation, initialization. @@ -172,20 +172,21 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, WT_BLOCK_FILE_OP * block destroy code which uses that hash value to remove the block from the underlying linked * lists. */ - WT_ERR(__wt_calloc_one(session, &block)); - block->ref = 1; - block->name_hash = hash; - block->allocsize = allocsize; - block->opener = opener; + WT_RET(__wt_calloc_one(session, &block)); WT_CONN_BLOCK_INSERT(conn, block, bucket); - WT_ERR(__wt_strdup(session, filename, &block->name)); + block->objectid = objectid; + block->ref = 1; + + /* If not passed an allocation size, get one from the configuration. */ + if (allocsize == 0) { + WT_ERR(__wt_config_gets(session, cfg, "allocation_size", &cval)); + allocsize = (uint32_t)cval.val; + } + block->allocsize = allocsize; WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval)); block->allocfirst = WT_STRING_MATCH("first", cval.str, cval.len); - block->has_objects = (opener != NULL); - if (block->has_objects) - block->objectid = opener->current_object_id(opener); /* Configuration: optional OS buffer cache maximum size. */ WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval)); @@ -210,17 +211,19 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, WT_BLOCK_FILE_OP else if (WT_STRING_MATCH("sequential", cval.str, cval.len)) LF_SET(WT_FS_OPEN_ACCESS_SEQ); + if (fixed) + LF_SET(WT_FS_OPEN_FIXED); if (readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_CHECKPOINT)) LF_SET(WT_FS_OPEN_DIRECTIO); if (!readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_DATA)) LF_SET(WT_FS_OPEN_DIRECTIO); - block->file_flags = flags; - if (block->has_objects) - WT_ERR(opener->open(opener, session, WT_TIERED_CURRENT_ID, WT_FS_OPEN_FILE_TYPE_DATA, - block->file_flags, &block->fh)); - else - WT_ERR( - __wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, block->file_flags, &block->fh)); + /* + * Tiered storage sets file permissions to readonly, but nobody else does. This flag means the + * underlying file is read-only, and NOT that the handle access pattern is read-only. + */ + if (readonly) + LF_SET(WT_FS_OPEN_READONLY); + WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, flags, &block->fh)); /* Set the file's size. */ WT_ERR(__wt_filesize(session, block->fh, &block->size)); @@ -244,41 +247,14 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, WT_BLOCK_FILE_OP if (!forced_salvage) WT_ERR(__desc_read(session, allocsize, block)); - *blockp = block; __wt_spin_unlock(session, &conn->block_lock); + + *blockp = block; return (0); err: - if (block != NULL) - WT_TRET(__block_destroy(session, block)); - __wt_spin_unlock(session, &conn->block_lock); - return (ret); -} - -/* - * __wt_block_close -- - * Close a block handle. - */ -int -__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) -{ - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - - if (block == NULL) /* Safety check */ - return (0); - - conn = S2C(session); - - __wt_verbose(session, WT_VERB_BLOCK, "close: %s", block->name == NULL ? "" : block->name); - - __wt_spin_lock(session, &conn->block_lock); - - /* Reference count is initialized to 1. */ - if (block->ref == 0 || --block->ref == 0) - ret = __block_destroy(session, block); - __wt_spin_unlock(session, &conn->block_lock); + WT_TRET(__wt_block_close(session, block)); return (ret); } diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c index 4030c19708b..d6db171e338 100644 --- a/src/third_party/wiredtiger/src/block/block_read.c +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -28,19 +28,13 @@ __wt_bm_read( #ifdef HAVE_DIAGNOSTIC /* - * In diagnostic mode, verify the user isn't trying to import a tiered object as a regular table - * file. - */ - if (objectid != 0 && !WT_PREFIX_MATCH(block->name, "tiered:") && - !WT_SUFFIX_MATCH(block->name, ".wtobj")) - WT_RET_MSG(session, ENOTSUP, "%s: is a tiered object", block->name); - /* * In diagnostic mode, verify the block we're about to read isn't on the available list, or for * live systems, the discard list. */ WT_RET(__wt_block_misplaced( session, block, "read", offset, size, bm->is_live, __PRETTY_FUNCTION__, __LINE__)); #endif + /* Read the block. */ __wt_capacity_throttle(session, size, WT_THROTTLE_READ); WT_RET(__wt_block_read_off(session, block, buf, objectid, offset, size, checksum)); @@ -159,7 +153,6 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uin wt_off_t offset, uint32_t size, uint32_t checksum) { WT_BLOCK_HEADER *blk, swap; - WT_FH *fh; size_t bufsize; __wt_verbose(session, WT_VERB_READ, "off %" PRIuMAX ", size %" PRIu32 ", checksum %#" PRIx32, @@ -168,6 +161,10 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uin WT_STAT_CONN_INCR(session, block_read); WT_STAT_CONN_INCRV(session, block_byte_read, size); + /* Swap file handles if reading from a different object. */ + if (block->objectid != objectid) + WT_RET(__wt_blkcache_get_handle(session, block, objectid, &block)); + /* * Grow the buffer as necessary and read the block. Buffers should be aligned for reading, but * there are lots of buffers (for example, file cursors have two buffers each, key and value), @@ -192,10 +189,10 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uin block->name, size, block->allocsize); WT_RET(__wt_buf_init(session, buf, bufsize)); - WT_RET(__wt_block_fh(session, block, objectid, &fh)); - WT_RET(__wt_read(session, fh, offset, size, buf->mem)); buf->size = size; + WT_RET(__wt_read(session, block->fh, offset, size, buf->mem)); + /* * We incrementally read through the structure before doing a checksum, do little- to big-endian * handling early on, and then select from the original or swapped structure as needed. diff --git a/src/third_party/wiredtiger/src/block/block_tiered.c b/src/third_party/wiredtiger/src/block/block_tiered.c deleted file mode 100644 index d912b058d30..00000000000 --- a/src/third_party/wiredtiger/src/block/block_tiered.c +++ /dev/null @@ -1,106 +0,0 @@ -/*- - * Copyright (c) 2014-present MongoDB, Inc. - * Copyright (c) 2008-2014 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -/* - * __block_switch_writeable -- - * Switch a new writeable object. - */ -static int -__block_switch_writeable(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t object_id) -{ - WT_FH *new_fh, *old_fh; - - /* - * FIXME-WT-7470: write lock while opening a new write handle. - * - * The block manager must always have valid file handle since other threads may have concurrent - * requests in flight. - */ - old_fh = block->fh; - WT_RET(block->opener->open( - block->opener, session, object_id, WT_FS_OPEN_FILE_TYPE_DATA, block->file_flags, &new_fh)); - block->fh = new_fh; - block->objectid = object_id; - - return (__wt_close(session, &old_fh)); -} - -/* - * __wt_block_fh -- - * Get a block file handle. - */ -int -__wt_block_fh(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t object_id, WT_FH **fhp) -{ - WT_DECL_RET; - - /* It's the local object if there's no object ID or the object ID matches our own. */ - if (object_id == 0 || object_id == block->objectid) { - *fhp = block->fh; - return (0); - } - - /* - * FIXME-WT-7470: take a read lock to get a handle, and a write lock to open a handle or extend - * the array. - * - * If the object id isn't larger than the array of file handles, see if it's already opened. - */ - if (object_id * sizeof(WT_FILE_HANDLE *) < block->ofh_alloc && - (*fhp = block->ofh[object_id]) != NULL) - return (0); - - /* Ensure the array is big enough. */ - WT_RET(__wt_realloc_def(session, &block->ofh_alloc, object_id + 1, &block->ofh)); - if (object_id >= block->max_objectid) - block->max_objectid = object_id + 1; - if ((*fhp = block->ofh[object_id]) != NULL) - return (0); - - /* - * Fail gracefully if we don't have an opener, or if the opener fails: a release that can't read - * tiered storage blocks might have been pointed at a file that it can read, but that references - * files it doesn't know about, or there may have been some other mismatch. Regardless, we want - * to log a specific error message, we're missing a file. - */ - ret = block->opener->open == NULL ? - WT_NOTFOUND : - block->opener->open(block->opener, session, object_id, WT_FS_OPEN_FILE_TYPE_DATA, - WT_FS_OPEN_READONLY | block->file_flags, &block->ofh[object_id]); - if (ret == 0) { - *fhp = block->ofh[object_id]; - return (0); - } - - WT_RET_MSG(session, ret, - "object %s with ID %" PRIu32 " referenced unknown object with ID %" PRIu32, block->name, - block->objectid, object_id); -} - -/* - * __wt_block_switch_object -- - * Modify an object. - */ -int -__wt_block_switch_object( - WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t object_id, uint32_t flags) -{ - WT_UNUSED(flags); - - /* - * FIXME-WT-7596 the flags argument will be used in the future to perform various tasks, - * to efficiently mark objects in transition (that is during a switch): - * - mark this file as the writeable file (what currently happens) - * - disallow writes to this object (reads still allowed, we're about to switch) - * - close this object (about to move it, don't allow reopens yet) - * - allow opens on this object again - */ - return (__block_switch_writeable(session, block, object_id)); -} diff --git a/src/third_party/wiredtiger/src/block_cache/block_cache.c b/src/third_party/wiredtiger/src/block_cache/block_cache.c index 9228eaebbe1..800c65a7ce9 100644 --- a/src/third_party/wiredtiger/src/block_cache/block_cache.c +++ b/src/third_party/wiredtiger/src/block_cache/block_cache.c @@ -595,11 +595,11 @@ __blkcache_init(WT_SESSION_IMPL *session, size_t cache_size, u_int hash_size, u_ } /* - * __wt_block_cache_destroy -- + * __wt_blkcache_destroy -- * Destroy the block cache and free all memory. */ void -__wt_block_cache_destroy(WT_SESSION_IMPL *session) +__wt_blkcache_destroy(WT_SESSION_IMPL *session) { WT_BLKCACHE *blkcache; WT_BLKCACHE_ITEM *blkcache_item; @@ -701,11 +701,11 @@ __blkcache_reconfig(WT_SESSION_IMPL *session, bool reconfig, size_t cache_size, } /* - * __wt_block_cache_setup -- + * __wt_blkcache_setup -- * Set up the block cache. */ int -__wt_block_cache_setup(WT_SESSION_IMPL *session, const char *cfg[], bool reconfig) +__wt_blkcache_setup(WT_SESSION_IMPL *session, const char *cfg[], bool reconfig) { WT_BLKCACHE *blkcache; WT_CONFIG_ITEM cval; diff --git a/src/third_party/wiredtiger/src/block_cache/block_map.c b/src/third_party/wiredtiger/src/block_cache/block_map.c index 68275d10328..974c1e42565 100644 --- a/src/third_party/wiredtiger/src/block_cache/block_map.c +++ b/src/third_party/wiredtiger/src/block_cache/block_map.c @@ -86,7 +86,6 @@ __wt_blkcache_map_read( { WT_BLOCK *block; WT_BM *bm; - WT_FH *fh; WT_FILE_HANDLE *handle; wt_off_t offset; uint32_t checksum, objectid, size; @@ -95,13 +94,7 @@ __wt_blkcache_map_read( bm = S2BT(session)->bm; - /* - * FIXME WT-7872: The WT_BLOCK.map test is wrong; tiered storage assumes object IDs translate to - * WT_FH structures, not WT_BLOCK structures. When we check if the WT_BLOCK handle references a - * mapped object, that's not going to work as we might be about to switch to a different WT_FH - * handle which may or may not reference a mapped object. - */ - if (!bm->map) + if (!bm->map) /* FIXME WT-8728. */ return (0); block = bm->block; @@ -110,9 +103,12 @@ __wt_blkcache_map_read( WT_RET(__wt_block_addr_unpack( session, block, addr, addr_size, &objectid, &offset, &size, &checksum)); + /* Swap file handles if reading from a different object. */ + if (block->objectid != objectid) + WT_RET(__wt_blkcache_get_handle(session, block, objectid, &block)); + /* Map the block if it's possible. */ - WT_RET(__wt_block_fh(session, block, objectid, &fh)); - handle = fh->handle; + handle = block->fh->handle; if (handle->fh_map_preload != NULL && offset + size <= (wt_off_t)bm->maplen && handle->fh_map_preload( handle, (WT_SESSION *)session, (uint8_t *)bm->map + offset, size, bm->mapped_cookie) == 0) { diff --git a/src/third_party/wiredtiger/src/block_cache/block_mgr.c b/src/third_party/wiredtiger/src/block_cache/block_mgr.c index b41c6cc8f3f..ef480c18942 100644 --- a/src/third_party/wiredtiger/src/block_cache/block_mgr.c +++ b/src/third_party/wiredtiger/src/block_cache/block_mgr.c @@ -183,6 +183,69 @@ __bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_close_block_remove -- + * Remove a single block handle. + */ +static int +__bm_close_block_remove(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + u_int i; + + /* Discard any references we're holding. */ + for (i = 0; i < block->related_next; ++i) { + --block->related[i]->ref; + block->related[i] = NULL; + } + + /* Discard the block structure. */ + return (__wt_block_close(session, block)); +} + +/* + * __bm_close_block -- + * Close a single block handle, removing the handle if it's no longer useful. + */ +static int +__bm_close_block(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + bool found; + + __wt_verbose(session, WT_VERB_BLKCACHE, "close: %s", block->name); + + conn = S2C(session); + + /* You can't close files during a checkpoint. */ + WT_ASSERT( + session, block->ckpt_state == WT_CKPT_NONE || block->ckpt_state == WT_CKPT_PANIC_ON_FAILURE); + + __wt_spin_lock(session, &conn->block_lock); + if (block->ref > 0 && --block->ref > 0) { + __wt_spin_unlock(session, &conn->block_lock); + return (0); + } + + /* + * Every time we remove a block, we may have sufficiently decremented other references to allow + * other blocks to be removed. It's unlikely for blocks to reference each other but it's not out + * of the question, either. Loop until we don't find anything to close. + */ + do { + found = false; + TAILQ_FOREACH (block, &conn->blockqh, q) + if (block->ref == 0) { + found = true; + WT_TRET(__bm_close_block_remove(session, block)); + break; + } + } while (found); + __wt_spin_unlock(session, &conn->block_lock); + + return (ret); +} + +/* * __bm_close -- * Close a file. */ @@ -194,7 +257,7 @@ __bm_close(WT_BM *bm, WT_SESSION_IMPL *session) if (bm == NULL) /* Safety check */ return (0); - ret = __wt_block_close(session, bm->block); + ret = __bm_close_block(session, bm->block); __wt_overwrite_and_free(session, bm); return (ret); @@ -494,22 +557,55 @@ __bm_stat(WT_BM *bm, WT_SESSION_IMPL *session, WT_DSRC_STATS *stats) /* * __bm_switch_object -- - * Modify the tiered object. + * Switch the tiered object. */ static int -__bm_switch_object(WT_BM *bm, WT_SESSION_IMPL *session, uint32_t object_id, uint32_t flags) +__bm_switch_object(WT_BM *bm, WT_SESSION_IMPL *session, uint32_t objectid, uint32_t flags) { - return (__wt_block_switch_object(session, bm->block, object_id, flags)); + WT_BLOCK *block; + + block = bm->block; + + /* Close out our current handle. */ + WT_RET(__bm_close_block(session, block)); + bm->block = NULL; + + /* + * FIXME-WT-7596 the flags argument will be used in the future to perform various tasks, + * to efficiently mark objects in transition (that is during a switch): + * - mark this file as the writeable file (what currently happens) + * - disallow writes to this object (reads still allowed, we're about to switch) + * - close this object (about to move it, don't allow reopens yet) + * - allow opens on this object again + */ + WT_UNUSED(flags); + WT_RET(__wt_blkcache_get_handle(session, NULL, objectid, &block)); + + /* + * KEITH XXX: We need to distinguish between tiered switch and loading a checkpoint. This is + * also discarding the extent list which isn't correct, because we can't know when to discard + * previous files if we don't have the extent list. This fixes the problem where we randomly + * write a new position in the new tiered object, but it's not OK. + */ + WT_RET(__wt_block_ckpt_init(session, &block->live, "live")); + + /* + * This isn't right: the new block handle will reasonably have different methods for objects in + * different backing sources. That's not the case today, but the current architecture lacks the + * ability to support multiple sources cleanly. + */ + bm->block = block; + return (0); } /* * __bm_switch_object_readonly -- - * Modify the tiered object; readonly version. + * Switch the tiered object; readonly version. */ static int -__bm_switch_object_readonly(WT_BM *bm, WT_SESSION_IMPL *session, uint32_t object_id, uint32_t flags) +__bm_switch_object_readonly(WT_BM *bm, WT_SESSION_IMPL *session, uint32_t objectid, uint32_t flags) { - WT_UNUSED(object_id); + WT_UNUSED(objectid); WT_UNUSED(flags); return (__bm_readonly(bm, session)); @@ -685,39 +781,44 @@ __bm_method_set(WT_BM *bm, bool readonly) } /* - * __wt_block_manager_open -- + * __wt_blkcache_open -- * Open a file. */ int -__wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, - WT_BLOCK_FILE_OPENER *opener, const char *cfg[], bool forced_salvage, bool readonly, - uint32_t allocsize, WT_BM **bmp) +__wt_blkcache_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], + bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp) { WT_BM *bm; WT_DECL_RET; *bmp = NULL; + __wt_verbose(session, WT_VERB_BLOCK, "open: %s", uri); + WT_RET(__wt_calloc_one(session, &bm)); __bm_method_set(bm, false); - WT_ERR(__wt_block_open( - session, filename, opener, cfg, forced_salvage, readonly, allocsize, &bm->block)); + if (WT_PREFIX_MATCH(uri, "file:")) { + uri += strlen("file:"); + WT_ERR(__wt_block_open(session, uri, WT_TIERED_OBJECTID_NONE, cfg, forced_salvage, readonly, + false, allocsize, &bm->block)); + } else + WT_ERR(__wt_blkcache_tiered_open(session, uri, 0, &bm->block)); *bmp = bm; return (0); err: - WT_TRET(bm->close(bm, session)); + __wt_free(session, bm); return (ret); } /* - * __wt_block_set_readonly -- + * __wt_blkcache_set_readonly -- * Set the block API to read-only. */ void -__wt_block_set_readonly(WT_SESSION_IMPL *session) WT_GCC_FUNC_ATTRIBUTE((cold)) +__wt_blkcache_set_readonly(WT_SESSION_IMPL *session) WT_GCC_FUNC_ATTRIBUTE((cold)) { /* Switch the handle into read-only mode. */ __bm_method_set(S2BT(session)->bm, true); diff --git a/src/third_party/wiredtiger/src/block_cache/block_tier.c b/src/third_party/wiredtiger/src/block_cache/block_tier.c new file mode 100644 index 00000000000..e96ac9b5787 --- /dev/null +++ b/src/third_party/wiredtiger/src/block_cache/block_tier.c @@ -0,0 +1,137 @@ +/*- + * Copyright (c) 2014-present MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_blkcache_tiered_open -- + * Open a tiered object. + */ +int +__wt_blkcache_tiered_open( + WT_SESSION_IMPL *session, const char *uri, uint32_t objectid, WT_BLOCK **blockp) +{ + WT_BLOCK *block; + WT_BUCKET_STORAGE *bstorage; + WT_CONFIG_ITEM pfx; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_TIERED *tiered; + const char *cfg[2], *object_name, *object_uri, *object_val; + bool exist, local_only, readonly; + + *blockp = NULL; + + tiered = (WT_TIERED *)session->dhandle; + object_uri = object_val = NULL; + + WT_ASSERT(session, objectid <= tiered->current_id); + WT_ASSERT(session, uri == NULL || WT_PREFIX_MATCH(uri, "tiered:")); + WT_ASSERT(session, (uri == NULL && objectid != 0) || (uri != NULL && objectid == 0)); + + /* + * First look for the local file. This will be the fastest access and we retain recent objects + * in the local database for awhile. If we're passed a name to open, then by definition it's a + * local file. + * + * FIXME-WT-7590 we will need some kind of locking while we're looking at the tiered structure. + * This can be called at any time, because we are opening the objects lazily. + */ + if (uri != NULL) + objectid = tiered->current_id; + if (objectid == tiered->current_id) { + local_only = true; + object_uri = tiered->tiers[WT_TIERED_INDEX_LOCAL].name; + object_name = object_uri; + WT_PREFIX_SKIP_REQUIRED(session, object_name, "file:"); + readonly = false; + } else { + local_only = false; + WT_ERR( + __wt_tiered_name(session, &tiered->iface, objectid, WT_TIERED_NAME_OBJECT, &object_uri)); + object_name = object_uri; + WT_PREFIX_SKIP_REQUIRED(session, object_name, "object:"); + readonly = true; + } + + /* Get the object's configuration. */ + WT_ERR(__wt_metadata_search(session, object_uri, (char **)&object_val)); + cfg[0] = object_val; + cfg[1] = NULL; + + /* Check if the object exists. */ + exist = true; + if (!local_only) + WT_ERR(__wt_fs_exist(session, object_name, &exist)); + if (exist) + WT_ERR( + __wt_block_open(session, object_name, objectid, cfg, false, readonly, false, 0, &block)); + else { + /* We expect a prefix. */ + WT_ERR(__wt_config_gets(session, cfg, "tiered_storage.bucket_prefix", &pfx)); + WT_ASSERT(session, pfx.len != 0); + + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_buf_fmt(session, tmp, "%.*s%s", (int)pfx.len, pfx.str, object_name)); + + bstorage = tiered->bstorage; + WT_WITH_BUCKET_STORAGE(bstorage, session, + ret = __wt_block_open(session, tmp->mem, objectid, cfg, false, true, true, 0, &block)); + WT_ERR(ret); + } + + *blockp = block; + +err: + if (!local_only) + __wt_free(session, object_uri); + __wt_free(session, object_val); + __wt_scr_free(session, &tmp); + return (ret); +} + +/* + * __wt_blkcache_get_handle -- + * Get a block handle for an object, creating it if it doesn't exist, optionally cache a + * reference. + */ +int +__wt_blkcache_get_handle( + WT_SESSION_IMPL *session, WT_BLOCK *orig, uint32_t objectid, WT_BLOCK **blockp) +{ + u_int i; + + *blockp = NULL; + + /* We should never be looking for our own object. */ + WT_ASSERT(session, orig == NULL || orig->objectid != objectid); + + /* + * Check the local cache for the object. We don't have to check the name because we can only + * reference objects in our name space. + */ + if (orig != NULL) { + for (i = 0; i < orig->related_next; ++i) + if (orig->related[i]->objectid == objectid) { + *blockp = orig->related[i]; + return (0); + } + + /* Allocate space to store a reference (do first for less complicated cleanup). */ + WT_RET(__wt_realloc_def( + session, &orig->related_allocated, orig->related_next + 1, &orig->related)); + } + + /* Get a reference to the object, opening it as necessary. */ + WT_RET(__wt_blkcache_tiered_open(session, NULL, objectid, blockp)); + + /* Save a reference in the block in which we started for fast subsequent access. */ + if (orig != NULL) + orig->related[orig->related_next++] = *blockp; + return (0); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 93d2bba4701..8487e2514f1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -57,7 +57,6 @@ __btree_clear(WT_SESSION_IMPL *session) int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) { - WT_BLOCK_FILE_OPENER *opener; WT_BM *bm; WT_BTREE *btree; WT_CKPT ckpt; @@ -67,7 +66,6 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_DECL_RET; size_t root_addr_size; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; - const char *filename; bool creation, forced_salvage; btree = S2BT(session); @@ -111,17 +109,9 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) /* Initialize and configure the WT_BTREE structure. */ WT_ERR(__btree_conf(session, &ckpt)); - /* - * Get an opener abstraction that the block manager can use to open any of the files that - * represent a btree. In the case of a tiered Btree, that would allow opening different files - * according to an object id in a reference. For a non-tiered Btree, the opener will know to - * always open a single file (given by the filename). - */ - WT_ERR(__wt_tiered_opener(session, dhandle, &opener, &filename)); - /* Connect to the underlying block manager. */ - WT_ERR(__wt_block_manager_open(session, filename, opener, dhandle->cfg, forced_salvage, - F_ISSET(btree, WT_BTREE_READONLY), btree->allocsize, &btree->bm)); + WT_ERR(__wt_blkcache_open( + session, dhandle->name, dhandle->cfg, forced_salvage, false, btree->allocsize, &btree->bm)); bm = btree->bm; @@ -1024,20 +1014,15 @@ __wt_btree_immediately_durable(WT_SESSION_IMPL *session) * Switch to a writeable object for a tiered btree. */ int -__wt_btree_switch_object(WT_SESSION_IMPL *session, uint32_t object_id, uint32_t flags) +__wt_btree_switch_object(WT_SESSION_IMPL *session, uint32_t objectid, uint32_t flags) { WT_BM *bm; - WT_DECL_RET; - - bm = S2BT(session)->bm; /* * When initially opening a tiered Btree, a tier switch is done internally without the btree * being fully opened. That's okay, the btree will be told later about the current object * number. */ - if (bm != NULL) - ret = bm->switch_object(bm, session, object_id, flags); - - return (ret); + bm = S2BT(session)->bm; + return (bm == NULL ? 0 : bm->switch_object(bm, session, objectid, flags)); } diff --git a/src/third_party/wiredtiger/src/btree/bt_import.c b/src/third_party/wiredtiger/src/btree/bt_import.c index cb60e9aa14a..4a551e63dd0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_import.c +++ b/src/third_party/wiredtiger/src/btree/bt_import.c @@ -24,10 +24,8 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp) WT_DECL_ITEM(checkpoint); WT_DECL_RET; WT_KEYED_ENCRYPTOR *kencryptor; - uint32_t allocsize; char *checkpoint_list, *config, *config_tmp, *metadata, fileid[64]; const char *cfg[] = {WT_CONFIG_BASE(session, file_meta), NULL, NULL, NULL, NULL, NULL, NULL}; - const char *filename; ckptbase = NULL; checkpoint_list = config = config_tmp = metadata = NULL; @@ -37,16 +35,12 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp) WT_ERR(__wt_scr_alloc(session, 1024, &buf)); WT_ERR(__wt_scr_alloc(session, 0, &checkpoint)); - WT_ASSERT(session, WT_PREFIX_MATCH(uri, "file:")); - filename = uri; - WT_PREFIX_SKIP(filename, "file:"); - /* * Open the file, request block manager checkpoint information. We don't know the allocation * size, but 512B allows us to read the descriptor block and that's all we care about. */ F_SET(session, WT_SESSION_IMPORT_REPAIR); - WT_ERR(__wt_block_manager_open(session, filename, NULL, cfg, false, true, 512, &bm)); + WT_ERR(__wt_blkcache_open(session, uri, cfg, false, true, 512, &bm)); ret = bm->checkpoint_last(bm, session, &metadata, &checkpoint_list, checkpoint); WT_TRET(bm->close(bm, session)); F_CLR(session, WT_SESSION_IMPORT_REPAIR); @@ -63,7 +57,7 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp) WT_ERR_MSG(session, EINVAL, "%s: loaded object's encryption configuration doesn't match the database's encryption " "configuration", - filename); + uri); /* * The metadata was quoted to avoid configuration string characters acting as separators. * Discard any quote characters. @@ -109,16 +103,12 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp) cfg[5] = fileid; WT_ERR(__wt_config_collapse(session, cfg, &config_tmp)); - /* Now that we've retrieved the configuration, let's get the real allocation size. */ - WT_ERR(__wt_config_getones(session, config_tmp, "allocation_size", &v)); - allocsize = (uint32_t)v.val; - /* * Now we need to retrieve the last checkpoint again but this time, with the correct allocation * size. When we did this earlier, we were able to read the descriptor block properly but the * checkpoint's byte representation was wrong because it was using the wrong allocation size. */ - WT_ERR(__wt_block_manager_open(session, filename, NULL, cfg, false, true, allocsize, &bm)); + WT_ERR(__wt_blkcache_open(session, uri, cfg, false, true, 0, &bm)); __wt_free(session, checkpoint_list); __wt_free(session, metadata); ret = bm->checkpoint_last(bm, session, &metadata, &checkpoint_list, checkpoint); diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index ced91d8b59f..65e9cd3b35e 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -2788,7 +2788,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c WT_ERR(__wt_json_config(session, cfg, false)); WT_ERR(__wt_verbose_config(session, cfg, false)); WT_ERR(__wt_timing_stress_config(session, cfg)); - WT_ERR(__wt_block_cache_setup(session, cfg, false)); + WT_ERR(__wt_blkcache_setup(session, cfg, false)); WT_ERR(__wt_conn_optrack_setup(session, cfg, false)); WT_ERR(__conn_session_size(session, cfg, &conn->session_size)); WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval)); diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 75ef3c4a1d4..5b89817c9b6 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -113,7 +113,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) WT_TRET(__wt_meta_track_destroy(session)); /* Shut down the block cache */ - __wt_block_cache_destroy(session); + __wt_blkcache_destroy(session); /* * Now that all data handles are closed, tell logging that a checkpoint has completed then shut diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c index 57a85d7d515..14378f1d78e 100644 --- a/src/third_party/wiredtiger/src/conn/conn_reconfig.c +++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c @@ -405,7 +405,7 @@ __wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg) */ WT_WITH_CHECKPOINT_LOCK(session, ret = __wt_conn_compat_config(session, cfg, true)); WT_ERR(ret); - WT_ERR(__wt_block_cache_setup(session, cfg, true)); + WT_ERR(__wt_blkcache_setup(session, cfg, true)); WT_ERR(__wt_conn_optrack_setup(session, cfg, true)); WT_ERR(__wt_conn_statistics_config(session, cfg)); WT_ERR(__wt_cache_config(session, cfg, true)); diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h index 9dac14a7a05..f823c3b3021 100644 --- a/src/third_party/wiredtiger/src/include/block.h +++ b/src/third_party/wiredtiger/src/include/block.h @@ -16,6 +16,8 @@ */ #define WT_BLOCK_INVALID_OFFSET 0 +#define WT_BLOCK_ISLOCAL(block) ((block)->objectid == WT_TIERED_OBJECTID_NONE) + /* * The block manager maintains three per-checkpoint extent lists: * alloc: the extents allocated in this checkpoint @@ -222,15 +224,17 @@ struct __wt_bm { * Block manager handle, references a single file. */ struct __wt_block { - const char *name; /* Name */ - uint64_t name_hash; /* Hash of name */ - WT_BLOCK_FILE_OPENER *opener; /* how to open files/objects */ + const char *name; /* Name */ + uint32_t objectid; /* Object id */ + uint32_t ref; /* References */ - /* A list of block manager handles, sharing a file descriptor. */ - uint32_t ref; /* References */ TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */ TAILQ_ENTRY(__wt_block) hashq; /* Hashed list of handles */ + WT_BLOCK **related; /* Related objects */ + size_t related_allocated; /* Size of related object array */ + u_int related_next; /* Next open slot */ + WT_FH *fh; /* Backing file handle */ wt_off_t size; /* File size */ wt_off_t extend_size; /* File extended size */ @@ -247,19 +251,9 @@ struct __wt_block { u_int block_header; /* Header length */ - /* Object file tracking. */ - bool has_objects; /* Address cookies contain object id */ - uint32_t file_flags; /* Flags for opening objects */ - uint32_t objectid; /* Current writeable object id */ - uint32_t max_objectid; /* Size of object handle array */ - WT_FH **ofh; /* Object file handles */ - size_t ofh_alloc; - /* - * There is only a single checkpoint in a file that can be written. The information could - * logically live in the WT_BM structure, but then we would be re-creating it every time we - * opened a new checkpoint and I'd rather not do that. So, it's stored here, only accessed by - * one WT_BM handle. + * There is only a single checkpoint in a file that can be written; stored here, only accessed + * by one WT_BM handle. */ WT_SPINLOCK live_lock; /* Live checkpoint lock */ WT_BLOCK_CKPT live; /* Live checkpoint */ @@ -320,20 +314,6 @@ struct __wt_block_desc { #define WT_BLOCK_DESC_SIZE 16 /* - * WT_BLOCK_FILE_OPENER -- - * An open callback for the block manager. This hides details about how to access the - * different objects that make up a tiered file. - */ -struct __wt_block_file_opener { - /* An id to be used with the open call to reference the current object. */ -#define WT_TIERED_CURRENT_ID UINT32_MAX - int (*open)( - WT_BLOCK_FILE_OPENER *, WT_SESSION_IMPL *, uint32_t, WT_FS_OPEN_FILE_TYPE, u_int, WT_FH **); - uint32_t (*current_object_id)(WT_BLOCK_FILE_OPENER *); - void *cookie; /* Used in open call */ -}; - -/* * __wt_block_desc_byteswap -- * Handle big- and little-endian transformation of a description block. */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 9addc8298f2..570c6fb3b64 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -88,14 +88,23 @@ extern int __wt_backup_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_blkcache_get_handle(WT_SESSION_IMPL *session, WT_BLOCK *orig, uint32_t objectid, + WT_BLOCK **blockp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_blkcache_map(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_regionp, size_t *lengthp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_blkcache_map_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size, bool *foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_blkcache_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], + bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_blkcache_put(WT_SESSION_IMPL *session, WT_ITEM *data, const uint8_t *addr, size_t addr_size, bool write) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_blkcache_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_blkcache_setup(WT_SESSION_IMPL *session, const char *cfg[], bool reconfig) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_blkcache_tiered_open(WT_SESSION_IMPL *session, const char *uri, uint32_t objectid, + WT_BLOCK **blockp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_blkcache_unmap(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_region, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_blkcache_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, @@ -112,8 +121,6 @@ extern int __wt_block_addr_unpack(WT_SESSION_IMPL *session, WT_BLOCK *block, con WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_alloc(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_block_cache_setup(WT_SESSION_IMPL *session, const char *cfg[], bool reconfig) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_checkpoint(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_checkpoint_final(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, @@ -175,8 +182,6 @@ extern int __wt_block_extlist_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_block_fh(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t object_id, WT_FH **fhp) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_insert_ext(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, @@ -187,9 +192,6 @@ extern int __wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filenam WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_manager_named_size(WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, - WT_BLOCK_FILE_OPENER *opener, const char *cfg[], bool forced_salvage, bool readonly, - uint32_t allocsize, WT_BM **bmp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *list, @@ -199,9 +201,9 @@ extern int __wt_block_off_free(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32 wt_off_t offset, wt_off_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_off_remove_overlap(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, - WT_BLOCK_FILE_OPENER *opener, const char *cfg[], bool forced_salvage, bool readonly, - uint32_t allocsize, WT_BLOCK **blockp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, uint32_t objectid, + const char *cfg[], bool forced_salvage, bool readonly, bool fixed, uint32_t allocsize, + WT_BLOCK **blockp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t objectid, wt_off_t offset, uint32_t size, uint32_t checksum) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -217,8 +219,6 @@ extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, u size_t addr_size, bool valid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_block_switch_object(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t object_id, - uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_verify_addr(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, @@ -306,7 +306,7 @@ extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_btree_switch_object(WT_SESSION_IMPL *session, uint32_t object_id, uint32_t flags) +extern int __wt_btree_switch_object(WT_SESSION_IMPL *session, uint32_t objectid, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btree_tree_open(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1483,9 +1483,6 @@ extern int __wt_tiered_name(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, u uint32_t flags, const char **retp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tiered_open(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_tiered_opener(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, - WT_BLOCK_FILE_OPENER **openerp, const char **filenamep) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tiered_put_drop_local(WT_SESSION_IMPL *session, WT_TIERED *tiered, uint32_t id) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tiered_put_drop_shared(WT_SESSION_IMPL *session, WT_TIERED *tiered, uint32_t id) @@ -1676,10 +1673,11 @@ extern void *__wt_ext_scr_alloc(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_backup_destroy(WT_SESSION_IMPL *session); +extern void __wt_blkcache_destroy(WT_SESSION_IMPL *session); extern void __wt_blkcache_get(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_BLKCACHE_ITEM **blkcache_retp, bool *foundp, bool *skip_cache_putp); extern void __wt_blkcache_remove(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); -extern void __wt_block_cache_destroy(WT_SESSION_IMPL *session); +extern void __wt_blkcache_set_readonly(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)); extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci); extern void __wt_block_compact_get_progress_stats(WT_SESSION_IMPL *session, WT_BM *bm, uint64_t *pages_reviewedp, uint64_t *pages_skippedp, uint64_t *pages_rewrittenp); @@ -1688,7 +1686,6 @@ extern void __wt_block_compact_progress( extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on); extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext); extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el); -extern void __wt_block_set_readonly(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)); extern void __wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz); extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats); extern void __wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash); diff --git a/src/third_party/wiredtiger/src/include/os_fs_inline.h b/src/third_party/wiredtiger/src/include/os_fs_inline.h index 2010032a5c2..2276f096312 100644 --- a/src/third_party/wiredtiger/src/include/os_fs_inline.h +++ b/src/third_party/wiredtiger/src/include/os_fs_inline.h @@ -141,7 +141,7 @@ __wt_fs_remove(WT_SESSION_IMPL *session, const char *name, bool durable) * It is a layering violation to retrieve a WT_FH here, but it is a useful diagnostic to ensure * WiredTiger doesn't have the handle open. */ - if (__wt_handle_is_open(session, name) && !F_ISSET(session, WT_SESSION_QUIET_TIERED)) + if (__wt_handle_is_open(session, name)) WT_RET_MSG(session, EINVAL, "%s: file-remove: file has open handles", name); #endif diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 85690b57fbd..b5a7479f763 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -205,11 +205,10 @@ struct __wt_session_impl { #define WT_SESSION_NO_LOGGING 0x01000u #define WT_SESSION_NO_RECONCILE 0x02000u #define WT_SESSION_QUIET_CORRUPT_FILE 0x04000u -#define WT_SESSION_QUIET_TIERED 0x08000u -#define WT_SESSION_READ_WONT_NEED 0x10000u -#define WT_SESSION_RESOLVING_TXN 0x20000u -#define WT_SESSION_ROLLBACK_TO_STABLE 0x40000u -#define WT_SESSION_SCHEMA_TXN 0x80000u +#define WT_SESSION_READ_WONT_NEED 0x08000u +#define WT_SESSION_RESOLVING_TXN 0x10000u +#define WT_SESSION_ROLLBACK_TO_STABLE 0x20000u +#define WT_SESSION_SCHEMA_TXN 0x40000u /* AUTOMATIC FLAG VALUE GENERATION STOP 32 */ uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/tiered.h b/src/third_party/wiredtiger/src/include/tiered.h index 3fe6784c80b..5e3e14006dd 100644 --- a/src/third_party/wiredtiger/src/include/tiered.h +++ b/src/third_party/wiredtiger/src/include/tiered.h @@ -103,6 +103,8 @@ struct __wt_tiered_tiers { uint32_t flags; /* Flags including operations */ }; +#define WT_TIERED_OBJECTID_NONE 0 + /* * WT_TIERED -- * Handle for a tiered data source. This data structure is used as the basis for metadata @@ -124,8 +126,6 @@ struct __wt_tiered { WT_TIERED_TIERS tiers[WT_TIERED_MAX_TIERS]; /* Tiers array */ - WT_BLOCK_FILE_OPENER opener; - uint32_t current_id; /* Current object id number */ uint32_t next_id; /* Next object number */ uint32_t oldest_id; /* Oldest object id number */ diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index f26b72ff7f7..f5871dbf788 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -81,8 +81,6 @@ struct __wt_block_ckpt; typedef struct __wt_block_ckpt WT_BLOCK_CKPT; struct __wt_block_desc; typedef struct __wt_block_desc WT_BLOCK_DESC; -struct __wt_block_file_opener; -typedef struct __wt_block_file_opener WT_BLOCK_FILE_OPENER; struct __wt_block_header; typedef struct __wt_block_header WT_BLOCK_HEADER; struct __wt_block_mods; diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index 3898eb74343..1ae6259e5d8 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -797,16 +797,12 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const cha /* Create/Open the file. */ WT_SYSCALL_RETRY(((pfh->fd = open(name, f, mode)) == -1 ? -1 : 0), ret); - if (ret != 0) { - /* If we don't want error messages, just return the error value. */ - if (F_ISSET(session, WT_SESSION_QUIET_TIERED) && ret == ENOENT) - goto err; + if (ret != 0) WT_ERR_MSG(session, ret, pfh->direct_io ? "%s: handle-open: open: failed with direct I/O configured, some " "filesystem types do not support direct I/O" : "%s: handle-open: open", name); - } #ifdef __linux__ /* diff --git a/src/third_party/wiredtiger/src/tiered/tiered_config.c b/src/third_party/wiredtiger/src/tiered/tiered_config.c index 611ea8323c1..817ef8cc226 100644 --- a/src/third_party/wiredtiger/src/tiered/tiered_config.c +++ b/src/third_party/wiredtiger/src/tiered/tiered_config.c @@ -118,7 +118,7 @@ __wt_tiered_bucket_config( WT_ERR(__wt_strndup(session, auth.str, auth.len, &new->auth_token)); WT_ERR(__wt_strndup(session, bucket.str, bucket.len, &new->bucket)); WT_ERR(__wt_strndup(session, prefix.str, prefix.len, &new->bucket_prefix)); - WT_ERR(__wt_strndup(session, cachedir.str, auth.len, &new->cache_directory)); + WT_ERR(__wt_strndup(session, cachedir.str, cachedir.len, &new->cache_directory)); storage = nstorage->storage_source; if (cachedir.len != 0) diff --git a/src/third_party/wiredtiger/src/tiered/tiered_handle.c b/src/third_party/wiredtiger/src/tiered/tiered_handle.c index 4fd964d30f0..927ab86eb02 100644 --- a/src/third_party/wiredtiger/src/tiered/tiered_handle.c +++ b/src/third_party/wiredtiger/src/tiered/tiered_handle.c @@ -630,7 +630,7 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR_NOTFOUND_OK(ret, true); /* Open tiers if we have them, otherwise initialize. */ - if (tiered->current_id != 0) + if (tiered->current_id != WT_TIERED_OBJECTID_NONE) WT_ERR(__tiered_init_tiers(session, tiered, &tierconf)); else { /* diff --git a/src/third_party/wiredtiger/src/tiered/tiered_opener.c b/src/third_party/wiredtiger/src/tiered/tiered_opener.c deleted file mode 100644 index f786ef30eff..00000000000 --- a/src/third_party/wiredtiger/src/tiered/tiered_opener.c +++ /dev/null @@ -1,131 +0,0 @@ -/*- - * Copyright (c) 2014-present MongoDB, Inc. - * Copyright (c) 2008-2014 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -/* - * __tiered_opener_open -- - * Open an object by number. - */ -static int -__tiered_opener_open(WT_BLOCK_FILE_OPENER *opener, WT_SESSION_IMPL *session, uint32_t object_id, - WT_FS_OPEN_FILE_TYPE type, u_int flags, WT_FH **fhp) -{ - WT_BUCKET_STORAGE *bstorage; - WT_CONFIG_ITEM pfx; - WT_DECL_RET; - WT_TIERED *tiered; - size_t len; - char *tmp; - const char *cfg[2], *object_name, *object_uri, *object_val; - bool local_only; - - tiered = opener->cookie; - object_uri = NULL; - object_val = NULL; - tmp = NULL; - local_only = false; - - WT_ASSERT(session, - (object_id > 0 && object_id <= tiered->current_id) || object_id == WT_TIERED_CURRENT_ID); - /* - * First look for the local file. This will be the fastest access and we retain recent objects - * in the local database for a while. - */ - if (object_id == tiered->current_id || object_id == WT_TIERED_CURRENT_ID) { - bstorage = NULL; - object_name = tiered->tiers[WT_TIERED_INDEX_LOCAL].name; - WT_PREFIX_SKIP_REQUIRED(session, object_name, "file:"); - local_only = true; - } else { - WT_ERR( - __wt_tiered_name(session, &tiered->iface, object_id, WT_TIERED_NAME_OBJECT, &object_uri)); - object_name = object_uri; - WT_PREFIX_SKIP_REQUIRED(session, object_name, "object:"); - LF_SET(WT_FS_OPEN_READONLY); - WT_ASSERT(session, !FLD_ISSET(flags, WT_FS_OPEN_CREATE)); - F_SET(session, WT_SESSION_QUIET_TIERED); - } - ret = __wt_open(session, object_name, type, flags, fhp); - F_CLR(session, WT_SESSION_QUIET_TIERED); - - /* - * FIXME-WT-7590 we will need some kind of locking while we're looking at the tiered structure. - * This can be called at any time, because we are opening the objects lazily. - */ - if (!local_only && ret != 0) { - /* Get the prefix from the object's metadata, not the connection. */ - WT_ERR(__wt_metadata_search(session, object_uri, (char **)&object_val)); - cfg[0] = object_val; - cfg[1] = NULL; - WT_ERR(__wt_config_gets(session, cfg, "tiered_storage.bucket_prefix", &pfx)); - /* We expect a prefix. */ - WT_ASSERT(session, pfx.len != 0); - len = strlen(object_name) + pfx.len + 1; - WT_ERR(__wt_calloc_def(session, len, &tmp)); - WT_ERR(__wt_snprintf(tmp, len, "%.*s%s", (int)pfx.len, pfx.str, object_name)); - bstorage = tiered->bstorage; - LF_SET(WT_FS_OPEN_FIXED | WT_FS_OPEN_READONLY); - WT_WITH_BUCKET_STORAGE( - bstorage, session, { ret = __wt_open(session, tmp, type, flags, fhp); }); - } -err: - __wt_free(session, object_uri); - __wt_free(session, object_val); - __wt_free(session, tmp); - return (ret); -} - -/* - * __tiered_opener_current_id -- - * Get the current writeable object id. - */ -static uint32_t -__tiered_opener_current_id(WT_BLOCK_FILE_OPENER *opener) -{ - WT_TIERED *tiered; - - tiered = opener->cookie; - - /* - * FIXME-WT-7590 we will need some kind of locking while we're looking at the tiered structure. - * This can be called at any time, because we are opening the objects lazily. - */ - return (tiered->current_id); -} - -/* - * __wt_tiered_opener -- - * Set up an opener for a tiered handle. - */ -int -__wt_tiered_opener(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, - WT_BLOCK_FILE_OPENER **openerp, const char **filenamep) -{ - WT_TIERED *tiered; - const char *filename; - - filename = dhandle->name; - *openerp = NULL; - - if (dhandle->type == WT_DHANDLE_TYPE_BTREE) { - if (!WT_PREFIX_SKIP(filename, "file:")) - WT_RET_MSG(session, EINVAL, "expected a 'file:' URI"); - *filenamep = filename; - } else if (dhandle->type == WT_DHANDLE_TYPE_TIERED) { - tiered = (WT_TIERED *)dhandle; - tiered->opener.open = __tiered_opener_open; - tiered->opener.current_object_id = __tiered_opener_current_id; - tiered->opener.cookie = tiered; - *openerp = &tiered->opener; - *filenamep = dhandle->name; - } else - WT_RET_MSG(session, EINVAL, "invalid URI: %s", dhandle->name); - - return (0); -} diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c index 405b79d0721..09e16a969a6 100644 --- a/src/third_party/wiredtiger/src/utilities/util_list.c +++ b/src/third_party/wiredtiger/src/utilities/util_list.c @@ -109,9 +109,6 @@ list_init_block(WT_SESSION *session, const char *key, WT_BLOCK *block) else if (ret != WT_NOTFOUND) WT_ERR(util_err(session, ret, "WT_CONFIG_PARSER.get")); - if (WT_PREFIX_MATCH(key, "tiered:")) - block->has_objects = true; - err: if (parser != NULL && (tret = parser->close(parser)) != 0) { tret = util_err(session, tret, "WT_CONFIG_PARSER.close"); |