diff options
author | Ravi Giri <ravi.giri@mongodb.com> | 2023-04-11 01:57:06 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-04-11 02:34:15 +0000 |
commit | 4439118eeb3570e2fcb84428e9913bbb37f866b1 (patch) | |
tree | b5c8e0d190fb1026fa30b31b6302057be30ad6e2 | |
parent | e95045c5d5416487eb82e8caf7d66a887c2370dc (diff) | |
download | mongo-4439118eeb3570e2fcb84428e9913bbb37f866b1.tar.gz |
Import wiredtiger: 88ae348f2978141067fafaa014b84eefcbfa65fc from branch mongodb-master
ref: 301819539a..88ae348f29
for: 7.0.0-rc0
WT-10734 block handle table
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_open.c | 5 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_read.c | 13 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block_cache/block_map.c | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block_cache/block_mgr.c | 143 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block_cache/block_tier.c | 52 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/block.h | 23 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/extern.h | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/suite/test_tiered04.py | 8 |
9 files changed, 118 insertions, 134 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 39b64a13067..07903a22c35 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "301819539a5de604857534c0a73b0cc8a268ad88" + "commit": "88ae348f2978141067fafaa014b84eefcbfa65fc" } diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index bd024cdcf46..64f2fd64179 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -132,8 +132,6 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) } __wt_free(session, block->name); - __wt_spin_destroy(session, &block->cache_lock); - __wt_free(session, block->related); WT_TRET(__wt_close(session, &block->fh)); @@ -206,9 +204,6 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, uint32_t objecti WT_CONN_BLOCK_INSERT(conn, block, bucket); block->linked = true; - /* Initialize the block cache layer lock. */ - WT_ERR(__wt_spin_init(session, &block->cache_lock, "block cache")); - /* If not passed an allocation size, get one from the configuration. */ if (allocsize == 0) { WT_ERR(__wt_config_gets(session, cfg, "allocation_size", &cval)); diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c index 02ecfe7a57b..849400b2cf2 100644 --- a/src/third_party/wiredtiger/src/block/block_read.c +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -36,6 +36,10 @@ __wt_bm_read( session, block, "read", offset, size, bm->is_live, __PRETTY_FUNCTION__, __LINE__)); #endif + /* Swap file handles if reading from a different object. */ + if (block->objectid != objectid) + WT_RET(__wt_blkcache_get_handle(session, bm, objectid, &block)); + /* Read the block. 
*/ __wt_capacity_throttle(session, size, WT_THROTTLE_READ); WT_RET(__wt_block_read_off(session, block, buf, objectid, offset, size, checksum)); @@ -162,15 +166,6 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uin WT_STAT_CONN_INCR(session, block_read); WT_STAT_CONN_INCRV(session, block_byte_read, size); - /* Swap file handles if reading from a different object. */ - - if (block->objectid != objectid) - /* - * Format has a private callback that is called when a search completes. Part of getting a - * data handle may involve metadata searching, and we don't want that to interfere. - */ - WT_RET(__wt_blkcache_get_handle(session, block, objectid, &block)); - /* * Grow the buffer as necessary and read the block. Buffers should be aligned for reading, but * there are lots of buffers (for example, file cursors have two buffers each, key and value), diff --git a/src/third_party/wiredtiger/src/block_cache/block_map.c b/src/third_party/wiredtiger/src/block_cache/block_map.c index 1937b1b08d7..4cb241089ac 100644 --- a/src/third_party/wiredtiger/src/block_cache/block_map.c +++ b/src/third_party/wiredtiger/src/block_cache/block_map.c @@ -103,9 +103,9 @@ __wt_blkcache_map_read( WT_RET(__wt_block_addr_unpack( session, block, addr, addr_size, &objectid, &offset, &size, &checksum)); - /* Swap file handles if reading from a different object. */ + /* Swap block handles if reading from a different object. */ if (block->objectid != objectid) - WT_RET(__wt_blkcache_get_handle(session, block, objectid, &block)); + WT_RET(__wt_blkcache_get_handle(session, bm, objectid, &block)); /* Map the block if it's possible. 
*/ handle = block->fh->handle; diff --git a/src/third_party/wiredtiger/src/block_cache/block_mgr.c b/src/third_party/wiredtiger/src/block_cache/block_mgr.c index e65bb64b4ba..333e6708703 100644 --- a/src/third_party/wiredtiger/src/block_cache/block_mgr.c +++ b/src/third_party/wiredtiger/src/block_cache/block_mgr.c @@ -11,34 +11,13 @@ static void __bm_method_set(WT_BM *, bool); /* - * __bm_close_block_remove -- - * Remove a single block handle. Must be called with the block lock held. - */ -static int -__bm_close_block_remove(WT_SESSION_IMPL *session, WT_BLOCK *block) -{ - u_int i; - - /* Discard any references we're holding. */ - for (i = 0; i < block->related_next; ++i) { - --block->related[i]->ref; - block->related[i] = NULL; - } - - /* Discard the block structure. */ - return (__wt_block_close(session, block)); -} - -/* * __bm_close_block -- - * Close a single block handle, removing the handle if it's no longer useful. + * Close a block handle. */ static int __bm_close_block(WT_SESSION_IMPL *session, WT_BLOCK *block) { WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - bool found; conn = S2C(session); @@ -49,28 +28,16 @@ __bm_close_block(WT_SESSION_IMPL *session, WT_BLOCK *block) __wt_spin_unlock(session, &conn->block_lock); return (0); } + __wt_spin_unlock(session, &conn->block_lock); /* You can't close files during a checkpoint. */ WT_ASSERT( session, block->ckpt_state == WT_CKPT_NONE || block->ckpt_state == WT_CKPT_PANIC_ON_FAILURE); - /* - * Every time we remove a block, we may have sufficiently decremented other references to allow - * other blocks to be removed. It's unlikely for blocks to reference each other but it's not out - * of the question, either. Loop until we don't find anything to close. 
- */ - do { - found = false; - TAILQ_FOREACH (block, &conn->blockqh, q) - if (block->ref == 0) { - found = true; - WT_TRET(__bm_close_block_remove(session, block)); - break; - } - } while (found); - __wt_spin_unlock(session, &conn->block_lock); + if (block->sync_on_checkpoint) + WT_RET(__wt_fsync(session, block->fh, true)); - return (ret); + return (__wt_block_close(session, block)); } /* @@ -123,38 +90,43 @@ static int __bm_checkpoint( WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum) { - WT_BLOCK *block, *tblock; - WT_CONNECTION_IMPL *conn; + WT_BLOCK *block; + u_int i; bool found; - conn = S2C(session); block = bm->block; WT_RET(__wt_block_checkpoint(session, block, buf, ckptbase, data_checksum)); + if (!bm->is_multi_handle) + return (0); /* - * Close previous primary objects that are no longer being written, that is, ones where all - * in-flight writes have drained. We know all writes have drained when a subsequent checkpoint - * completes, and we know the metadata file is the last file to be checkpointed. After - * checkpointing the metadata file, review any previous primary objects, flushing writes and - * discarding the primary reference. + * For tiered tables, we need to fsync any previous active files to ensure the full checkpoint + * is persisted. We wait until now because there may have been in-progress writes to old files. + * But now we know those writes must have completed. Checkpoint ensures that all dirty pages of + * the tree have been written and eviction is disabled at this point, so no new data is getting + * written. 
+ * + * We don't hold the handle array lock across fsync calls since those could be slow and that + * would block a concurrent thread opening a new block handle */ - if (strcmp(WT_METAFILE, block->name) != 0) - return (0); do { found = false; - __wt_spin_lock(session, &conn->block_lock); - TAILQ_FOREACH (tblock, &conn->blockqh, q) - if (tblock->close_on_checkpoint) { - tblock->close_on_checkpoint = false; - __wt_spin_unlock(session, &conn->block_lock); + __wt_readlock(session, &bm->handle_array_lock); + for (i = 0; i < bm->handle_array_next; ++i) { + block = bm->handle_array[i]; + if (block->sync_on_checkpoint) { found = true; - WT_RET(__wt_fsync(session, tblock->fh, true)); - WT_RET(__bm_close_block(session, tblock)); break; } + } + __wt_readunlock(session, &bm->handle_array_lock); + + if (found) { + WT_RET(__wt_fsync(session, block->fh, true)); + block->sync_on_checkpoint = false; + } } while (found); - __wt_spin_unlock(session, &conn->block_lock); return (0); } @@ -286,11 +258,26 @@ static int __bm_close(WT_BM *bm, WT_SESSION_IMPL *session) { WT_DECL_RET; + u_int i; if (bm == NULL) /* Safety check */ return (0); - ret = __bm_close_block(session, bm->block); + if (!bm->is_multi_handle) + ret = __bm_close_block(session, bm->block); + else { + /* + * Higher-level code ensures that we can only have one call to close a block manager. So we + * don't need to lock the block handle array here. + * + * We don't need to explicitly close the active handle; it is also in the handle array. 
+ */ + for (i = 0; i < bm->handle_array_next; ++i) + WT_TRET(__bm_close_block(session, bm->handle_array[i])); + + __wt_rwlock_destroy(session, &bm->handle_array_lock); + __wt_free(session, bm->handle_array); + } __wt_overwrite_and_free(session, bm); return (ret); @@ -600,37 +587,23 @@ __bm_switch_object(WT_BM *bm, WT_SESSION_IMPL *session, uint32_t objectid) current = bm->block; - WT_RET(__wt_blkcache_tiered_open(session, NULL, objectid, &block)); + /* We shouldn't ask to switch objects unless we actually need to switch objects */ + WT_ASSERT(session, current->objectid != objectid); + + WT_RET(__wt_blkcache_get_handle(session, bm, objectid, &block)); __wt_verbose( session, WT_VERB_TIERED, "block manager switching from %s to %s", current->name, block->name); - /* Fast-path switching to the current object, just undo the reference count increment. */ - if (block == current) - return (__bm_close_block(session, block)); - - /* Load a new object. */ + /* This will be the new writable object. Load its checkpoint */ WT_RET(__wt_block_checkpoint_load(session, block, NULL, 0, NULL, &root_addr_size, false)); - /* - * The previous object should be closed once writes have drained. - * - * FIXME: the old object does not participate in the upcoming checkpoint which has a couple of - * implications. First, the extent lists for the old object are discarded and never written, - * which makes it impossible to treat the old object as a standalone object, so, for example, - * you can't verify it. A solution to this is for the upper layers to checkpoint all modified - * objects in the logical object before the checkpoint updates the metadata, flushing all - * underlying writes to stable storage, but that means writing extent lists without a root page. - */ - current->close_on_checkpoint = true; + /* The previous object must by synced to disk as part of the next checkpoint. */ + current->sync_on_checkpoint = true; /* * Swap out the block manager's default handler. 
* - * FIXME: the new block handle reasonably has different methods for objects in different backing - * sources. That's not the case today, but the current architecture lacks the ability to support - * multiple sources cleanly. - * * FIXME: it should not be possible for a thread of control to copy the WT_BM value in the btree * layer, sleep until after a subsequent switch and a subsequent a checkpoint that would discard * the WT_BM it copied, but it would be worth thinking through those scenarios in detail to be @@ -838,18 +811,30 @@ __wt_blkcache_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_RET(__wt_calloc_one(session, &bm)); __bm_method_set(bm, false); + bm->is_multi_handle = false; if (WT_PREFIX_MATCH(uri, "file:")) { uri += strlen("file:"); WT_ERR(__wt_block_open(session, uri, WT_TIERED_OBJECTID_NONE, cfg, forced_salvage, readonly, false, allocsize, &bm->block)); - } else + } else { + bm->is_multi_handle = true; + WT_ERR(__wt_rwlock_init(session, &bm->handle_array_lock)); + + /* Allocate space to store the handle (do first for simpler cleanup). */ + WT_ERR(__wt_realloc_def( + session, &bm->handle_array_allocated, bm->handle_array_next + 1, &bm->handle_array)); + + /* Open the active file, and save in array */ WT_ERR(__wt_blkcache_tiered_open(session, uri, 0, &bm->block)); + bm->handle_array[bm->handle_array_next++] = bm->block; + } *bmp = bm; return (0); err: + __wt_rwlock_destroy(session, &bm->handle_array_lock); __wt_free(session, bm); return (ret); } diff --git a/src/third_party/wiredtiger/src/block_cache/block_tier.c b/src/third_party/wiredtiger/src/block_cache/block_tier.c index deca87d0b0b..548262b628b 100644 --- a/src/third_party/wiredtiger/src/block_cache/block_tier.c +++ b/src/third_party/wiredtiger/src/block_cache/block_tier.c @@ -100,51 +100,51 @@ err: * Get a cached block handle for an object, creating it if it doesn't exist. 
*/ int -__wt_blkcache_get_handle( - WT_SESSION_IMPL *session, WT_BLOCK *current, uint32_t objectid, WT_BLOCK **blockp) +__wt_blkcache_get_handle(WT_SESSION_IMPL *session, WT_BM *bm, uint32_t objectid, WT_BLOCK **blockp) { WT_DECL_RET; u_int i; *blockp = NULL; - /* We should never be looking for our own object. */ - WT_ASSERT(session, current->objectid != objectid); + /* We should never be looking for current active file. */ + WT_ASSERT(session, bm->block->objectid != objectid); /* - * Check the local cache for the object. We don't have to check the name because we can only - * reference objects in our name space. + * Check the block handle array for the object. We don't have to check the name because we can + * only reference objects in our name space. */ - for (i = 0; i < current->related_next; ++i) - if (current->related[i]->objectid == objectid) { - *blockp = current->related[i]; + __wt_readlock(session, &bm->handle_array_lock); + for (i = 0; i < bm->handle_array_next; ++i) + if (bm->handle_array[i]->objectid == objectid) { + *blockp = bm->handle_array[i]; + __wt_readunlock(session, &bm->handle_array_lock); return (0); } - /* Lock the block cache layer. */ - __wt_spin_lock(session, &current->cache_lock); + /* We need to add a new handle the block handle array. Upgrade to a write lock. */ + __wt_readunlock(session, &bm->handle_array_lock); + __wt_writelock(session, &bm->handle_array_lock); /* Check to make sure the object wasn't cached while we locked. */ - for (i = 0; i < current->related_next; ++i) - if (current->related[i]->objectid == objectid) { - *blockp = current->related[i]; - break; + for (i = 0; i < bm->handle_array_next; ++i) + if (bm->handle_array[i]->objectid == objectid) { + *blockp = bm->handle_array[i]; + __wt_writeunlock(session, &bm->handle_array_lock); + return (0); } - /* Open the object. */ - if (*blockp == NULL) { - /* Allocate space to store a reference (do first for less complicated cleanup). 
*/ - WT_ERR(__wt_realloc_def( - session, &current->related_allocated, current->related_next + 1, &current->related)); + /* Allocate space to store a new handle (do first for less complicated cleanup). */ + WT_ERR(__wt_realloc_def( + session, &bm->handle_array_allocated, bm->handle_array_next + 1, &bm->handle_array)); - /* Get a reference to the object, opening it as necessary. */ - WT_ERR(__wt_blkcache_tiered_open(session, NULL, objectid, blockp)); + /* Open the object */ + WT_ERR(__wt_blkcache_tiered_open(session, NULL, objectid, blockp)); - /* Save a reference in the block in which we started for fast subsequent access. */ - current->related[current->related_next++] = *blockp; - } + /* Add object to block handle array. */ + bm->handle_array[bm->handle_array_next++] = *blockp; err: - __wt_spin_unlock(session, &current->cache_lock); + __wt_writeunlock(session, &bm->handle_array_lock); return (ret); } diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h index dd6af8235ea..4f6ab05bdff 100644 --- a/src/third_party/wiredtiger/src/include/block.h +++ b/src/third_party/wiredtiger/src/include/block.h @@ -165,7 +165,7 @@ struct __wt_block_ckpt { /* * WT_BM -- - * Block manager handle, references a single checkpoint in a file. + * Block manager handle, references a single checkpoint in a btree. */ struct __wt_bm { /* Methods */ @@ -205,13 +205,25 @@ struct __wt_bm { int (*write)(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, bool, bool); int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *); - WT_BLOCK *block; /* Underlying file */ + WT_BLOCK *block; /* Underlying file. For a multi-handle tree this will be the writable file. */ void *map; /* Mapped region */ size_t maplen; void *mapped_cookie; /* + * For trees, such as tiered tables, that are allowed to have more than one backing file or + * object, we maintain an array of the block handles used by the tree. We use a reader-writer + * mutex to protect the array. 
We lock it for reading when looking for a handle in the array and + * lock it for writing when adding or removing handles in the array. + */ + bool is_multi_handle; + WT_BLOCK **handle_array; /* Array of block handles */ + size_t handle_array_allocated; /* Size of handle array */ + WT_RWLOCK handle_array_lock; /* Lock for block handle array */ + u_int handle_array_next; /* Next open slot */ + + /* * There's only a single block manager handle that can be written, all others are checkpoints. */ bool is_live; /* The live system */ @@ -230,18 +242,13 @@ struct __wt_block { TAILQ_ENTRY(__wt_block) hashq; /* Hashed list of handles */ bool linked; - WT_SPINLOCK cache_lock; /* Block cache layer lock */ - WT_BLOCK **related; /* Related objects */ - size_t related_allocated; /* Size of related object array */ - u_int related_next; /* Next open slot */ - WT_FH *fh; /* Backing file handle */ wt_off_t size; /* File size */ wt_off_t extend_size; /* File extended size */ wt_off_t extend_len; /* File extend chunk size */ - bool close_on_checkpoint; /* Close the handle after the next checkpoint */ bool created_during_backup; /* Created during incremental backup */ + bool sync_on_checkpoint; /* fsync the handle after the next checkpoint */ /* Configuration information, set when the file is opened. 
*/ uint32_t allocfirst; /* Allocation is first-fit */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 593887e5c4c..5bc24c1b5da 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -88,7 +88,7 @@ extern int __wt_backup_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_blkcache_get_handle(WT_SESSION_IMPL *session, WT_BLOCK *current, uint32_t objectid, +extern int __wt_blkcache_get_handle(WT_SESSION_IMPL *session, WT_BM *bm, uint32_t objectid, WT_BLOCK **blockp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_blkcache_map(WT_SESSION_IMPL *session, WT_BLOCK *block, void **mapped_regionp, size_t *lengthp, void **mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/test/suite/test_tiered04.py b/src/third_party/wiredtiger/test/suite/test_tiered04.py index 1c1b0d29335..5ca20dcddf8 100644 --- a/src/third_party/wiredtiger/test/suite/test_tiered04.py +++ b/src/third_party/wiredtiger/test/suite/test_tiered04.py @@ -153,9 +153,11 @@ class test_tiered04(wttest.WiredTigerTestCase, TieredConfigMixin): time.sleep(1) self.pr("Check removal of ") self.pr(self.obj1file) - self.assertFalse(os.path.exists(self.obj1file)) - remove2 = self.get_stat(stat.conn.local_objects_removed, None) - self.assertTrue(remove2 > remove1) + # FIXME-WT-10838: We can't remove files from open tables because we don't know whether + # there are active read requests to those files. + # self.assertFalse(os.path.exists(self.obj1file)) + # remove2 = self.get_stat(stat.conn.local_objects_removed, None) + # self.assertTrue(remove2 > remove1) c = self.session.open_cursor(self.uri) c["1"] = "1" |