summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRavi Giri <ravi.giri@mongodb.com>2023-04-11 01:57:06 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2023-04-11 02:34:15 +0000
commit4439118eeb3570e2fcb84428e9913bbb37f866b1 (patch)
treeb5c8e0d190fb1026fa30b31b6302057be30ad6e2
parente95045c5d5416487eb82e8caf7d66a887c2370dc (diff)
downloadmongo-4439118eeb3570e2fcb84428e9913bbb37f866b1.tar.gz
Import wiredtiger: 88ae348f2978141067fafaa014b84eefcbfa65fc from branch mongodb-master
ref: 301819539a..88ae348f29 for: 7.0.0-rc0 WT-10734 block handle table
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/block/block_open.c5
-rw-r--r--src/third_party/wiredtiger/src/block/block_read.c13
-rw-r--r--src/third_party/wiredtiger/src/block_cache/block_map.c4
-rw-r--r--src/third_party/wiredtiger/src/block_cache/block_mgr.c143
-rw-r--r--src/third_party/wiredtiger/src/block_cache/block_tier.c52
-rw-r--r--src/third_party/wiredtiger/src/include/block.h23
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_tiered04.py8
9 files changed, 118 insertions, 134 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 39b64a13067..07903a22c35 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "301819539a5de604857534c0a73b0cc8a268ad88"
+ "commit": "88ae348f2978141067fafaa014b84eefcbfa65fc"
}
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
index bd024cdcf46..64f2fd64179 100644
--- a/src/third_party/wiredtiger/src/block/block_open.c
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -132,8 +132,6 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
}
__wt_free(session, block->name);
- __wt_spin_destroy(session, &block->cache_lock);
- __wt_free(session, block->related);
WT_TRET(__wt_close(session, &block->fh));
@@ -206,9 +204,6 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, uint32_t objecti
WT_CONN_BLOCK_INSERT(conn, block, bucket);
block->linked = true;
- /* Initialize the block cache layer lock. */
- WT_ERR(__wt_spin_init(session, &block->cache_lock, "block cache"));
-
/* If not passed an allocation size, get one from the configuration. */
if (allocsize == 0) {
WT_ERR(__wt_config_gets(session, cfg, "allocation_size", &cval));
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c
index 02ecfe7a57b..849400b2cf2 100644
--- a/src/third_party/wiredtiger/src/block/block_read.c
+++ b/src/third_party/wiredtiger/src/block/block_read.c
@@ -36,6 +36,10 @@ __wt_bm_read(
session, block, "read", offset, size, bm->is_live, __PRETTY_FUNCTION__, __LINE__));
#endif
+ /* Swap file handles if reading from a different object. */
+ if (block->objectid != objectid)
+ WT_RET(__wt_blkcache_get_handle(session, bm, objectid, &block));
+
/* Read the block. */
__wt_capacity_throttle(session, size, WT_THROTTLE_READ);
WT_RET(__wt_block_read_off(session, block, buf, objectid, offset, size, checksum));
@@ -162,15 +166,6 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uin
WT_STAT_CONN_INCR(session, block_read);
WT_STAT_CONN_INCRV(session, block_byte_read, size);
- /* Swap file handles if reading from a different object. */
-
- if (block->objectid != objectid)
- /*
- * Format has a private callback that is called when a search completes. Part of getting a
- * data handle may involve metadata searching, and we don't want that to interfere.
- */
- WT_RET(__wt_blkcache_get_handle(session, block, objectid, &block));
-
/*
* Grow the buffer as necessary and read the block. Buffers should be aligned for reading, but
* there are lots of buffers (for example, file cursors have two buffers each, key and value),
diff --git a/src/third_party/wiredtiger/src/block_cache/block_map.c b/src/third_party/wiredtiger/src/block_cache/block_map.c
index 1937b1b08d7..4cb241089ac 100644
--- a/src/third_party/wiredtiger/src/block_cache/block_map.c
+++ b/src/third_party/wiredtiger/src/block_cache/block_map.c
@@ -103,9 +103,9 @@ __wt_blkcache_map_read(
WT_RET(__wt_block_addr_unpack(
session, block, addr, addr_size, &objectid, &offset, &size, &checksum));
- /* Swap file handles if reading from a different object. */
+ /* Swap block handles if reading from a different object. */
if (block->objectid != objectid)
- WT_RET(__wt_blkcache_get_handle(session, block, objectid, &block));
+ WT_RET(__wt_blkcache_get_handle(session, bm, objectid, &block));
/* Map the block if it's possible. */
handle = block->fh->handle;
diff --git a/src/third_party/wiredtiger/src/block_cache/block_mgr.c b/src/third_party/wiredtiger/src/block_cache/block_mgr.c
index e65bb64b4ba..333e6708703 100644
--- a/src/third_party/wiredtiger/src/block_cache/block_mgr.c
+++ b/src/third_party/wiredtiger/src/block_cache/block_mgr.c
@@ -11,34 +11,13 @@
static void __bm_method_set(WT_BM *, bool);
/*
- * __bm_close_block_remove --
- * Remove a single block handle. Must be called with the block lock held.
- */
-static int
-__bm_close_block_remove(WT_SESSION_IMPL *session, WT_BLOCK *block)
-{
- u_int i;
-
- /* Discard any references we're holding. */
- for (i = 0; i < block->related_next; ++i) {
- --block->related[i]->ref;
- block->related[i] = NULL;
- }
-
- /* Discard the block structure. */
- return (__wt_block_close(session, block));
-}
-
-/*
* __bm_close_block --
- * Close a single block handle, removing the handle if it's no longer useful.
+ * Close a block handle.
*/
static int
__bm_close_block(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- bool found;
conn = S2C(session);
@@ -49,28 +28,16 @@ __bm_close_block(WT_SESSION_IMPL *session, WT_BLOCK *block)
__wt_spin_unlock(session, &conn->block_lock);
return (0);
}
+ __wt_spin_unlock(session, &conn->block_lock);
/* You can't close files during a checkpoint. */
WT_ASSERT(
session, block->ckpt_state == WT_CKPT_NONE || block->ckpt_state == WT_CKPT_PANIC_ON_FAILURE);
- /*
- * Every time we remove a block, we may have sufficiently decremented other references to allow
- * other blocks to be removed. It's unlikely for blocks to reference each other but it's not out
- * of the question, either. Loop until we don't find anything to close.
- */
- do {
- found = false;
- TAILQ_FOREACH (block, &conn->blockqh, q)
- if (block->ref == 0) {
- found = true;
- WT_TRET(__bm_close_block_remove(session, block));
- break;
- }
- } while (found);
- __wt_spin_unlock(session, &conn->block_lock);
+ if (block->sync_on_checkpoint)
+ WT_RET(__wt_fsync(session, block->fh, true));
- return (ret);
+ return (__wt_block_close(session, block));
}
/*
@@ -123,38 +90,43 @@ static int
__bm_checkpoint(
WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum)
{
- WT_BLOCK *block, *tblock;
- WT_CONNECTION_IMPL *conn;
+ WT_BLOCK *block;
+ u_int i;
bool found;
- conn = S2C(session);
block = bm->block;
WT_RET(__wt_block_checkpoint(session, block, buf, ckptbase, data_checksum));
+ if (!bm->is_multi_handle)
+ return (0);
/*
- * Close previous primary objects that are no longer being written, that is, ones where all
- * in-flight writes have drained. We know all writes have drained when a subsequent checkpoint
- * completes, and we know the metadata file is the last file to be checkpointed. After
- * checkpointing the metadata file, review any previous primary objects, flushing writes and
- * discarding the primary reference.
+ * For tiered tables, we need to fsync any previous active files to ensure the full checkpoint
+ * is persisted. We wait until now because there may have been in-progress writes to old files.
+ * But now we know those writes must have completed. Checkpoint ensures that all dirty pages of
+ * the tree have been written and eviction is disabled at this point, so no new data is getting
+ * written.
+ *
+ * We don't hold the handle array lock across fsync calls since those could be slow and that
+     * would block a concurrent thread opening a new block handle.
*/
- if (strcmp(WT_METAFILE, block->name) != 0)
- return (0);
do {
found = false;
- __wt_spin_lock(session, &conn->block_lock);
- TAILQ_FOREACH (tblock, &conn->blockqh, q)
- if (tblock->close_on_checkpoint) {
- tblock->close_on_checkpoint = false;
- __wt_spin_unlock(session, &conn->block_lock);
+ __wt_readlock(session, &bm->handle_array_lock);
+ for (i = 0; i < bm->handle_array_next; ++i) {
+ block = bm->handle_array[i];
+ if (block->sync_on_checkpoint) {
found = true;
- WT_RET(__wt_fsync(session, tblock->fh, true));
- WT_RET(__bm_close_block(session, tblock));
break;
}
+ }
+ __wt_readunlock(session, &bm->handle_array_lock);
+
+ if (found) {
+ WT_RET(__wt_fsync(session, block->fh, true));
+ block->sync_on_checkpoint = false;
+ }
} while (found);
- __wt_spin_unlock(session, &conn->block_lock);
return (0);
}
@@ -286,11 +258,26 @@ static int
__bm_close(WT_BM *bm, WT_SESSION_IMPL *session)
{
WT_DECL_RET;
+ u_int i;
if (bm == NULL) /* Safety check */
return (0);
- ret = __bm_close_block(session, bm->block);
+ if (!bm->is_multi_handle)
+ ret = __bm_close_block(session, bm->block);
+ else {
+ /*
+ * Higher-level code ensures that we can only have one call to close a block manager. So we
+ * don't need to lock the block handle array here.
+ *
+ * We don't need to explicitly close the active handle; it is also in the handle array.
+ */
+ for (i = 0; i < bm->handle_array_next; ++i)
+ WT_TRET(__bm_close_block(session, bm->handle_array[i]));
+
+ __wt_rwlock_destroy(session, &bm->handle_array_lock);
+ __wt_free(session, bm->handle_array);
+ }
__wt_overwrite_and_free(session, bm);
return (ret);
@@ -600,37 +587,23 @@ __bm_switch_object(WT_BM *bm, WT_SESSION_IMPL *session, uint32_t objectid)
current = bm->block;
- WT_RET(__wt_blkcache_tiered_open(session, NULL, objectid, &block));
+ /* We shouldn't ask to switch objects unless we actually need to switch objects */
+ WT_ASSERT(session, current->objectid != objectid);
+
+ WT_RET(__wt_blkcache_get_handle(session, bm, objectid, &block));
__wt_verbose(
session, WT_VERB_TIERED, "block manager switching from %s to %s", current->name, block->name);
- /* Fast-path switching to the current object, just undo the reference count increment. */
- if (block == current)
- return (__bm_close_block(session, block));
-
- /* Load a new object. */
+ /* This will be the new writable object. Load its checkpoint */
WT_RET(__wt_block_checkpoint_load(session, block, NULL, 0, NULL, &root_addr_size, false));
- /*
- * The previous object should be closed once writes have drained.
- *
- * FIXME: the old object does not participate in the upcoming checkpoint which has a couple of
- * implications. First, the extent lists for the old object are discarded and never written,
- * which makes it impossible to treat the old object as a standalone object, so, for example,
- * you can't verify it. A solution to this is for the upper layers to checkpoint all modified
- * objects in the logical object before the checkpoint updates the metadata, flushing all
- * underlying writes to stable storage, but that means writing extent lists without a root page.
- */
- current->close_on_checkpoint = true;
+    /* The previous object must be synced to disk as part of the next checkpoint. */
+ current->sync_on_checkpoint = true;
/*
* Swap out the block manager's default handler.
*
- * FIXME: the new block handle reasonably has different methods for objects in different backing
- * sources. That's not the case today, but the current architecture lacks the ability to support
- * multiple sources cleanly.
- *
* FIXME: it should not be possible for a thread of control to copy the WT_BM value in the btree
* layer, sleep until after a subsequent switch and a subsequent checkpoint that would discard
* the WT_BM it copied, but it would be worth thinking through those scenarios in detail to be
@@ -838,18 +811,30 @@ __wt_blkcache_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[],
WT_RET(__wt_calloc_one(session, &bm));
__bm_method_set(bm, false);
+ bm->is_multi_handle = false;
if (WT_PREFIX_MATCH(uri, "file:")) {
uri += strlen("file:");
WT_ERR(__wt_block_open(session, uri, WT_TIERED_OBJECTID_NONE, cfg, forced_salvage, readonly,
false, allocsize, &bm->block));
- } else
+ } else {
+ bm->is_multi_handle = true;
+ WT_ERR(__wt_rwlock_init(session, &bm->handle_array_lock));
+
+ /* Allocate space to store the handle (do first for simpler cleanup). */
+ WT_ERR(__wt_realloc_def(
+ session, &bm->handle_array_allocated, bm->handle_array_next + 1, &bm->handle_array));
+
+ /* Open the active file, and save in array */
WT_ERR(__wt_blkcache_tiered_open(session, uri, 0, &bm->block));
+ bm->handle_array[bm->handle_array_next++] = bm->block;
+ }
*bmp = bm;
return (0);
err:
+ __wt_rwlock_destroy(session, &bm->handle_array_lock);
__wt_free(session, bm);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/block_cache/block_tier.c b/src/third_party/wiredtiger/src/block_cache/block_tier.c
index deca87d0b0b..548262b628b 100644
--- a/src/third_party/wiredtiger/src/block_cache/block_tier.c
+++ b/src/third_party/wiredtiger/src/block_cache/block_tier.c
@@ -100,51 +100,51 @@ err:
* Get a cached block handle for an object, creating it if it doesn't exist.
*/
int
-__wt_blkcache_get_handle(
- WT_SESSION_IMPL *session, WT_BLOCK *current, uint32_t objectid, WT_BLOCK **blockp)
+__wt_blkcache_get_handle(WT_SESSION_IMPL *session, WT_BM *bm, uint32_t objectid, WT_BLOCK **blockp)
{
WT_DECL_RET;
u_int i;
*blockp = NULL;
- /* We should never be looking for our own object. */
- WT_ASSERT(session, current->objectid != objectid);
+ /* We should never be looking for current active file. */
+ WT_ASSERT(session, bm->block->objectid != objectid);
/*
- * Check the local cache for the object. We don't have to check the name because we can only
- * reference objects in our name space.
+ * Check the block handle array for the object. We don't have to check the name because we can
+ * only reference objects in our name space.
*/
- for (i = 0; i < current->related_next; ++i)
- if (current->related[i]->objectid == objectid) {
- *blockp = current->related[i];
+ __wt_readlock(session, &bm->handle_array_lock);
+ for (i = 0; i < bm->handle_array_next; ++i)
+ if (bm->handle_array[i]->objectid == objectid) {
+ *blockp = bm->handle_array[i];
+ __wt_readunlock(session, &bm->handle_array_lock);
return (0);
}
- /* Lock the block cache layer. */
- __wt_spin_lock(session, &current->cache_lock);
+    /* We need to add a new handle to the block handle array. Upgrade to a write lock. */
+ __wt_readunlock(session, &bm->handle_array_lock);
+ __wt_writelock(session, &bm->handle_array_lock);
/* Check to make sure the object wasn't cached while we locked. */
- for (i = 0; i < current->related_next; ++i)
- if (current->related[i]->objectid == objectid) {
- *blockp = current->related[i];
- break;
+ for (i = 0; i < bm->handle_array_next; ++i)
+ if (bm->handle_array[i]->objectid == objectid) {
+ *blockp = bm->handle_array[i];
+ __wt_writeunlock(session, &bm->handle_array_lock);
+ return (0);
}
- /* Open the object. */
- if (*blockp == NULL) {
- /* Allocate space to store a reference (do first for less complicated cleanup). */
- WT_ERR(__wt_realloc_def(
- session, &current->related_allocated, current->related_next + 1, &current->related));
+ /* Allocate space to store a new handle (do first for less complicated cleanup). */
+ WT_ERR(__wt_realloc_def(
+ session, &bm->handle_array_allocated, bm->handle_array_next + 1, &bm->handle_array));
- /* Get a reference to the object, opening it as necessary. */
- WT_ERR(__wt_blkcache_tiered_open(session, NULL, objectid, blockp));
+ /* Open the object */
+ WT_ERR(__wt_blkcache_tiered_open(session, NULL, objectid, blockp));
- /* Save a reference in the block in which we started for fast subsequent access. */
- current->related[current->related_next++] = *blockp;
- }
+ /* Add object to block handle array. */
+ bm->handle_array[bm->handle_array_next++] = *blockp;
err:
- __wt_spin_unlock(session, &current->cache_lock);
+ __wt_writeunlock(session, &bm->handle_array_lock);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
index dd6af8235ea..4f6ab05bdff 100644
--- a/src/third_party/wiredtiger/src/include/block.h
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -165,7 +165,7 @@ struct __wt_block_ckpt {
/*
* WT_BM --
- * Block manager handle, references a single checkpoint in a file.
+ * Block manager handle, references a single checkpoint in a btree.
*/
struct __wt_bm {
/* Methods */
@@ -205,13 +205,25 @@ struct __wt_bm {
int (*write)(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, bool, bool);
int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
- WT_BLOCK *block; /* Underlying file */
+ WT_BLOCK *block; /* Underlying file. For a multi-handle tree this will be the writable file. */
void *map; /* Mapped region */
size_t maplen;
void *mapped_cookie;
/*
+ * For trees, such as tiered tables, that are allowed to have more than one backing file or
+ * object, we maintain an array of the block handles used by the tree. We use a reader-writer
+ * mutex to protect the array. We lock it for reading when looking for a handle in the array and
+ * lock it for writing when adding or removing handles in the array.
+ */
+ bool is_multi_handle;
+ WT_BLOCK **handle_array; /* Array of block handles */
+ size_t handle_array_allocated; /* Size of handle array */
+ WT_RWLOCK handle_array_lock; /* Lock for block handle array */
+ u_int handle_array_next; /* Next open slot */
+
+ /*
* There's only a single block manager handle that can be written, all others are checkpoints.
*/
bool is_live; /* The live system */
@@ -230,18 +242,13 @@ struct __wt_block {
TAILQ_ENTRY(__wt_block) hashq; /* Hashed list of handles */
bool linked;
- WT_SPINLOCK cache_lock; /* Block cache layer lock */
- WT_BLOCK **related; /* Related objects */
- size_t related_allocated; /* Size of related object array */
- u_int related_next; /* Next open slot */
-
WT_FH *fh; /* Backing file handle */
wt_off_t size; /* File size */
wt_off_t extend_size; /* File extended size */
wt_off_t extend_len; /* File extend chunk size */
- bool close_on_checkpoint; /* Close the handle after the next checkpoint */
bool created_during_backup; /* Created during incremental backup */
+ bool sync_on_checkpoint; /* fsync the handle after the next checkpoint */
/* Configuration information, set when the file is opened. */
uint32_t allocfirst; /* Allocation is first-fit */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 593887e5c4c..5bc24c1b5da 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -88,7 +88,7 @@ extern int __wt_backup_open(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri)
WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_blkcache_get_handle(WT_SESSION_IMPL *session, WT_BLOCK *current, uint32_t objectid,
+extern int __wt_blkcache_get_handle(WT_SESSION_IMPL *session, WT_BM *bm, uint32_t objectid,
WT_BLOCK **blockp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_blkcache_map(WT_SESSION_IMPL *session, WT_BLOCK *block, void **mapped_regionp,
size_t *lengthp, void **mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered04.py b/src/third_party/wiredtiger/test/suite/test_tiered04.py
index 1c1b0d29335..5ca20dcddf8 100644
--- a/src/third_party/wiredtiger/test/suite/test_tiered04.py
+++ b/src/third_party/wiredtiger/test/suite/test_tiered04.py
@@ -153,9 +153,11 @@ class test_tiered04(wttest.WiredTigerTestCase, TieredConfigMixin):
time.sleep(1)
self.pr("Check removal of ")
self.pr(self.obj1file)
- self.assertFalse(os.path.exists(self.obj1file))
- remove2 = self.get_stat(stat.conn.local_objects_removed, None)
- self.assertTrue(remove2 > remove1)
+ # FIXME-WT-10838: We can't remove files from open tables because we don't know whether
+ # there are active read requests to those files.
+ # self.assertFalse(os.path.exists(self.obj1file))
+ # remove2 = self.get_stat(stat.conn.local_objects_removed, None)
+ # self.assertTrue(remove2 > remove1)
c = self.session.open_cursor(self.uri)
c["1"] = "1"