summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger
diff options
context:
space:
mode:
author: Clarisse Cheah <clarisse.cheah@mongodb.com> 2022-06-17 04:53:32 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-06-17 05:26:18 +0000
commit: 6dcf2594502b30a7199c3c0ff5e121132b2c0c8f (patch)
tree: 083286a8ff283ad4659b88149389a06e9e150cae /src/third_party/wiredtiger
parent: cee33e401cc9ff6e2d5d1514e13830f82fb4a080 (diff)
download: mongo-6dcf2594502b30a7199c3c0ff5e121132b2c0c8f.tar.gz
Import wiredtiger: cd83c1ce55d4c46bce5a16b56ed84c66ca340da9 from branch mongodb-master
ref: ba538eb02c..cd83c1ce55
for: 6.1.0-rc0
WT-9463 Fix a race opening checkpoint cursors
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_ckpt.c | 51
-rw-r--r-- src/third_party/wiredtiger/src/session/session_dhandle.c | 116
3 files changed, 96 insertions, 73 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 8106dd1c724..cee21f07d89 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "ba538eb02c89962857915aa020c26db78ca6441a"
+ "commit": "cd83c1ce55d4c46bce5a16b56ed84c66ca340da9"
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index f95a38ad858..60a783e86cd 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -1564,7 +1564,10 @@ err:
* __wt_meta_read_checkpoint_snapshot --
* Fetch the snapshot data for a checkpoint from the metadata file. Reads the selected named
* checkpoint's snapshot, or if the checkpoint name passed is null, the most recent checkpoint's
- * snapshot. The snapshot list returned is allocated and must be freed by the caller.
+ * snapshot. The snapshot list returned is allocated and must be freed by the caller. Can be
+ * called with NULL return parameters to avoid (in particular) bothering to allocate the
+ * snapshot data if it's not needed. Note that if you retrieve the snapshot data you must also
+ * retrieve the snapshot count.
*/
int
__wt_meta_read_checkpoint_snapshot(WT_SESSION_IMPL *session, const char *ckpt_name,
@@ -1598,10 +1601,14 @@ __wt_meta_read_checkpoint_snapshot(WT_SESSION_IMPL *session, const char *ckpt_na
/* Initialize to an empty snapshot. */
if (snap_write_gen != NULL)
*snap_write_gen = 0;
- *snap_min = WT_TXN_NONE;
- *snap_max = WT_TXN_NONE;
- *snapshot = NULL;
- *snapshot_count = 0;
+ if (snap_min != NULL)
+ *snap_min = WT_TXN_NONE;
+ if (snap_max != NULL)
+ *snap_max = WT_TXN_NONE;
+ if (snapshot != NULL)
+ *snapshot = NULL;
+ if (snapshot_count != NULL)
+ *snapshot_count = 0;
if (ckpttime != NULL)
*ckpttime = 0;
@@ -1618,20 +1625,25 @@ __wt_meta_read_checkpoint_snapshot(WT_SESSION_IMPL *session, const char *ckpt_na
/* Extract the components of the metadata string. */
if (sys_config != NULL) {
WT_CLEAR(cval);
- if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_MIN, &cval) == 0 &&
+ if (snap_min != NULL &&
+ __wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_MIN, &cval) == 0 &&
cval.len != 0)
*snap_min = (uint64_t)cval.val;
- if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_MAX, &cval) == 0 &&
+ if (snap_max != NULL &&
+ __wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_MAX, &cval) == 0 &&
cval.len != 0)
*snap_max = (uint64_t)cval.val;
- if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_COUNT, &cval) == 0 &&
+ if (snapshot_count != NULL &&
+ __wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_COUNT, &cval) == 0 &&
cval.len != 0)
*snapshot_count = (uint32_t)cval.val;
- if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT, &cval) == 0 &&
+ if (snapshot != NULL &&
+ __wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT, &cval) == 0 &&
cval.len != 0) {
+ WT_ASSERT(session, snapshot_count != NULL);
__wt_config_subinit(session, &list, &cval);
WT_ERR(__wt_calloc_def(session, *snapshot_count, snapshot));
while (__wt_config_subget_next(&list, &k) == 0)
@@ -1648,17 +1660,14 @@ __wt_meta_read_checkpoint_snapshot(WT_SESSION_IMPL *session, const char *ckpt_na
if (snap_write_gen != NULL)
*snap_write_gen = write_gen;
- if (ckpttime != NULL) {
- /*
- * If the write generation is current, extract the checkpoint time. Otherwise we use 0.
- */
- if (cval.val != 0 && write_gen >= conn->base_write_gen) {
- WT_ERR_NOTFOUND_OK(
- __wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_TIME, &cval),
- false);
- if (cval.val != 0)
- *ckpttime = (uint64_t)cval.val;
- }
+ /*
+ * If the write generation is current, extract the checkpoint time. Otherwise we use 0.
+ */
+ if (ckpttime != NULL && cval.val != 0 && write_gen >= conn->base_write_gen) {
+ WT_ERR_NOTFOUND_OK(
+ __wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_TIME, &cval), false);
+ if (cval.val != 0)
+ *ckpttime = (uint64_t)cval.val;
}
/*
@@ -1666,7 +1675,7 @@ __wt_meta_read_checkpoint_snapshot(WT_SESSION_IMPL *session, const char *ckpt_na
* transaction IDs between min and max.
*/
WT_ASSERT(session,
- *snapshot == NULL ||
+ snapshot == NULL || snap_min == NULL || snap_max == NULL || *snapshot == NULL ||
(*snapshot_count == counter && (*snapshot)[0] == *snap_min &&
(*snapshot)[counter - 1] < *snap_max));
}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index 106d4aebb4a..c8f39c28273 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -292,13 +292,16 @@ __session_fetch_checkpoint_meta(WT_SESSION_IMPL *session, const char *ckpt_name,
WT_CKPT_SNAPSHOT *info_ret, uint64_t *snapshot_time_ret, uint64_t *stable_time_ret,
uint64_t *oldest_time_ret)
{
- WT_DECL_RET;
- uint64_t *snapshot_txns;
+ /* Get the timestamps. */
+ WT_RET(__wt_meta_read_checkpoint_timestamp(
+ session, ckpt_name, &info_ret->stable_ts, stable_time_ret));
+ WT_RET(
+ __wt_meta_read_checkpoint_oldest(session, ckpt_name, &info_ret->oldest_ts, oldest_time_ret));
- /* Get the snapshot first; it's written last as the checkpoint completes. */
+ /* Get the snapshot. */
WT_RET(__wt_meta_read_checkpoint_snapshot(session, ckpt_name, &info_ret->snapshot_write_gen,
- &info_ret->snapshot_min, &info_ret->snapshot_max, &snapshot_txns, &info_ret->snapshot_count,
- snapshot_time_ret));
+ &info_ret->snapshot_min, &info_ret->snapshot_max, &info_ret->snapshot_txns,
+ &info_ret->snapshot_count, snapshot_time_ret));
/*
* If we successfully read a null snapshot, set the min and max to WT_TXN_MAX so everything is
@@ -309,22 +312,22 @@ __session_fetch_checkpoint_meta(WT_SESSION_IMPL *session, const char *ckpt_name,
*/
if (info_ret->snapshot_min == WT_TXN_NONE && info_ret->snapshot_max == WT_TXN_NONE) {
info_ret->snapshot_min = info_ret->snapshot_max = WT_TXN_MAX;
- WT_ASSERT(session, snapshot_txns == NULL && info_ret->snapshot_count == 0);
+ WT_ASSERT(session, info_ret->snapshot_txns == NULL && info_ret->snapshot_count == 0);
}
- /* Get the timestamps. */
- WT_ERR(__wt_meta_read_checkpoint_timestamp(
- session, ckpt_name, &info_ret->stable_ts, stable_time_ret));
- WT_ERR(
- __wt_meta_read_checkpoint_oldest(session, ckpt_name, &info_ret->oldest_ts, oldest_time_ret));
-
- /* Wait until we succeed to assign this, to be sure it can't be cleaned up twice. */
- info_ret->snapshot_txns = snapshot_txns;
return (0);
+}
-err:
- __wt_free(session, snapshot_txns);
- return (ret);
+/*
+ * __session_fetch_checkpoint_snapshot_wall_time --
+ * Like __session_fetch_checkpoint_meta, but retrieves just the wall clock time of the snapshot.
+ */
+static int
+__session_fetch_checkpoint_snapshot_wall_time(
+ WT_SESSION_IMPL *session, const char *ckpt_name, uint64_t *walltime)
+{
+ return (__wt_meta_read_checkpoint_snapshot(
+ session, ckpt_name, NULL, NULL, NULL, NULL, NULL, walltime));
}
/*
@@ -365,12 +368,12 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
{
WT_CONFIG_ITEM cval;
WT_DECL_RET;
- uint64_t ds_time, hs_time, oldest_time, snapshot_time, stable_time;
+ uint64_t ds_time, first_snapshot_time, hs_time, oldest_time, snapshot_time, stable_time;
int64_t ds_order, hs_order;
const char *checkpoint, *hs_checkpoint;
bool is_unnamed_ckpt, must_resolve;
- ds_time = hs_time = oldest_time = snapshot_time = stable_time = 0;
+ ds_time = first_snapshot_time = hs_time = oldest_time = snapshot_time = stable_time = 0;
ds_order = hs_order = 0;
checkpoint = NULL;
hs_checkpoint = NULL;
@@ -434,28 +437,29 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
* because unnamed checkpoints are never replaced, but for named checkpoints it's possible for
* the open to race with regeneration of the checkpoint.)
*
- * Because the snapshot and timestamp information is always written by every checkpoint, and is
- * written last, it always gives the wall clock time of the most recent completed global
- * checkpoint. If either the data store or history store checkpoint has a newer wall clock time,
- * it must be from a currently running checkpoint and does not match the snapshot; therefore we
- * must retry or fail. If both have the same or an older wall clock time, they are from the same
- * or an older checkpoint and can be presumed to match.
+ * Because the snapshot information is always written by every checkpoint, and is written last,
+ * we use its wall clock time as the reference. This is always the wall clock time of the most
+ * recent completed global checkpoint of the same name, or the most recent completed unnamed
+ * checkpoint, as appropriate. We read this time twice, once at the very beginning and again
+ * along with the snapshot information itself at the end after the other items. If these two
+ * times don't match, a global checkpoint completed while we were reading. In this case we
+ * cannot tell for sure if we read one of the trees' metadata before the checkpoint updated it;
+ * if the tree's wall clock time is older than the snapshot's, it might be because that tree was
+ * skipped, but it might also be because there was an update but we read before the update
+ * happened. Therefore, we need to retry.
*
- * A slight complication is that the snapshot and timestamp information is three separate pieces
- * of metadata; we read the time from all three and if they don't agree, it must be because a
- * checkpoint is finishing at this very moment, so we retry.
+ * If the two copies of the snapshot time match, we check the other wall clock times against the
+ * snapshot time. If any of the items are newer, they were written by a currently running
+ * checkpoint that hasn't finished yet, and we need to retry.
*
- * (It is actually slightly more complicated: either timestamp might not be present, in which
+ * (For the timestamps it is slightly easier; either timestamp might not be present, in which
* case both the timestamp and its associated time will read back as zero. We take advantage of
* the knowledge that for both these timestamps the system cannot transition from a state with
* the timestamp set to one where it is not, and therefore once any checkpoint includes either
- * timestamp, every subsequent checkpoint will too. Since the snapshot is written after both
- * timestamps, we read it first. Then for each timestamp, if we read it and find it present, it
- * must be from the same checkpoint as the snapshot or the next. If it isn't present, its
- * absence might technically be associated with the next checkpoint, but if so it cannot have
- * been present in the snapshot's checkpoint either and we are ok to proceed. So we retry if
- * either timestamp's wall time is newer than the snapshot's. Then, to partially crosscheck this
- * logic we assert that the wall time is either the same as the snapshot's or zero.)
+ * timestamp, every subsequent checkpoint will too. Therefore, the timestamps' wall times should
+ * either match the snapshot or be zero; and if they're zero, it doesn't matter if they were
+ * actually zero in a newer, currently running checkpoint, because then they must have always
+ * been zero.)
*
* This scheme relies on the fact we take steps to make sure that the checkpoint wall clock time
* does not run backward, and that successive checkpoints are never given the same wall clock
@@ -482,7 +486,10 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
/* We're opening the history store directly, so don't open it twice. */
hs_dhandlep = NULL;
- /* Test for the internal checkpoint name (WiredTigerCheckpoint). */
+ /*
+ * Test for the internal checkpoint name (WiredTigerCheckpoint). Note: must_resolve is true in a
+ * subset of the cases where is_unnamed_ckpt is true.
+ */
must_resolve = WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len);
is_unnamed_ckpt = cval.len >= strlen(WT_CHECKPOINT) && WT_PREFIX_MATCH(cval.str, WT_CHECKPOINT);
@@ -490,20 +497,28 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
do {
ret = 0;
- if (ckpt_snapshot != NULL)
+ if (!must_resolve)
+ /* Copy the checkpoint name first because we may need it to get the first wall time. */
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint));
+
+ if (ckpt_snapshot != NULL) {
/* We're about to re-fetch this; discard the prior version. No effect the first time. */
__wt_free(session, ckpt_snapshot->snapshot_txns);
- /* Look up the data store checkpoint. */
+ /*
+ * Now, as the first step of the retrieval process, get the wall-clock time of the
+ * snapshot metadata (only). If we need the name, we'll have copied it already.
+ */
+ WT_RET(__session_fetch_checkpoint_snapshot_wall_time(
+ session, is_unnamed_ckpt ? NULL : checkpoint, &first_snapshot_time));
+ }
+
if (must_resolve)
+ /* Look up the most recent data store checkpoint. This fetches the exact name to use. */
WT_RET(__wt_meta_checkpoint_last_name(session, uri, &checkpoint, &ds_order, &ds_time));
- else {
- /* Copy the checkpoint name. */
- WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint));
-
- /* Look up the checkpoint and get its time and order information. */
+ else
+ /* Look up the checkpoint by name and get its time and order information. */
WT_RET(__wt_meta_checkpoint_by_name(session, uri, checkpoint, &ds_order, &ds_time));
- }
/* Look up the history store checkpoint. */
if (hs_dhandlep != NULL) {
@@ -532,10 +547,9 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
/*
* Check if we raced with a running checkpoint.
*
- * If either timestamp metadata time is newer than the snapshot, we read in the middle
- * of that material being updated and we need to retry. If that didn't happen, then
- * check if either the data store or history store checkpoint time is newer than the
- * metadata time. In either case we need to retry.
+ * If the two copies of the snapshot don't match, or if any of the other metadata items'
+ * time is newer than the snapshot, we read in the middle of that material being updated
+ * and we need to retry.
*
* Otherwise we have successfully gotten a matching set, as described above.
*
@@ -548,8 +562,8 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
* forever.
*/
- if (ds_time > snapshot_time || hs_time > snapshot_time || stable_time > snapshot_time ||
- oldest_time > snapshot_time)
+ if (first_snapshot_time != snapshot_time || ds_time > snapshot_time ||
+ hs_time > snapshot_time || stable_time > snapshot_time || oldest_time > snapshot_time)
ret = __wt_set_return(session, EBUSY);
else {
/* Crosscheck that we didn't somehow get an older timestamp. */