summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/session/session_dhandle.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/session/session_dhandle.c')
-rw-r--r--src/third_party/wiredtiger/src/session/session_dhandle.c116
1 files changed, 65 insertions, 51 deletions
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index 106d4aebb4a..c8f39c28273 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -292,13 +292,16 @@ __session_fetch_checkpoint_meta(WT_SESSION_IMPL *session, const char *ckpt_name,
WT_CKPT_SNAPSHOT *info_ret, uint64_t *snapshot_time_ret, uint64_t *stable_time_ret,
uint64_t *oldest_time_ret)
{
- WT_DECL_RET;
- uint64_t *snapshot_txns;
+ /* Get the timestamps. */
+ WT_RET(__wt_meta_read_checkpoint_timestamp(
+ session, ckpt_name, &info_ret->stable_ts, stable_time_ret));
+ WT_RET(
+ __wt_meta_read_checkpoint_oldest(session, ckpt_name, &info_ret->oldest_ts, oldest_time_ret));
- /* Get the snapshot first; it's written last as the checkpoint completes. */
+ /* Get the snapshot. */
WT_RET(__wt_meta_read_checkpoint_snapshot(session, ckpt_name, &info_ret->snapshot_write_gen,
- &info_ret->snapshot_min, &info_ret->snapshot_max, &snapshot_txns, &info_ret->snapshot_count,
- snapshot_time_ret));
+ &info_ret->snapshot_min, &info_ret->snapshot_max, &info_ret->snapshot_txns,
+ &info_ret->snapshot_count, snapshot_time_ret));
/*
* If we successfully read a null snapshot, set the min and max to WT_TXN_MAX so everything is
@@ -309,22 +312,22 @@ __session_fetch_checkpoint_meta(WT_SESSION_IMPL *session, const char *ckpt_name,
*/
if (info_ret->snapshot_min == WT_TXN_NONE && info_ret->snapshot_max == WT_TXN_NONE) {
info_ret->snapshot_min = info_ret->snapshot_max = WT_TXN_MAX;
- WT_ASSERT(session, snapshot_txns == NULL && info_ret->snapshot_count == 0);
+ WT_ASSERT(session, info_ret->snapshot_txns == NULL && info_ret->snapshot_count == 0);
}
- /* Get the timestamps. */
- WT_ERR(__wt_meta_read_checkpoint_timestamp(
- session, ckpt_name, &info_ret->stable_ts, stable_time_ret));
- WT_ERR(
- __wt_meta_read_checkpoint_oldest(session, ckpt_name, &info_ret->oldest_ts, oldest_time_ret));
-
- /* Wait until we succeed to assign this, to be sure it can't be cleaned up twice. */
- info_ret->snapshot_txns = snapshot_txns;
return (0);
+}
-err:
- __wt_free(session, snapshot_txns);
- return (ret);
+/*
+ * __session_fetch_checkpoint_snapshot_wall_time --
+ * Like __session_fetch_checkpoint_meta, but retrieves just the wall clock time of the snapshot.
+ */
+static int
+__session_fetch_checkpoint_snapshot_wall_time(
+ WT_SESSION_IMPL *session, const char *ckpt_name, uint64_t *walltime)
+{
+ return (__wt_meta_read_checkpoint_snapshot(
+ session, ckpt_name, NULL, NULL, NULL, NULL, NULL, walltime));
}
/*
@@ -365,12 +368,12 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
{
WT_CONFIG_ITEM cval;
WT_DECL_RET;
- uint64_t ds_time, hs_time, oldest_time, snapshot_time, stable_time;
+ uint64_t ds_time, first_snapshot_time, hs_time, oldest_time, snapshot_time, stable_time;
int64_t ds_order, hs_order;
const char *checkpoint, *hs_checkpoint;
bool is_unnamed_ckpt, must_resolve;
- ds_time = hs_time = oldest_time = snapshot_time = stable_time = 0;
+ ds_time = first_snapshot_time = hs_time = oldest_time = snapshot_time = stable_time = 0;
ds_order = hs_order = 0;
checkpoint = NULL;
hs_checkpoint = NULL;
@@ -434,28 +437,29 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
* because unnamed checkpoints are never replaced, but for named checkpoints it's possible for
* the open to race with regeneration of the checkpoint.)
*
- * Because the snapshot and timestamp information is always written by every checkpoint, and is
- * written last, it always gives the wall clock time of the most recent completed global
- * checkpoint. If either the data store or history store checkpoint has a newer wall clock time,
- * it must be from a currently running checkpoint and does not match the snapshot; therefore we
- * must retry or fail. If both have the same or an older wall clock time, they are from the same
- * or an older checkpoint and can be presumed to match.
+ * Because the snapshot information is always written by every checkpoint, and is written last,
+ * we use its wall clock time as the reference. This is always the wall clock time of the most
+ * recent completed global checkpoint of the same name, or the most recent completed unnamed
+ * checkpoint, as appropriate. We read this time twice, once at the very beginning and again
+ * along with the snapshot information itself at the end after the other items. If these two
+ * times don't match, a global checkpoint completed while we were reading. In this case we
+ * cannot tell for sure if we read one of the trees' metadata before the checkpoint updated it;
+ * if the tree's wall clock time is older than the snapshot's, it might be because that tree was
+ * skipped, but it might also be because there was an update but we read before the update
+ * happened. Therefore, we need to retry.
*
- * A slight complication is that the snapshot and timestamp information is three separate pieces
- * of metadata; we read the time from all three and if they don't agree, it must be because a
- * checkpoint is finishing at this very moment, so we retry.
+ * If the two copies of the snapshot time match, we check the other wall clock times against the
+ * snapshot time. If any of the items are newer, they were written by a currently running
+ * checkpoint that hasn't finished yet, and we need to retry.
*
- * (It is actually slightly more complicated: either timestamp might not be present, in which
+ * (For the timestamps it is slightly easier; either timestamp might not be present, in which
* case both the timestamp and its associated time will read back as zero. We take advantage of
* the knowledge that for both these timestamps the system cannot transition from a state with
* the timestamp set to one where it is not, and therefore once any checkpoint includes either
- * timestamp, every subsequent checkpoint will too. Since the snapshot is written after both
- * timestamps, we read it first. Then for each timestamp, if we read it and find it present, it
- * must be from the same checkpoint as the snapshot or the next. If it isn't present, its
- * absence might technically be associated with the next checkpoint, but if so it cannot have
- * been present in the snapshot's checkpoint either and we are ok to proceed. So we retry if
- * either timestamp's wall time is newer than the snapshot's. Then, to partially crosscheck this
- * logic we assert that the wall time is either the same as the snapshot's or zero.)
+ * timestamp, every subsequent checkpoint will too. Therefore, the timestamps' wall times should
+ * either match the snapshot or be zero; and if they're zero, it doesn't matter if they were
+ * actually zero in a newer, currently running checkpoint, because then they must have always
+ * been zero.)
*
* This scheme relies on the fact we take steps to make sure that the checkpoint wall clock time
* does not run backward, and that successive checkpoints are never given the same wall clock
@@ -482,7 +486,10 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
/* We're opening the history store directly, so don't open it twice. */
hs_dhandlep = NULL;
- /* Test for the internal checkpoint name (WiredTigerCheckpoint). */
+ /*
+ * Test for the internal checkpoint name (WiredTigerCheckpoint). Note: must_resolve is true in a
+ * subset of the cases where is_unnamed_ckpt is true.
+ */
must_resolve = WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len);
is_unnamed_ckpt = cval.len >= strlen(WT_CHECKPOINT) && WT_PREFIX_MATCH(cval.str, WT_CHECKPOINT);
@@ -490,20 +497,28 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
do {
ret = 0;
- if (ckpt_snapshot != NULL)
+ if (!must_resolve)
+ /* Copy the checkpoint name first because we may need it to get the first wall time. */
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint));
+
+ if (ckpt_snapshot != NULL) {
/* We're about to re-fetch this; discard the prior version. No effect the first time. */
__wt_free(session, ckpt_snapshot->snapshot_txns);
- /* Look up the data store checkpoint. */
+ /*
+ * Now, as the first step of the retrieval process, get the wall-clock time of the
+ * snapshot metadata (only). If we need the name, we'll have copied it already.
+ */
+ WT_RET(__session_fetch_checkpoint_snapshot_wall_time(
+ session, is_unnamed_ckpt ? NULL : checkpoint, &first_snapshot_time));
+ }
+
if (must_resolve)
+ /* Look up the most recent data store checkpoint. This fetches the exact name to use. */
WT_RET(__wt_meta_checkpoint_last_name(session, uri, &checkpoint, &ds_order, &ds_time));
- else {
- /* Copy the checkpoint name. */
- WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint));
-
- /* Look up the checkpoint and get its time and order information. */
+ else
+ /* Look up the checkpoint by name and get its time and order information. */
WT_RET(__wt_meta_checkpoint_by_name(session, uri, checkpoint, &ds_order, &ds_time));
- }
/* Look up the history store checkpoint. */
if (hs_dhandlep != NULL) {
@@ -532,10 +547,9 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
/*
* Check if we raced with a running checkpoint.
*
- * If either timestamp metadata time is newer than the snapshot, we read in the middle
- * of that material being updated and we need to retry. If that didn't happen, then
- * check if either the data store or history store checkpoint time is newer than the
- * metadata time. In either case we need to retry.
+ * If the two copies of the snapshot don't match, or if any of the other metadata items'
+ * time is newer than the snapshot, we read in the middle of that material being updated
+ * and we need to retry.
*
* Otherwise we have successfully gotten a matching set, as described above.
*
@@ -548,8 +562,8 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha
* forever.
*/
- if (ds_time > snapshot_time || hs_time > snapshot_time || stable_time > snapshot_time ||
- oldest_time > snapshot_time)
+ if (first_snapshot_time != snapshot_time || ds_time > snapshot_time ||
+ hs_time > snapshot_time || stable_time > snapshot_time || oldest_time > snapshot_time)
ret = __wt_set_return(session, EBUSY);
else {
/* Crosscheck that we didn't somehow get an older timestamp. */