diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/session/session_dhandle.c')
-rw-r--r-- | src/third_party/wiredtiger/src/session/session_dhandle.c | 116 |
1 files changed, 65 insertions, 51 deletions
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c index 106d4aebb4a..c8f39c28273 100644 --- a/src/third_party/wiredtiger/src/session/session_dhandle.c +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -292,13 +292,16 @@ __session_fetch_checkpoint_meta(WT_SESSION_IMPL *session, const char *ckpt_name, WT_CKPT_SNAPSHOT *info_ret, uint64_t *snapshot_time_ret, uint64_t *stable_time_ret, uint64_t *oldest_time_ret) { - WT_DECL_RET; - uint64_t *snapshot_txns; + /* Get the timestamps. */ + WT_RET(__wt_meta_read_checkpoint_timestamp( + session, ckpt_name, &info_ret->stable_ts, stable_time_ret)); + WT_RET( + __wt_meta_read_checkpoint_oldest(session, ckpt_name, &info_ret->oldest_ts, oldest_time_ret)); - /* Get the snapshot first; it's written last as the checkpoint completes. */ + /* Get the snapshot. */ WT_RET(__wt_meta_read_checkpoint_snapshot(session, ckpt_name, &info_ret->snapshot_write_gen, - &info_ret->snapshot_min, &info_ret->snapshot_max, &snapshot_txns, &info_ret->snapshot_count, - snapshot_time_ret)); + &info_ret->snapshot_min, &info_ret->snapshot_max, &info_ret->snapshot_txns, + &info_ret->snapshot_count, snapshot_time_ret)); /* * If we successfully read a null snapshot, set the min and max to WT_TXN_MAX so everything is @@ -309,22 +312,22 @@ __session_fetch_checkpoint_meta(WT_SESSION_IMPL *session, const char *ckpt_name, */ if (info_ret->snapshot_min == WT_TXN_NONE && info_ret->snapshot_max == WT_TXN_NONE) { info_ret->snapshot_min = info_ret->snapshot_max = WT_TXN_MAX; - WT_ASSERT(session, snapshot_txns == NULL && info_ret->snapshot_count == 0); + WT_ASSERT(session, info_ret->snapshot_txns == NULL && info_ret->snapshot_count == 0); } - /* Get the timestamps. */ - WT_ERR(__wt_meta_read_checkpoint_timestamp( - session, ckpt_name, &info_ret->stable_ts, stable_time_ret)); - WT_ERR( - __wt_meta_read_checkpoint_oldest(session, ckpt_name, &info_ret->oldest_ts, oldest_time_ret)); - - /* Wait until we succeed to assign this, to be sure it can't be cleaned up twice. */ - info_ret->snapshot_txns = snapshot_txns; return (0); +} -err: - __wt_free(session, snapshot_txns); - return (ret); +/* + * __session_fetch_checkpoint_snapshot_wall_time -- + * Like __session_fetch_checkpoint_meta, but retrieves just the wall clock time of the snapshot. + */ +static int +__session_fetch_checkpoint_snapshot_wall_time( + WT_SESSION_IMPL *session, const char *ckpt_name, uint64_t *walltime) +{ + return (__wt_meta_read_checkpoint_snapshot( + session, ckpt_name, NULL, NULL, NULL, NULL, NULL, walltime)); } /* @@ -365,12 +368,12 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha { WT_CONFIG_ITEM cval; WT_DECL_RET; - uint64_t ds_time, hs_time, oldest_time, snapshot_time, stable_time; + uint64_t ds_time, first_snapshot_time, hs_time, oldest_time, snapshot_time, stable_time; int64_t ds_order, hs_order; const char *checkpoint, *hs_checkpoint; bool is_unnamed_ckpt, must_resolve; - ds_time = hs_time = oldest_time = snapshot_time = stable_time = 0; + ds_time = first_snapshot_time = hs_time = oldest_time = snapshot_time = stable_time = 0; ds_order = hs_order = 0; checkpoint = NULL; hs_checkpoint = NULL; @@ -434,28 +437,29 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha * because unnamed checkpoints are never replaced, but for named checkpoints it's possible for * the open to race with regeneration of the checkpoint.) * - * Because the snapshot and timestamp information is always written by every checkpoint, and is - * written last, it always gives the wall clock time of the most recent completed global - * checkpoint. If either the data store or history store checkpoint has a newer wall clock time, - * it must be from a currently running checkpoint and does not match the snapshot; therefore we - * must retry or fail. If both have the same or an older wall clock time, they are from the same - * or an older checkpoint and can be presumed to match. + * Because the snapshot information is always written by every checkpoint, and is written last, + * we use its wall clock time as the reference. This is always the wall clock time of the most + * recent completed global checkpoint of the same name, or the most recent completed unnamed + * checkpoint, as appropriate. We read this time twice, once at the very beginning and again + * along with the snapshot information itself at the end after the other items. If these two + * times don't match, a global checkpoint completed while we were reading. In this case we + * cannot tell for sure if we read one of the trees' metadata before the checkpoint updated it; + * if the tree's wall clock time is older than the snapshot's, it might be because that tree was + * skipped, but it might also be because there was an update but we read before the update + * happened. Therefore, we need to retry. * - * A slight complication is that the snapshot and timestamp information is three separate pieces - * of metadata; we read the time from all three and if they don't agree, it must be because a - * checkpoint is finishing at this very moment, so we retry. + * If the two copies of the snapshot time match, we check the other wall clock times against the + * snapshot time. If any of the items are newer, they were written by a currently running + * checkpoint that hasn't finished yet, and we need to retry. * - * (It is actually slightly more complicated: either timestamp might not be present, in which + * (For the timestamps it is slightly easier; either timestamp might not be present, in which * case both the timestamp and its associated time will read back as zero. We take advantage of * the knowledge that for both these timestamps the system cannot transition from a state with * the timestamp set to one where it is not, and therefore once any checkpoint includes either - * timestamp, every subsequent checkpoint will too. Since the snapshot is written after both - * timestamps, we read it first. Then for each timestamp, if we read it and find it present, it - * must be from the same checkpoint as the snapshot or the next. If it isn't present, its - * absence might technically be associated with the next checkpoint, but if so it cannot have - * been present in the snapshot's checkpoint either and we are ok to proceed. So we retry if - * either timestamp's wall time is newer than the snapshot's. Then, to partially crosscheck this - * logic we assert that the wall time is either the same as the snapshot's or zero.) + * timestamp, every subsequent checkpoint will too. Therefore, the timestamps' wall times should + * either match the snapshot or be zero; and if they're zero, it doesn't matter if they were + * actually zero in a newer, currently running checkpoint, because then they must have always + * been zero.) * * This scheme relies on the fact we take steps to make sure that the checkpoint wall clock time * does not run backward, and that successive checkpoints are never given the same wall clock @@ -482,7 +486,10 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha /* We're opening the history store directly, so don't open it twice. */ hs_dhandlep = NULL; - /* Test for the internal checkpoint name (WiredTigerCheckpoint). */ + /* + * Test for the internal checkpoint name (WiredTigerCheckpoint). Note: must_resolve is true in a + * subset of the cases where is_unnamed_ckpt is true. + */ must_resolve = WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len); is_unnamed_ckpt = cval.len >= strlen(WT_CHECKPOINT) && WT_PREFIX_MATCH(cval.str, WT_CHECKPOINT); @@ -490,20 +497,28 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha do { ret = 0; - if (ckpt_snapshot != NULL) + if (!must_resolve) + /* Copy the checkpoint name first because we may need it to get the first wall time. */ + WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint)); + + if (ckpt_snapshot != NULL) { /* We're about to re-fetch this; discard the prior version. No effect the first time. */ __wt_free(session, ckpt_snapshot->snapshot_txns); - /* Look up the data store checkpoint. */ + /* + * Now, as the first step of the retrieval process, get the wall-clock time of the + * snapshot metadata (only). If we need the name, we'll have copied it already. + */ + WT_RET(__session_fetch_checkpoint_snapshot_wall_time( + session, is_unnamed_ckpt ? NULL : checkpoint, &first_snapshot_time)); + } + if (must_resolve) + /* Look up the most recent data store checkpoint. This fetches the exact name to use. */ WT_RET(__wt_meta_checkpoint_last_name(session, uri, &checkpoint, &ds_order, &ds_time)); - else { - /* Copy the checkpoint name. */ - WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint)); - - /* Look up the checkpoint and get its time and order information. */ + else + /* Look up the checkpoint by name and get its time and order information. */ WT_RET(__wt_meta_checkpoint_by_name(session, uri, checkpoint, &ds_order, &ds_time)); - } /* Look up the history store checkpoint. */ if (hs_dhandlep != NULL) { @@ -532,10 +547,9 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha /* * Check if we raced with a running checkpoint. * - * If either timestamp metadata time is newer than the snapshot, we read in the middle - * of that material being updated and we need to retry. If that didn't happen, then - * check if either the data store or history store checkpoint time is newer than the - * metadata time. In either case we need to retry. + * If the two copies of the snapshot don't match, or if any of the other metadata items' + * time is newer than the snapshot, we read in the middle of that material being updated + * and we need to retry. * * Otherwise we have successfully gotten a matching set, as described above. * @@ -548,8 +562,8 @@ __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const cha * forever. */ - if (ds_time > snapshot_time || hs_time > snapshot_time || stable_time > snapshot_time || - oldest_time > snapshot_time) + if (first_snapshot_time != snapshot_time || ds_time > snapshot_time || + hs_time > snapshot_time || stable_time > snapshot_time || oldest_time > snapshot_time) ret = __wt_set_return(session, EBUSY); else { /* Crosscheck that we didn't somehow get an older timestamp. */ |