summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/reconcile/rec_visibility.c')
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_visibility.c129
1 files changed, 73 insertions, 56 deletions
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index 36e2de9ccc5..aa44301a21d 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -76,11 +76,18 @@ __rec_append_orig_value(
return (0);
/*
+ * Prepared updates should already be in the update list, add the original update to the
+ * list only when the prepared update is a tombstone.
+ */
+ if (F_ISSET(unpack, WT_CELL_UNPACK_PREPARE) && upd->type != WT_UPDATE_TOMBSTONE)
+ return (0);
+
+ /*
* Done if the on page value already appears on the update list. We can't do the same check
* for stop time pair because we may still need to append the onpage value if only the
* tombstone is on the update chain.
*/
- if (unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid &&
+ if (unpack->tw.start_ts == upd->start_ts && unpack->tw.start_txn == upd->txnid &&
upd->type != WT_UPDATE_TOMBSTONE)
return (0);
@@ -104,8 +111,8 @@ __rec_append_orig_value(
}
/* Done if the stop time pair of the onpage cell is globally visible. */
- if ((unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) &&
- __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts))
+ if ((unpack->tw.stop_ts != WT_TS_MAX || unpack->tw.stop_txn != WT_TXN_MAX) &&
+ __wt_txn_visible_all(session, unpack->tw.stop_txn, unpack->tw.stop_ts))
return (0);
/* We need the original on-page value for some reader: get a copy. */
@@ -113,9 +120,9 @@ __rec_append_orig_value(
WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
WT_ERR(__wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &append, &size));
total_size += size;
- append->txnid = unpack->start_txn;
- append->start_ts = unpack->start_ts;
- append->durable_ts = unpack->durable_start_ts;
+ append->txnid = unpack->tw.start_txn;
+ append->start_ts = unpack->tw.start_ts;
+ append->durable_ts = unpack->tw.durable_start_ts;
/*
* Additionally, we need to append a tombstone before the onpage value we're about to append to
@@ -123,20 +130,26 @@ __rec_append_orig_value(
* delete a value respectively at timestamp 0 and 10, and later insert it again at 20. We need
* the tombstone to tell us there is no value between 10 and 20.
*/
- if (unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) {
+ if (unpack->tw.stop_ts != WT_TS_MAX || unpack->tw.stop_txn != WT_TXN_MAX) {
/* No need to append the tombstone if it is already in the update chain. */
if (oldest_upd->type != WT_UPDATE_TOMBSTONE) {
WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, &size));
total_size += size;
- tombstone->txnid = unpack->stop_txn;
- tombstone->start_ts = unpack->stop_ts;
- tombstone->durable_ts = unpack->durable_stop_ts;
+ tombstone->txnid = unpack->tw.stop_txn;
+ tombstone->start_ts = unpack->tw.stop_ts;
+ tombstone->durable_ts = unpack->tw.durable_stop_ts;
tombstone->next = append;
append = tombstone;
} else
- WT_ASSERT(session,
- unpack->stop_ts == oldest_upd->start_ts && unpack->stop_txn == oldest_upd->txnid);
+ /*
+ * Once the prepared update is resolved, the in-memory update and on-disk written copy
+ * doesn't have same timestamp due to replacing of prepare timestamp with commit and
+ * durable timestamps. Don't compare them when the on-disk version is a prepare.
+ */
+ WT_ASSERT(session, F_ISSET(unpack, WT_CELL_UNPACK_PREPARE) ||
+ (unpack->tw.stop_ts == oldest_upd->start_ts &&
+ unpack->tw.stop_txn == oldest_upd->txnid));
}
/* Append the new entry into the update list. */
@@ -161,6 +174,9 @@ static inline bool
__rec_need_save_upd(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates)
{
+ if (upd_select->tw.prepare)
+ return (true);
+
if (F_ISSET(r, WT_REC_EVICT) && has_newer_updates)
return (true);
@@ -176,8 +192,8 @@ __rec_need_save_upd(
if (F_ISSET(r, WT_REC_CHECKPOINT) && upd_select->upd == NULL)
return (false);
- return (!__wt_txn_visible_all(session, upd_select->stop_txn, upd_select->stop_ts) &&
- !__wt_txn_visible_all(session, upd_select->start_txn, upd_select->start_ts));
+ return (!__wt_txn_visible_all(session, upd_select->tw.stop_txn, upd_select->tw.stop_ts) &&
+ !__wt_txn_visible_all(session, upd_select->tw.start_txn, upd_select->tw.start_ts));
}
/*
@@ -191,10 +207,12 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
WT_DECL_ITEM(tmp);
WT_DECL_RET;
WT_PAGE *page;
+ WT_TIME_WINDOW *select_tw;
WT_UPDATE *first_txn_upd, *first_upd, *upd, *last_upd, *tombstone;
wt_timestamp_t max_ts;
size_t upd_memsize;
uint64_t max_txn, txnid;
+ char time_string[WT_TIME_STRING_SIZE];
bool has_newer_updates, is_hs_page, supd_restore, upd_saved;
/*
@@ -202,13 +220,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* both must be initialized.
*/
upd_select->upd = NULL;
- upd_select->start_durable_ts = WT_TS_NONE;
- upd_select->start_ts = WT_TS_NONE;
- upd_select->start_txn = WT_TXN_NONE;
- upd_select->stop_durable_ts = WT_TS_NONE;
- upd_select->stop_ts = WT_TS_MAX;
- upd_select->stop_txn = WT_TXN_MAX;
- upd_select->prepare = false;
+ select_tw = &upd_select->tw;
+ __wt_time_window_init(select_tw);
page = r->page;
first_txn_upd = upd = last_upd = tombstone = NULL;
@@ -264,20 +277,24 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
continue;
}
+ /* Ignore prepared updates if it is not eviction. */
if (upd->prepare_state == WT_PREPARE_LOCKED ||
upd->prepare_state == WT_PREPARE_INPROGRESS) {
- WT_ASSERT(session, upd_select->upd == NULL);
- has_newer_updates = true;
- if (upd->start_ts > max_ts)
- max_ts = upd->start_ts;
-
- /*
- * Track the oldest update not on the page, used to decide whether reads can use the
- * page image, hence using the start rather than the durable timestamp.
- */
- if (upd->start_ts < r->min_skipped_ts)
- r->min_skipped_ts = upd->start_ts;
- continue;
+ WT_ASSERT(session, upd_select->upd == NULL || upd_select->upd->txnid == upd->txnid);
+ if (!F_ISSET(r, WT_REC_EVICT)) {
+ has_newer_updates = true;
+ if (upd->start_ts > max_ts)
+ max_ts = upd->start_ts;
+
+ /*
+ * Track the oldest update not on the page, used to decide whether reads can use the
+ * page image, hence using the start rather than the durable timestamp.
+ */
+ if (upd->start_ts < r->min_skipped_ts)
+ r->min_skipped_ts = upd->start_ts;
+ continue;
+ } else
+ WT_ASSERT(session, upd->prepare_state == WT_PREPARE_INPROGRESS);
}
/* Track the first update with non-zero timestamp. */
@@ -322,7 +339,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
*/
if (has_newer_updates && F_ISSET(r, WT_REC_CLEAN_AFTER_REC | WT_REC_VISIBILITY_ERR)) {
if (F_ISSET(r, WT_REC_VISIBILITY_ERR))
- WT_PANIC_RET(session, EINVAL, "reconciliation error, update not visible");
+ WT_RET_PANIC(session, EINVAL, "reconciliation error, update not visible");
return (__wt_set_return(session, EBUSY));
}
@@ -341,6 +358,14 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
*/
if (upd != NULL) {
/*
+ * Mark the prepare flag if the selected update is an uncommitted prepare. As tombstone
+ * updates are never returned to write, set this flag before we move into the previous
+ * update to write.
+ */
+ if (upd->prepare_state == WT_PREPARE_INPROGRESS)
+ select_tw->prepare = 1;
+
+ /*
* If the newest is a tombstone then select the update before it and set the end of the
* visibility window to its time pair as appropriate to indicate that we should return "not
* found" for reads after this point.
@@ -349,9 +374,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* indicate that the value is visible to any timestamp/transaction id ahead of it.
*/
if (upd->type == WT_UPDATE_TOMBSTONE) {
- upd_select->stop_ts = upd->start_ts;
- upd_select->stop_txn = upd->txnid;
- upd_select->stop_durable_ts = upd->durable_ts;
+ __wt_time_window_set_stop(select_tw, upd);
tombstone = upd;
/* Find the update this tombstone applies to. */
@@ -366,10 +389,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
}
if (upd != NULL) {
/* The beginning of the validity window is the selected update's time pair. */
- upd_select->start_ts = upd->start_ts;
- upd_select->start_durable_ts = upd->durable_ts;
- upd_select->start_txn = upd->txnid;
- } else if (upd_select->stop_ts != WT_TS_NONE || upd_select->stop_txn != WT_TXN_NONE) {
+ __wt_time_window_set_start(select_tw, upd);
+ } else if (select_tw->stop_ts != WT_TS_NONE || select_tw->stop_txn != WT_TXN_NONE) {
/* If we only have a tombstone in the update list, we must have an ondisk value. */
WT_ASSERT(session, vpack != NULL && tombstone != NULL);
/*
@@ -384,13 +405,11 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
*/
WT_ERR(__rec_append_orig_value(session, page, tombstone, vpack));
WT_ASSERT(session, last_upd->next != NULL &&
- last_upd->next->txnid == vpack->start_txn &&
- last_upd->next->start_ts == vpack->start_ts &&
+ last_upd->next->txnid == vpack->tw.start_txn &&
+ last_upd->next->start_ts == vpack->tw.start_ts &&
last_upd->next->type == WT_UPDATE_STANDARD && last_upd->next->next == NULL);
upd_select->upd = last_upd->next;
- upd_select->start_ts = last_upd->next->start_ts;
- upd_select->start_durable_ts = last_upd->next->durable_ts;
- upd_select->start_txn = last_upd->next->txnid;
+ __wt_time_window_set_start(select_tw, last_upd->next);
}
}
@@ -405,17 +424,15 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* time pair. While unusual, it is permitted for a single transaction to insert and then remove
* a record. We don't want to generate a warning in that case.
*/
- if (upd_select->stop_ts < upd_select->start_ts ||
- (upd_select->stop_ts == upd_select->start_ts &&
- upd_select->stop_txn < upd_select->start_txn)) {
- char ts_string[2][WT_TS_INT_STRING_SIZE];
+ if (select_tw->stop_ts < select_tw->start_ts ||
+ (select_tw->stop_ts == select_tw->start_ts && select_tw->stop_txn < select_tw->start_txn)) {
__wt_verbose(session, WT_VERB_TIMESTAMP,
- "Warning: fixing out-of-order timestamps remove at %s earlier than value at %s",
- __wt_timestamp_to_string(upd_select->stop_ts, ts_string[0]),
- __wt_timestamp_to_string(upd_select->start_ts, ts_string[1]));
- upd_select->start_durable_ts = upd_select->stop_durable_ts;
- upd_select->start_ts = upd_select->stop_ts;
- upd_select->start_txn = upd_select->stop_txn;
+ "Warning: fixing out-of-order timestamps remove earlier than value; time window %s",
+ __wt_time_window_to_string(select_tw, time_string));
+
+ select_tw->durable_start_ts = select_tw->durable_stop_ts;
+ select_tw->start_ts = select_tw->stop_ts;
+ select_tw->start_txn = select_tw->stop_txn;
}
/*