diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/reconcile/rec_visibility.c')
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 129 |
1 files changed, 73 insertions, 56 deletions
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 36e2de9ccc5..aa44301a21d 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -76,11 +76,18 @@ __rec_append_orig_value( return (0); /* + * Prepared updates should already be in the update list, add the original update to the + * list only when the prepared update is a tombstone. + */ + if (F_ISSET(unpack, WT_CELL_UNPACK_PREPARE) && upd->type != WT_UPDATE_TOMBSTONE) + return (0); + + /* * Done if the on page value already appears on the update list. We can't do the same check * for stop time pair because we may still need to append the onpage value if only the * tombstone is on the update chain. */ - if (unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid && + if (unpack->tw.start_ts == upd->start_ts && unpack->tw.start_txn == upd->txnid && upd->type != WT_UPDATE_TOMBSTONE) return (0); @@ -104,8 +111,8 @@ __rec_append_orig_value( } /* Done if the stop time pair of the onpage cell is globally visible. */ - if ((unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) && - __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)) + if ((unpack->tw.stop_ts != WT_TS_MAX || unpack->tw.stop_txn != WT_TXN_MAX) && + __wt_txn_visible_all(session, unpack->tw.stop_txn, unpack->tw.stop_ts)) return (0); /* We need the original on-page value for some reader: get a copy. */ @@ -113,9 +120,9 @@ __rec_append_orig_value( WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); WT_ERR(__wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &append, &size)); total_size += size; - append->txnid = unpack->start_txn; - append->start_ts = unpack->start_ts; - append->durable_ts = unpack->durable_start_ts; + append->txnid = unpack->tw.start_txn; + append->start_ts = unpack->tw.start_ts; + append->durable_ts = unpack->tw.durable_start_ts; /* * Additionally, we need to append a tombstone before the onpage value we're about to append to @@ -123,20 +130,26 @@ __rec_append_orig_value( * delete a value respectively at timestamp 0 and 10, and later insert it again at 20. We need * the tombstone to tell us there is no value between 10 and 20. */ - if (unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) { + if (unpack->tw.stop_ts != WT_TS_MAX || unpack->tw.stop_txn != WT_TXN_MAX) { /* No need to append the tombstone if it is already in the update chain. */ if (oldest_upd->type != WT_UPDATE_TOMBSTONE) { WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, &size)); total_size += size; - tombstone->txnid = unpack->stop_txn; - tombstone->start_ts = unpack->stop_ts; - tombstone->durable_ts = unpack->durable_stop_ts; + tombstone->txnid = unpack->tw.stop_txn; + tombstone->start_ts = unpack->tw.stop_ts; + tombstone->durable_ts = unpack->tw.durable_stop_ts; tombstone->next = append; append = tombstone; } else - WT_ASSERT(session, - unpack->stop_ts == oldest_upd->start_ts && unpack->stop_txn == oldest_upd->txnid); + /* + * Once the prepared update is resolved, the in-memory update and on-disk written copy + * doesn't have same timestamp due to replacing of prepare timestamp with commit and + * durable timestamps. Don't compare them when the on-disk version is a prepare. + */ + WT_ASSERT(session, F_ISSET(unpack, WT_CELL_UNPACK_PREPARE) || + (unpack->tw.stop_ts == oldest_upd->start_ts && + unpack->tw.stop_txn == oldest_upd->txnid)); } /* Append the new entry into the update list. */ @@ -161,6 +174,9 @@ static inline bool __rec_need_save_upd( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates) { + if (upd_select->tw.prepare) + return (true); + if (F_ISSET(r, WT_REC_EVICT) && has_newer_updates) return (true); @@ -176,8 +192,8 @@ __rec_need_save_upd( if (F_ISSET(r, WT_REC_CHECKPOINT) && upd_select->upd == NULL) return (false); - return (!__wt_txn_visible_all(session, upd_select->stop_txn, upd_select->stop_ts) && - !__wt_txn_visible_all(session, upd_select->start_txn, upd_select->start_ts)); + return (!__wt_txn_visible_all(session, upd_select->tw.stop_txn, upd_select->tw.stop_ts) && + !__wt_txn_visible_all(session, upd_select->tw.start_txn, upd_select->tw.start_ts)); } /* @@ -191,10 +207,12 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v WT_DECL_ITEM(tmp); WT_DECL_RET; WT_PAGE *page; + WT_TIME_WINDOW *select_tw; WT_UPDATE *first_txn_upd, *first_upd, *upd, *last_upd, *tombstone; wt_timestamp_t max_ts; size_t upd_memsize; uint64_t max_txn, txnid; + char time_string[WT_TIME_STRING_SIZE]; bool has_newer_updates, is_hs_page, supd_restore, upd_saved; /* @@ -202,13 +220,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * both must be initialized. */ upd_select->upd = NULL; - upd_select->start_durable_ts = WT_TS_NONE; - upd_select->start_ts = WT_TS_NONE; - upd_select->start_txn = WT_TXN_NONE; - upd_select->stop_durable_ts = WT_TS_NONE; - upd_select->stop_ts = WT_TS_MAX; - upd_select->stop_txn = WT_TXN_MAX; - upd_select->prepare = false; + select_tw = &upd_select->tw; + __wt_time_window_init(select_tw); page = r->page; first_txn_upd = upd = last_upd = tombstone = NULL; @@ -264,20 +277,24 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v continue; } + /* Ignore prepared updates if it is not eviction. */ if (upd->prepare_state == WT_PREPARE_LOCKED || upd->prepare_state == WT_PREPARE_INPROGRESS) { - WT_ASSERT(session, upd_select->upd == NULL); - has_newer_updates = true; - if (upd->start_ts > max_ts) - max_ts = upd->start_ts; - - /* - * Track the oldest update not on the page, used to decide whether reads can use the - * page image, hence using the start rather than the durable timestamp. - */ - if (upd->start_ts < r->min_skipped_ts) - r->min_skipped_ts = upd->start_ts; - continue; + WT_ASSERT(session, upd_select->upd == NULL || upd_select->upd->txnid == upd->txnid); + if (!F_ISSET(r, WT_REC_EVICT)) { + has_newer_updates = true; + if (upd->start_ts > max_ts) + max_ts = upd->start_ts; + + /* + * Track the oldest update not on the page, used to decide whether reads can use the + * page image, hence using the start rather than the durable timestamp. + */ + if (upd->start_ts < r->min_skipped_ts) + r->min_skipped_ts = upd->start_ts; + continue; + } else + WT_ASSERT(session, upd->prepare_state == WT_PREPARE_INPROGRESS); } /* Track the first update with non-zero timestamp. */ @@ -322,7 +339,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v */ if (has_newer_updates && F_ISSET(r, WT_REC_CLEAN_AFTER_REC | WT_REC_VISIBILITY_ERR)) { if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) - WT_PANIC_RET(session, EINVAL, "reconciliation error, update not visible"); + WT_RET_PANIC(session, EINVAL, "reconciliation error, update not visible"); return (__wt_set_return(session, EBUSY)); } @@ -341,6 +358,14 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v */ if (upd != NULL) { /* + * Mark the prepare flag if the selected update is an uncommitted prepare. As tombstone + * updates are never returned to write, set this flag before we move into the previous + * update to write. + */ + if (upd->prepare_state == WT_PREPARE_INPROGRESS) + select_tw->prepare = 1; + + /* * If the newest is a tombstone then select the update before it and set the end of the * visibility window to its time pair as appropriate to indicate that we should return "not * found" for reads after this point. @@ -349,9 +374,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * indicate that the value is visible to any timestamp/transaction id ahead of it. */ if (upd->type == WT_UPDATE_TOMBSTONE) { - upd_select->stop_ts = upd->start_ts; - upd_select->stop_txn = upd->txnid; - upd_select->stop_durable_ts = upd->durable_ts; + __wt_time_window_set_stop(select_tw, upd); tombstone = upd; /* Find the update this tombstone applies to. */ @@ -366,10 +389,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v } if (upd != NULL) { /* The beginning of the validity window is the selected update's time pair. */ - upd_select->start_ts = upd->start_ts; - upd_select->start_durable_ts = upd->durable_ts; - upd_select->start_txn = upd->txnid; - } else if (upd_select->stop_ts != WT_TS_NONE || upd_select->stop_txn != WT_TXN_NONE) { + __wt_time_window_set_start(select_tw, upd); + } else if (select_tw->stop_ts != WT_TS_NONE || select_tw->stop_txn != WT_TXN_NONE) { /* If we only have a tombstone in the update list, we must have an ondisk value. */ WT_ASSERT(session, vpack != NULL && tombstone != NULL); /* @@ -384,13 +405,11 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v */ WT_ERR(__rec_append_orig_value(session, page, tombstone, vpack)); WT_ASSERT(session, last_upd->next != NULL && - last_upd->next->txnid == vpack->start_txn && - last_upd->next->start_ts == vpack->start_ts && + last_upd->next->txnid == vpack->tw.start_txn && + last_upd->next->start_ts == vpack->tw.start_ts && last_upd->next->type == WT_UPDATE_STANDARD && last_upd->next->next == NULL); upd_select->upd = last_upd->next; - upd_select->start_ts = last_upd->next->start_ts; - upd_select->start_durable_ts = last_upd->next->durable_ts; - upd_select->start_txn = last_upd->next->txnid; + __wt_time_window_set_start(select_tw, last_upd->next); } } @@ -405,17 +424,15 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * time pair. While unusual, it is permitted for a single transaction to insert and then remove * a record. We don't want to generate a warning in that case. */ - if (upd_select->stop_ts < upd_select->start_ts || - (upd_select->stop_ts == upd_select->start_ts && - upd_select->stop_txn < upd_select->start_txn)) { - char ts_string[2][WT_TS_INT_STRING_SIZE]; + if (select_tw->stop_ts < select_tw->start_ts || + (select_tw->stop_ts == select_tw->start_ts && select_tw->stop_txn < select_tw->start_txn)) { __wt_verbose(session, WT_VERB_TIMESTAMP, - "Warning: fixing out-of-order timestamps remove at %s earlier than value at %s", - __wt_timestamp_to_string(upd_select->stop_ts, ts_string[0]), - __wt_timestamp_to_string(upd_select->start_ts, ts_string[1])); - upd_select->start_durable_ts = upd_select->stop_durable_ts; - upd_select->start_ts = upd_select->stop_ts; - upd_select->start_txn = upd_select->stop_txn; + "Warning: fixing out-of-order timestamps remove earlier than value; time window %s", + __wt_time_window_to_string(select_tw, time_string)); + + select_tw->durable_start_ts = select_tw->durable_stop_ts; + select_tw->start_ts = select_tw->stop_ts; + select_tw->start_txn = select_tw->stop_txn; } /* |