summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger/src/reconcile
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/reconcile')
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_col.c | 12
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_row.c | 11
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 175
3 files changed, 178 insertions, 20 deletions
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index 8000026c58b..03f075e7cb0 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -824,9 +824,15 @@ record_loop:
if (hs_cursor == NULL)
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
- /* From WT_TS_NONE to delete all the history store content of the key. */
- WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id,
- &hs_recno_key, WT_TS_NONE, false, F_ISSET(r, WT_REC_CHECKPOINT_RUNNING)));
+ /*
+ * Delete all the history store content of the key from WT_TS_NONE. This
+ * path will never be taken for a mixed-mode deletion being evicted and with
+ * a checkpoint that started prior to the eviction starting its
+ * reconciliation as previous checks done while selecting an update will
+ * detect that.
+ */
+ WT_ERR(__wt_hs_delete_key_from_ts(
+ session, hs_cursor, btree->id, &hs_recno_key, WT_TS_NONE, false, false));
WT_STAT_CONN_INCR(session, cache_hs_key_truncate_onpage_removal);
WT_STAT_DATA_INCR(session, cache_hs_key_truncate_onpage_removal);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
index 002086f540e..6817ee53899 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_row.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -927,9 +927,14 @@ __wt_rec_row_leaf(
if (hs_cursor == NULL)
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
- /* From WT_TS_NONE to delete all the history store content of the key. */
- WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, tmpkey,
- WT_TS_NONE, false, F_ISSET(r, WT_REC_CHECKPOINT_RUNNING)));
+ /*
+ * Delete all the history store content of the key from WT_TS_NONE. This path
+ * will never be taken for a mixed-mode deletion being evicted and with a
+ * checkpoint that started prior to the eviction starting its reconciliation as
+ * previous checks done while selecting an update will detect that.
+ */
+ WT_ERR(__wt_hs_delete_key_from_ts(
+ session, hs_cursor, btree->id, tmpkey, WT_TS_NONE, false, false));
/* Fail 1% of the time. */
if (F_ISSET(r, WT_REC_EVICT) &&
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index 8caeac903b4..63b2c73c0d5 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -205,9 +205,10 @@ __rec_need_save_upd(
return (true);
/*
- * Save updates for any reconciliation that doesn't involve history store (in-memory database
- * and fixed length column store), except when the selected stop time point or the selected
- * start time point is globally visible.
+ * Don't save updates for any reconciliation that doesn't involve history store (in-memory
+ * database, fixed length column store, metadata, and history store reconciliation itself),
+ * except when the selected stop time point or the selected start time point is not globally
+ * visible for in memory database and fixed length column store.
*/
if (!F_ISSET(r, WT_REC_HS) && !F_ISSET(r, WT_REC_IN_MEMORY) && r->page->type != WT_PAGE_COL_FIX)
return (false);
@@ -232,20 +233,147 @@ __rec_need_save_upd(
* equal to the start time point. While unusual, it is permitted for a single transaction to
* insert and then remove a record. We don't want to generate a warning in that case.
*/
-static inline void
+static inline bool
__timestamp_out_of_order_fix(WT_SESSION_IMPL *session, WT_TIME_WINDOW *select_tw)
{
char time_string[WT_TIME_STRING_SIZE];
- if (select_tw->stop_ts < select_tw->start_ts ||
- (select_tw->stop_ts == select_tw->start_ts && select_tw->stop_txn < select_tw->start_txn)) {
+ /*
+ * When supporting read-uncommitted it was possible for the stop_txn to be less than the
+ * start_txn. This is no longer true, so assert that we don't encounter it.
+ */
+ WT_ASSERT(session, select_tw->stop_txn >= select_tw->start_txn);
+
+ if (select_tw->stop_ts < select_tw->start_ts) {
__wt_verbose(session, WT_VERB_TIMESTAMP,
"Warning: fixing out-of-order timestamps remove earlier than value; time window %s",
__wt_time_window_to_string(select_tw, time_string));
select_tw->durable_start_ts = select_tw->durable_stop_ts;
select_tw->start_ts = select_tw->stop_ts;
+ return (true);
+ }
+ return (false);
+}
+
+/*
+ * __rec_validate_upd_chain --
+ * Check the update chain for conditions that would prevent its insertion into the history
+ * store. Return EBUSY if the update chain cannot be inserted into the history store at this
+ * time.
+ */
+static int
+__rec_validate_upd_chain(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *select_upd,
+ WT_TIME_WINDOW *select_tw, WT_CELL_UNPACK_KV *vpack)
+{
+ WT_UPDATE *prev_upd, *upd;
+
+ /*
+ * There is no selected update to go to disk; as such, we don't need to check the updates
+ * following it.
+ */
+ if (select_upd == NULL)
+ return (0);
+
+ /*
+ * No need to check out of order timestamps for any reconciliation that doesn't involve history
+ * store (in-memory database, fixed length column store, metadata, and history store
+ * reconciliation itself).
+ */
+ if (!F_ISSET(r, WT_REC_HS))
+ return (0);
+
+ /*
+ * If eviction reconciliation starts before checkpoint, it is fine to evict out of order
+ * timestamp updates.
+ */
+ if (!F_ISSET(r, WT_REC_CHECKPOINT_RUNNING))
+ return (0);
+
+ /*
+ * The selected time window may contain information that isn't visible given the selected
+ * update, as such we have to check it separately. This is true when there is a tombstone ahead
+ * of the selected update.
+ */
+ if (select_tw->stop_ts < select_tw->start_ts) {
+ WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_2);
+ return (EBUSY);
+ }
+
+ /*
+ * Rollback to stable may restore older updates from the data store or history store. In this
+ * case, the restored update is older than the onpage value, which is expected.
+ * Reconciliation may restore the onpage value to the update chain. In this case, no need to
+ * check further as the value is the same as the onpage value which means we processed this
+ * update chain in a previous round of reconciliation. If we have a prepared update restored
+ * from the onpage value, no need to check as well because the update chain should only contain
+ * prepared updates from the same transaction.
+ */
+ if (F_ISSET(select_upd,
+ WT_UPDATE_RESTORED_FROM_DS | WT_UPDATE_RESTORED_FROM_HS |
+ WT_UPDATE_PREPARE_RESTORED_FROM_DS))
+ return (0);
+
+ /* Loop forward from update after the selected on-page update. */
+ for (prev_upd = select_upd, upd = select_upd->next; upd != NULL; upd = upd->next) {
+ if (upd->txnid == WT_TXN_ABORTED)
+ continue;
+
+ /* If we have a prepared update, durable timestamp cannot be out of order. */
+ WT_ASSERT(session,
+ prev_upd->prepare_state == WT_PREPARE_INPROGRESS ||
+ prev_upd->start_ts == prev_upd->durable_ts || prev_upd->durable_ts >= upd->durable_ts);
+
+ /* Validate that the updates older than us have older timestamps. */
+ if (prev_upd->start_ts < upd->start_ts) {
+ WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_4);
+ return (EBUSY);
+ }
+
+ /*
+ * Rollback to stable may restore older updates from the data store or history store. In
+ * this case, the restored update is older than the onpage value, which is expected.
+ * Reconciliation may restore the onpage value to the update chain. In this case, no need to
+ * check further as the value is the same as the onpage value. If we have a committed
+ * prepared update restored from the onpage value, no need to check further as well because
+ * the update chain after it should only contain committed prepared updates from the same
+ * transaction.
+ */
+ if (F_ISSET(upd,
+ WT_UPDATE_RESTORED_FROM_DS | WT_UPDATE_RESTORED_FROM_HS |
+ WT_UPDATE_PREPARE_RESTORED_FROM_DS))
+ return (0);
+
+ prev_upd = upd;
+ }
+
+ /*
+ * Check that the on-page time window isn't out-of-order. Don't check against ondisk prepared
+ * update. It is either committed or rolled back if we are here. If we haven't seen an update
+ * with the flag WT_UPDATE_RESTORED_FROM_DS we check against the ondisk value.
+ *
+ * In the case of checkpoint reconciliation the ondisk value could be an update in the middle of
+ * the update chain, but checkpoint won't replace the page image; as such, it will be the
+ * previous reconciliation's ondisk value that we will be comparing against.
+ */
+ if (vpack != NULL && !vpack->tw.prepare) {
+ /* If we have a prepared update, durable timestamp cannot be out of order. */
+ WT_ASSERT(session,
+ prev_upd->prepare_state == WT_PREPARE_INPROGRESS ||
+ prev_upd->start_ts == prev_upd->durable_ts ||
+ prev_upd->durable_ts >= vpack->tw.durable_start_ts);
+ WT_ASSERT(session,
+ prev_upd->prepare_state == WT_PREPARE_INPROGRESS ||
+ prev_upd->start_ts == prev_upd->durable_ts || !WT_TIME_WINDOW_HAS_STOP(&vpack->tw) ||
+ prev_upd->durable_ts >= vpack->tw.durable_stop_ts);
+ if (prev_upd->start_ts < vpack->tw.start_ts ||
+ (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prev_upd->start_ts < vpack->tw.stop_ts)) {
+ WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_1);
+ return (EBUSY);
+ }
}
+
+ return (0);
}
/*
@@ -260,7 +388,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
WT_DECL_RET;
WT_PAGE *page;
WT_TIME_WINDOW *select_tw;
- WT_UPDATE *first_txn_upd, *first_upd, *upd, *last_upd, *tombstone;
+ WT_UPDATE *first_txn_upd, *first_upd, *onpage_upd, *upd, *last_upd, *tombstone;
wt_timestamp_t max_ts;
size_t upd_memsize;
uint64_t max_txn, session_txnid, txnid;
@@ -276,7 +404,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
WT_TIME_WINDOW_INIT(select_tw);
page = r->page;
- first_txn_upd = upd = last_upd = tombstone = NULL;
+ first_txn_upd = onpage_upd = upd = last_upd = tombstone = NULL;
upd_memsize = 0;
max_ts = WT_TS_NONE;
max_txn = WT_TXN_NONE;
@@ -539,8 +667,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
}
}
- __timestamp_out_of_order_fix(session, select_tw);
-
/*
* Track the most recent transaction in the page. We store this in the tree at the end of
* reconciliation in the service of checkpoints, it is used to avoid discarding trees from
@@ -557,6 +683,28 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
if (has_newer_updates)
r->leave_dirty = true;
+ onpage_upd = upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE ?
+ NULL :
+ upd_select->upd;
+
+ /* Check the update chain for conditions that could prevent its eviction. */
+ WT_ERR(__rec_validate_upd_chain(session, r, onpage_upd, select_tw, vpack));
+
+ /*
+ * Fixup any out of order timestamps, assert that checkpoint wasn't running when this round of
+ * reconciliation started.
+ *
+ * Returning EBUSY here is okay as the previous call to validate the update chain wouldn't have
+ * caught the situation where only a tombstone is selected.
+ */
+ if (__timestamp_out_of_order_fix(session, select_tw) && F_ISSET(r, WT_REC_HS) &&
+ F_ISSET(r, WT_REC_CHECKPOINT_RUNNING)) {
+ /* Catch this case in diagnostic builds. */
+ WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_3);
+ WT_ASSERT(session, false);
+ WT_ERR(EBUSY);
+ }
+
/*
* The update doesn't have any further updates that need to be written to the history store,
* skip saving the update as saving the update will cause reconciliation to think there is work
@@ -573,10 +721,9 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
supd_restore = F_ISSET(r, WT_REC_EVICT) &&
(has_newer_updates || F_ISSET(S2C(session), WT_CONN_IN_MEMORY) ||
page->type == WT_PAGE_COL_FIX);
- WT_ERR(__rec_update_save(session, r, ins, ripcip,
- upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE ? NULL :
- upd_select->upd,
- supd_restore, upd_memsize));
+
+ WT_ERR(__rec_update_save(session, r, ins, ripcip, onpage_upd, supd_restore, upd_memsize));
+
/*
* Mark the selected update (and potentially the tombstone preceding it) as being destined
* for the data store. Subsequent reconciliations should know that they can select this