diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/reconcile')
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_col.c | 12 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_row.c | 11 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 175 |
3 files changed, 178 insertions, 20 deletions
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index 8000026c58b..03f075e7cb0 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -824,9 +824,15 @@ record_loop: if (hs_cursor == NULL) WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); - /* From WT_TS_NONE to delete all the history store content of the key. */ - WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, - &hs_recno_key, WT_TS_NONE, false, F_ISSET(r, WT_REC_CHECKPOINT_RUNNING))); + /* + * From WT_TS_NONE delete all the history store content of the key. This + * path will never be taken for a mixed-mode deletion being evicted and with + * a checkpoint that started prior to the eviction starting its + * reconciliation as previous checks done while selecting an update will + * detect that. + */ + WT_ERR(__wt_hs_delete_key_from_ts( + session, hs_cursor, btree->id, &hs_recno_key, WT_TS_NONE, false, false)); WT_STAT_CONN_INCR(session, cache_hs_key_truncate_onpage_removal); WT_STAT_DATA_INCR(session, cache_hs_key_truncate_onpage_removal); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index 002086f540e..6817ee53899 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -927,9 +927,14 @@ __wt_rec_row_leaf( if (hs_cursor == NULL) WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); - /* From WT_TS_NONE to delete all the history store content of the key. */ - WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, tmpkey, - WT_TS_NONE, false, F_ISSET(r, WT_REC_CHECKPOINT_RUNNING))); + /* + * From WT_TS_NONE delete all the history store content of the key. 
This path + * will never be taken for a mixed-mode deletion being evicted and with a + * checkpoint that started prior to the eviction starting its reconciliation as + * previous checks done while selecting an update will detect that. + */ + WT_ERR(__wt_hs_delete_key_from_ts( + session, hs_cursor, btree->id, tmpkey, WT_TS_NONE, false, false)); /* Fail 1% of the time. */ if (F_ISSET(r, WT_REC_EVICT) && diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 8caeac903b4..63b2c73c0d5 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -205,9 +205,10 @@ __rec_need_save_upd( return (true); /* - * Save updates for any reconciliation that doesn't involve history store (in-memory database - * and fixed length column store), except when the selected stop time point or the selected - * start time point is globally visible. + * Don't save updates for any reconciliation that doesn't involve history store (in-memory + * database, fixed length column store, metadata, and history store reconciliation itself), + * except when the selected stop time point or the selected start time point is not globally + * visible for in memory database and fixed length column store. */ if (!F_ISSET(r, WT_REC_HS) && !F_ISSET(r, WT_REC_IN_MEMORY) && r->page->type != WT_PAGE_COL_FIX) return (false); @@ -232,20 +233,147 @@ __rec_need_save_upd( * equal to the start time point. While unusual, it is permitted for a single transaction to * insert and then remove a record. We don't want to generate a warning in that case. 
*/ -static inline void +static inline bool __timestamp_out_of_order_fix(WT_SESSION_IMPL *session, WT_TIME_WINDOW *select_tw) { char time_string[WT_TIME_STRING_SIZE]; - if (select_tw->stop_ts < select_tw->start_ts || - (select_tw->stop_ts == select_tw->start_ts && select_tw->stop_txn < select_tw->start_txn)) { + /* + * When supporting read-uncommitted it was possible for the stop_txn to be less than the + * start_txn, this is no longer true so assert that we don't encounter it. + */ + WT_ASSERT(session, select_tw->stop_txn >= select_tw->start_txn); + + if (select_tw->stop_ts < select_tw->start_ts) { __wt_verbose(session, WT_VERB_TIMESTAMP, "Warning: fixing out-of-order timestamps remove earlier than value; time window %s", __wt_time_window_to_string(select_tw, time_string)); select_tw->durable_start_ts = select_tw->durable_stop_ts; select_tw->start_ts = select_tw->stop_ts; + return (true); + } + return (false); +} + +/* + * __rec_validate_upd_chain -- + * Check the update chain for conditions that would prevent its insertion into the history + * store. Return EBUSY if the update chain cannot be inserted into the history store at this + * time. + */ +static int +__rec_validate_upd_chain(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *select_upd, + WT_TIME_WINDOW *select_tw, WT_CELL_UNPACK_KV *vpack) +{ + WT_UPDATE *prev_upd, *upd; + + /* + * There is no selected update to go to disk as such we don't need to check the updates + * following it. + */ + if (select_upd == NULL) + return (0); + + /* + * No need to check out of order timestamps for any reconciliation that doesn't involve history + * store (in-memory database, fixed length column store, metadata, and history store + * reconciliation itself). + */ + if (!F_ISSET(r, WT_REC_HS)) + return (0); + + /* + * If eviction reconciliation starts before checkpoint, it is fine to evict out of order + * timestamp updates. 
+ */ + if (!F_ISSET(r, WT_REC_CHECKPOINT_RUNNING)) + return (0); + + /* + * The selected time window may contain information that isn't visible given the selected + * update, as such we have to check it separately. This is true when there is a tombstone ahead + * of the selected update. + */ + if (select_tw->stop_ts < select_tw->start_ts) { + WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_2); + return (EBUSY); + } + + /* + * Rollback to stable may restore older updates from the data store or history store. In this + * case, the restored update has older update than the onpage value, which is expected. + * Reconciliation may restore the onpage value to the update chain. In this case, no need to + * check further as the value is the same as the onpage value which means we processed this + * update chain in a previous round of reconciliation. If we have a prepared update restored + * from the onpage value, no need to check as well because the update chain should only contain + * prepared updates from the same transaction. + */ + if (F_ISSET(select_upd, + WT_UPDATE_RESTORED_FROM_DS | WT_UPDATE_RESTORED_FROM_HS | + WT_UPDATE_PREPARE_RESTORED_FROM_DS)) + return (0); + + /* Loop forward from update after the selected on-page update. */ + for (prev_upd = select_upd, upd = select_upd->next; upd != NULL; upd = upd->next) { + if (upd->txnid == WT_TXN_ABORTED) + continue; + + /* If we have a prepared update, durable timestamp cannot be out of order. */ + WT_ASSERT(session, + prev_upd->prepare_state == WT_PREPARE_INPROGRESS || + prev_upd->start_ts == prev_upd->durable_ts || prev_upd->durable_ts >= upd->durable_ts); + + /* Validate that the updates older than us have older timestamps. */ + if (prev_upd->start_ts < upd->start_ts) { + WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_4); + return (EBUSY); + } + + /* + * Rollback to stable may restore older updates from the data store or history store. 
In + * this case, the restored update has older update than the onpage value, which is expected. + * Reconciliation may restore the onpage value to the update chain. In this case, no need to + * check further as the value is the same as the onpage value. If we have a committed + * prepared update restored from the onpage value, no need to check further as well because + * the update chain after it should only contain committed prepared updates from the same + * transaction. + */ + if (F_ISSET(upd, + WT_UPDATE_RESTORED_FROM_DS | WT_UPDATE_RESTORED_FROM_HS | + WT_UPDATE_PREPARE_RESTORED_FROM_DS)) + return (0); + + prev_upd = upd; + } + + /* + * Check that the on-page time window isn't out-of-order. Don't check against ondisk prepared + * update. It is either committed or rolled back if we are here. If we haven't seen an update + * with the flag WT_UPDATE_RESTORED_FROM_DS we check against the ondisk value. + * + * In the case of checkpoint reconciliation the ondisk value could be an update in the middle of + * the update chain but checkpoint won't replace the page image as such it will be the previous + * reconciliations ondisk value that we will be comparing against. + */ + if (vpack != NULL && !vpack->tw.prepare) { + /* If we have a prepared update, durable timestamp cannot be out of order. 
*/ + WT_ASSERT(session, + prev_upd->prepare_state == WT_PREPARE_INPROGRESS || + prev_upd->start_ts == prev_upd->durable_ts || + prev_upd->durable_ts >= vpack->tw.durable_start_ts); + WT_ASSERT(session, + prev_upd->prepare_state == WT_PREPARE_INPROGRESS || + prev_upd->start_ts == prev_upd->durable_ts || !WT_TIME_WINDOW_HAS_STOP(&vpack->tw) || + prev_upd->durable_ts >= vpack->tw.durable_stop_ts); + if (prev_upd->start_ts < vpack->tw.start_ts || + (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prev_upd->start_ts < vpack->tw.stop_ts)) { + WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_1); + return (EBUSY); + } } + + return (0); } /* @@ -260,7 +388,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v WT_DECL_RET; WT_PAGE *page; WT_TIME_WINDOW *select_tw; - WT_UPDATE *first_txn_upd, *first_upd, *upd, *last_upd, *tombstone; + WT_UPDATE *first_txn_upd, *first_upd, *onpage_upd, *upd, *last_upd, *tombstone; wt_timestamp_t max_ts; size_t upd_memsize; uint64_t max_txn, session_txnid, txnid; @@ -276,7 +404,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v WT_TIME_WINDOW_INIT(select_tw); page = r->page; - first_txn_upd = upd = last_upd = tombstone = NULL; + first_txn_upd = onpage_upd = upd = last_upd = tombstone = NULL; upd_memsize = 0; max_ts = WT_TS_NONE; max_txn = WT_TXN_NONE; @@ -539,8 +667,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v } } - __timestamp_out_of_order_fix(session, select_tw); - /* * Track the most recent transaction in the page. We store this in the tree at the end of * reconciliation in the service of checkpoints, it is used to avoid discarding trees from @@ -557,6 +683,28 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v if (has_newer_updates) r->leave_dirty = true; + onpage_upd = upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE ? 
+ NULL : + upd_select->upd; + + /* Check the update chain for conditions that could prevent its eviction. */ + WT_ERR(__rec_validate_upd_chain(session, r, onpage_upd, select_tw, vpack)); + + /* + * Fixup any out of order timestamps, assert that checkpoint wasn't running when this round of + * reconciliation started. + * + * Returning EBUSY here is okay as the previous call to validate the update chain wouldn't have + * caught the situation where only a tombstone is selected. + */ + if (__timestamp_out_of_order_fix(session, select_tw) && F_ISSET(r, WT_REC_HS) && + F_ISSET(r, WT_REC_CHECKPOINT_RUNNING)) { + /* Catch this case in diagnostic builds. */ + WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_3); + WT_ASSERT(session, false); + WT_ERR(EBUSY); + } + /* * The update doesn't have any further updates that need to be written to the history store, * skip saving the update as saving the update will cause reconciliation to think there is work @@ -573,10 +721,9 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v supd_restore = F_ISSET(r, WT_REC_EVICT) && (has_newer_updates || F_ISSET(S2C(session), WT_CONN_IN_MEMORY) || page->type == WT_PAGE_COL_FIX); - WT_ERR(__rec_update_save(session, r, ins, ripcip, - upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE ? NULL : - upd_select->upd, - supd_restore, upd_memsize)); + + WT_ERR(__rec_update_save(session, r, ins, ripcip, onpage_upd, supd_restore, upd_memsize)); + /* * Mark the selected update (and potentially the tombstone preceding it) as being destined * for the data store. Subsequent reconciliations should know that they can select this |