diff options
11 files changed, 278 insertions, 101 deletions
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index c1406f22f95..d9f476d14bc 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -652,6 +652,8 @@ connection_stats = [ TxnStat('txn_rollback', 'transactions rolled back'), TxnStat('txn_rts', 'rollback to stable calls'), TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'), + TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'), + TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'), TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'), TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'), TxnStat('txn_rts_pages_visited', 'rollback to stable pages visited'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index a6a732aab81..eb8474c7ca6 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "5faf7b26eb9d311b8a7575a16c757078772eb02d" + "commit": "3998a1f701bfc67afeceeef68624fbeb58daa468" } diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index eb90627015d..a158e26e0fc 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -712,9 +712,11 @@ struct __wt_connection_stats { int64_t txn_read_queue_inserts; int64_t txn_read_queue_len; int64_t txn_rts; + int64_t txn_rts_hs_stop_older_than_newer_start; int64_t txn_rts_keys_removed; int64_t txn_rts_keys_restored; int64_t txn_rts_pages_visited; + int64_t txn_rts_hs_restore_tombstones; int64_t txn_rts_skip_interal_pages_walk; int64_t txn_rts_sweep_hs_keys; int64_t txn_rts_upd_aborted; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 26ed8a8cd9c..24325bba4bc 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -5904,114 +5904,121 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1415 /*! transaction: rollback to stable calls */ #define WT_STAT_CONN_TXN_RTS 1416 +/*! + * transaction: rollback to stable hs records with stop timestamps older + * than newer records + */ +#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1417 /*! transaction: rollback to stable keys removed */ -#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1417 +#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1418 /*! transaction: rollback to stable keys restored */ -#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1418 +#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1419 /*! transaction: rollback to stable pages visited */ -#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1419 +#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1420 +/*! transaction: rollback to stable restored tombstones from history store */ +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1421 /*! transaction: rollback to stable skipping internal pages tree walk */ -#define WT_STAT_CONN_TXN_RTS_SKIP_INTERAL_PAGES_WALK 1420 +#define WT_STAT_CONN_TXN_RTS_SKIP_INTERAL_PAGES_WALK 1422 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1421 +#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1423 /*! transaction: rollback to stable updates aborted */ -#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1422 +#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1424 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1423 +#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1425 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1424 +#define WT_STAT_CONN_TXN_SET_TS 1426 /*! transaction: set timestamp durable calls */ -#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1425 +#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1427 /*! transaction: set timestamp durable updates */ -#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1426 +#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1428 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1427 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1429 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1428 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1430 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1429 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1431 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1430 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1432 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1431 +#define WT_STAT_CONN_TXN_BEGIN 1433 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1432 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1434 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1433 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1435 /*! * transaction: transaction checkpoint history store file duration * (usecs) */ -#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1434 +#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1436 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1435 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1437 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1436 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1438 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1437 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1439 /*! transaction: transaction checkpoint prepare currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1438 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1440 /*! transaction: transaction checkpoint prepare max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1439 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1441 /*! transaction: transaction checkpoint prepare min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1440 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1442 /*! transaction: transaction checkpoint prepare most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1441 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1443 /*! transaction: transaction checkpoint prepare total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1442 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1444 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1443 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1445 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1444 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1446 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1445 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1447 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1446 +#define WT_STAT_CONN_TXN_CHECKPOINT 1448 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1447 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1449 /*! transaction: transaction failures due to history store */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1448 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1450 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1449 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1451 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1450 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1452 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1451 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1453 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1452 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1454 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1453 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1455 /*! transaction: transaction range of timestamps pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1454 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1456 /*! * transaction: transaction range of timestamps pinned by the oldest * active read timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1455 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1457 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1456 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1458 /*! transaction: transaction read timestamp of the oldest active reader */ -#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1457 +#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1459 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1458 +#define WT_STAT_CONN_TXN_SYNC 1460 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1459 +#define WT_STAT_CONN_TXN_COMMIT 1461 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1460 +#define WT_STAT_CONN_TXN_ROLLBACK 1462 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1461 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1463 /*! * @} diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index f94d20c7875..92f0e13f268 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -1051,8 +1051,11 @@ static const char *const __stats_connection_desc[] = { "transaction: read timestamp queue insert to empty", "transaction: read timestamp queue inserts to head", "transaction: read timestamp queue inserts total", "transaction: read timestamp queue length", - "transaction: rollback to stable calls", "transaction: rollback to stable keys removed", - "transaction: rollback to stable keys restored", "transaction: rollback to stable pages visited", + "transaction: rollback to stable calls", + "transaction: rollback to stable hs records with stop timestamps older than newer records", + "transaction: rollback to stable keys removed", "transaction: rollback to stable keys restored", + "transaction: rollback to stable pages visited", + "transaction: rollback to stable restored tombstones from history store", "transaction: rollback to stable skipping internal pages tree walk", "transaction: rollback to stable sweeping history store keys", "transaction: rollback to stable updates aborted", @@ -1546,9 +1549,11 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->txn_read_queue_inserts = 0; stats->txn_read_queue_len = 0; stats->txn_rts = 0; + stats->txn_rts_hs_stop_older_than_newer_start = 0; stats->txn_rts_keys_removed = 0; stats->txn_rts_keys_restored = 0; stats->txn_rts_pages_visited = 0; + stats->txn_rts_hs_restore_tombstones = 0; stats->txn_rts_skip_interal_pages_walk = 0; stats->txn_rts_sweep_hs_keys = 0; stats->txn_rts_upd_aborted = 0; @@ -2051,9 +2056,12 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS * to->txn_read_queue_inserts += WT_STAT_READ(from, txn_read_queue_inserts); to->txn_read_queue_len += WT_STAT_READ(from, txn_read_queue_len); to->txn_rts += WT_STAT_READ(from, txn_rts); + to->txn_rts_hs_stop_older_than_newer_start += + WT_STAT_READ(from, txn_rts_hs_stop_older_than_newer_start); to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed); to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored); to->txn_rts_pages_visited += WT_STAT_READ(from, txn_rts_pages_visited); + to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones); to->txn_rts_skip_interal_pages_walk += WT_STAT_READ(from, txn_rts_skip_interal_pages_walk); to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys); to->txn_rts_upd_aborted += WT_STAT_READ(from, txn_rts_upd_aborted); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 13c3725659d..7b89d4f21d3 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -152,27 +152,27 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW WT_DECL_ITEM(key); WT_DECL_RET; WT_ITEM full_value; - WT_UPDATE *hs_upd, *upd; - wt_timestamp_t durable_ts, hs_start_ts, hs_stop_ts; -#ifdef HAVE_DIAGNOSTIC - wt_timestamp_t newer_hs_ts; -#endif + WT_UPDATE *hs_upd, *tombstone, *upd; + wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts; uint64_t hs_counter, type_full; uint32_t hs_btree_id, session_flags; uint8_t type; int cmp; char ts_string[4][WT_TS_INT_STRING_SIZE]; bool is_owner, valid_update_found; - - hs_cursor = NULL; - hs_upd = upd = NULL; - durable_ts = hs_start_ts = WT_TS_NONE; #ifdef HAVE_DIAGNOSTIC - newer_hs_ts = WT_TS_NONE; + bool first_record; #endif + + hs_cursor = NULL; + hs_upd = tombstone = upd = NULL; + hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE; hs_btree_id = S2BT(session)->id; session_flags = 0; is_owner = valid_update_found = false; +#ifdef HAVE_DIAGNOSTIC + first_record = true; +#endif /* Allocate buffers for the data store and history store key. */ WT_RET(__wt_scr_alloc(session, 0, &key)); @@ -183,12 +183,11 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW /* Get the full update value from the data store. */ WT_CLEAR(full_value); - if (!__wt_row_leaf_value(page, rip, &full_value)) { - unpack = &_unpack; - __wt_row_leaf_value_cell(session, page, rip, NULL, unpack); - WT_ERR(__wt_page_cell_data_ref(session, page, unpack, &full_value)); - } + unpack = &_unpack; + __wt_row_leaf_value_cell(session, page, rip, NULL, unpack); + WT_ERR(__wt_page_cell_data_ref(session, page, unpack, &full_value)); WT_ERR(__wt_buf_set(session, &full_value, full_value.data, full_value.size)); + newer_hs_durable_ts = unpack->tw.durable_start_ts; /* Open a history store table cursor. */ WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner)); @@ -228,7 +227,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW cbt->compare = 0; /* Get current value and convert to full update if it is a modify. */ - WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_ts, &durable_ts, &type_full, hs_value)); + WT_ERR(hs_cursor->get_value( + hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value)); type = (uint8_t)type_full; if (type == WT_UPDATE_MODIFY) WT_ERR(__wt_modify_apply_item( @@ -240,10 +240,15 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW /* * Verify the history store timestamps are in order. The start timestamp may be equal to the - * stop timestamp if the original update's commit timestamp is out of order. + * stop timestamp if the original update's commit timestamp is out of order. We may see + * records newer than or equal to the onpage value if eviction runs concurrently with + * checkpoint. In that case, don't verify the first record. */ - WT_ASSERT(session, - (newer_hs_ts == WT_TS_NONE || hs_stop_ts <= newer_hs_ts || hs_start_ts == hs_stop_ts)); + WT_ASSERT(session, hs_stop_durable_ts <= newer_hs_durable_ts || + hs_start_ts == hs_stop_durable_ts || first_record); + + if (hs_stop_durable_ts < newer_hs_durable_ts) + WT_STAT_CONN_INCR(session, txn_rts_hs_stop_older_than_newer_start); /* * Stop processing when we find the newer version value of this key is stable according to @@ -251,22 +256,22 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * update chain. Also it confirms that history store doesn't contains any newer version than * the current version for the key. */ - if (!replace && hs_stop_ts <= rollback_timestamp) { + if (!replace && hs_stop_durable_ts <= rollback_timestamp) { __wt_verbose(session, WT_VERB_RTS, "history store update valid with stop timestamp: %s and stable timestamp: %s", - __wt_timestamp_to_string(hs_stop_ts, ts_string[0]), + __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]), __wt_timestamp_to_string(rollback_timestamp, ts_string[1])); break; } /* Stop processing when we find a stable update according to the given timestamp. */ - if (durable_ts <= rollback_timestamp) { + if (hs_durable_ts <= rollback_timestamp) { __wt_verbose(session, WT_VERB_RTS, "history store update valid with start timestamp: %s, durable timestamp: %s, " "stop timestamp: %s and stable timestamp: %s", __wt_timestamp_to_string(hs_start_ts, ts_string[0]), - __wt_timestamp_to_string(durable_ts, ts_string[1]), - __wt_timestamp_to_string(hs_stop_ts, ts_string[2]), + __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), + __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), __wt_timestamp_to_string(rollback_timestamp, ts_string[3])); valid_update_found = true; break; @@ -276,21 +281,23 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW "history store update aborted with start timestamp: %s, durable timestamp: %s, stop " "timestamp: %s and stable timestamp: %s", __wt_timestamp_to_string(hs_start_ts, ts_string[0]), - __wt_timestamp_to_string(durable_ts, ts_string[1]), - __wt_timestamp_to_string(hs_stop_ts, ts_string[2]), + __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), + __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), __wt_timestamp_to_string(rollback_timestamp, ts_string[3])); -#ifdef HAVE_DIAGNOSTIC /* - * Durable timestamp of the current record is used as stop timestamp of previous record. - * Save it to verify against previous record. + * Start time point of the current record may be used as stop time point of the previous + * record. Save it to verify against the previous record and check if we need to append the + * stop time point as a tombstone when we rollback the history store record. */ - newer_hs_ts = durable_ts; + newer_hs_durable_ts = hs_durable_ts; +#ifdef HAVE_DIAGNOSTIC + first_record = false; #endif + WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); WT_ERR(__wt_hs_modify(cbt, hs_upd)); WT_STAT_CONN_INCR(session, txn_rts_hs_removed); - hs_upd = NULL; } if (replace) { @@ -301,9 +308,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW if (valid_update_found) { WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL)); - upd->txnid = WT_TXN_NONE; - upd->durable_ts = durable_ts; - upd->start_ts = hs_start_ts; + upd->txnid = cbt->upd_value->tw.start_txn; + upd->durable_ts = cbt->upd_value->tw.durable_start_ts; + upd->start_ts = cbt->upd_value->tw.start_ts; __wt_verbose(session, WT_VERB_RTS, "update restored from history store (txnid: %" PRIu64 ", start_ts: %s, durable_ts: %s", upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]), @@ -314,6 +321,28 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * the rollback to stable operation. */ F_SET(upd, WT_UPDATE_RESTORED_FROM_HS); + + /* + * We have a tombstone on the original update chain and it is behind the stable + * timestamp, we need to restore that as well. + */ + if (hs_stop_durable_ts <= rollback_timestamp && + hs_stop_durable_ts < newer_hs_durable_ts) { + WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL)); + tombstone->txnid = cbt->upd_value->tw.stop_txn; + tombstone->durable_ts = cbt->upd_value->tw.durable_stop_ts; + tombstone->start_ts = cbt->upd_value->tw.stop_ts; + + /* + * Set the flag to indicate that this update has been restored from history store + * for the rollback to stable operation. + */ + F_SET(tombstone, WT_UPDATE_RESTORED_FROM_HS); + + tombstone->next = upd; + upd = tombstone; + WT_STAT_CONN_INCR(session, txn_rts_hs_restore_tombstones); + } } else { WT_ERR(__wt_upd_alloc_tombstone(session, &upd, NULL)); WT_STAT_CONN_INCR(session, txn_rts_keys_removed); @@ -321,7 +350,6 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW } WT_ERR(__rollback_row_add_update(session, page, rip, upd)); - upd = NULL; } /* Finally remove that update from history store. */ @@ -329,18 +357,19 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); WT_ERR(__wt_hs_modify(cbt, hs_upd)); WT_STAT_CONN_INCR(session, txn_rts_hs_removed); - hs_upd = NULL; } + if (0) { err: + WT_ASSERT(session, tombstone == NULL || upd == tombstone); + __wt_free_update_list(session, &upd); + __wt_free_update_list(session, &hs_upd); + } __wt_scr_free(session, &key); __wt_scr_free(session, &hs_key); __wt_scr_free(session, &hs_value); __wt_buf_free(session, &full_value); - __wt_free(session, hs_upd); - __wt_free(session, upd); WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner)); - return (ret); } diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c index 89c141290a9..b565282dad1 100644 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c @@ -82,15 +82,15 @@ static bool compat, inmem, use_ts; static volatile uint64_t global_ts = 1; #define ENV_CONFIG_COMPAT ",compatibility=(release=\"2.9\")" -#define ENV_CONFIG_DEF \ - "cache_size=20M,create,log=(archive=true,file_max=10M,enabled)," \ - "debug_mode=(table_logging=true,checkpoint_retention=5)," \ - "statistics=(fast),statistics_log=(wait=1,json=true),session_max=%d" -#define ENV_CONFIG_TXNSYNC \ - "cache_size=20M,create,log=(archive=true,file_max=10M,enabled)," \ - "debug_mode=(table_logging=true,checkpoint_retention=5)," \ - "statistics=(fast),statistics_log=(wait=1,json=true)," \ - "transaction_sync=(enabled,method=none),session_max=%d" +#define ENV_CONFIG_DEF \ + "cache_size=20M,create," \ + "debug_mode=(table_logging=true,checkpoint_retention=5)," \ + "eviction_dirty_trigger=100," \ + "log=(archive=true,file_max=10M,enabled),session_max=%d," \ + "statistics=(fast),statistics_log=(wait=1,json=true)," +#define ENV_CONFIG_TXNSYNC \ + ENV_CONFIG_DEF \ + "transaction_sync=(enabled,method=none)" #define ENV_CONFIG_REC "log=(archive=false,recover=on)" typedef struct { diff --git a/src/third_party/wiredtiger/test/suite/test_hs05.py b/src/third_party/wiredtiger/test/suite/test_hs05.py index 17c87109efd..f2d93a40547 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs05.py +++ b/src/third_party/wiredtiger/test/suite/test_hs05.py @@ -38,8 +38,10 @@ def timestamp_str(t): # Verify hs_score reflects cache pressure due to history # even if we're not yet actively pushing into the history store file. class test_hs05(wttest.WiredTigerTestCase): - # Force a small cache. - conn_config = 'cache_size=50MB,statistics=(fast)' + # Force a small cache, but disable eviction of dirty pages until the cache is full. + conn_config = 'cache_size=50MB,statistics=(fast),' + conn_config += 'eviction_dirty_target=100,eviction_dirty_trigger=100,' + conn_config += 'eviction_updates_target=100,eviction_updates_trigger=100' session_config = 'isolation=snapshot' stable = 1 diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable13.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable13.py new file mode 100644 index 00000000000..b5c22889f6a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable13.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, shutil, time +from helper import copy_wiredtiger_home +from test_rollback_to_stable01 import test_rollback_to_stable_base +from wiredtiger import stat +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +# test_rollback_to_stable13.py +# Test the rollback to stable should roll back the tombstone in the history store. +class test_rollback_to_stable13(test_rollback_to_stable_base): + session_config = 'isolation=snapshot' + + prepare_values = [ + ('no_prepare', dict(prepare=False)), + ('prepare', dict(prepare=True)) + ] + + scenarios = make_scenarios(prepare_values) + + def conn_config(self): + config = 'cache_size=500MB,statistics=(all),log=(enabled=true)' + return config + + def simulate_crash_restart(self, olddir, newdir): + ''' Simulate a crash from olddir and restart in newdir. ''' + # with the connection still open, copy files to new directory + shutil.rmtree(newdir, ignore_errors=True) + os.mkdir(newdir) + for fname in os.listdir(olddir): + fullname = os.path.join(olddir, fname) + # Skip lock file on Windows since it is locked + if os.path.isfile(fullname) and \ + "WiredTiger.lock" not in fullname and \ + "Tmplog" not in fullname and \ + "Preplog" not in fullname: + shutil.copy(fullname, newdir) + # + # close the original connection and open to new directory + # NOTE: This really cannot test the difference between the + # write-no-sync (off) version of log_flush and the sync + # version since we're not crashing the system itself. + # + self.close_conn() + self.conn = self.setUpConnectionOpen(newdir) + self.session = self.setUpSessionOpen(self.conn) + + def test_rollback_to_stable(self): + nrows = 1000 + + # Create a table without logging. + uri = "table:rollback_to_stable13" + ds = SimpleDataSet( + self, uri, 0, key_format="i", value_format="S", config='split_pct=50,log=(enabled=false)') + ds.populate() + + # Pin oldest and stable to timestamp 10. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) + + ',stable_timestamp=' + timestamp_str(10)) + + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + + # Perform several updates. + self.large_updates(uri, value_a, ds, nrows, 20) + + # Perform several removes. + self.large_removes(uri, ds, nrows, 30) + + # Perform several updates. + self.large_updates(uri, value_b, ds, nrows, 60) + + # Verify data is visible and correct. + self.check(value_a, uri, nrows, 20) + self.check(None, uri, 0, 30) + self.check(value_b, uri, nrows, 60) + + # Pin stable to timestamp 50 if prepare otherwise 40. + if self.prepare: + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(50)) + else: + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(40)) + + self.session.checkpoint() + + # Simulate a server crash and restart. + self.simulate_crash_restart(".", "RESTART") + + # Check that the correct data is seen at and after the stable timestamp. + self.check(None, uri, 0, 50) + + # Check that we restore the correct value from the history store. + self.check(value_a, uri, nrows, 20) + + stat_cursor = self.session.open_cursor('statistics:', None, None) + restored_tombstones = stat_cursor[stat.conn.txn_rts_hs_restore_tombstones][2] + self.assertEqual(restored_tombstones, nrows) diff --git a/src/third_party/wiredtiger/test/suite/test_stat05.py b/src/third_party/wiredtiger/test/suite/test_stat05.py index dd1b94b543a..235f6236a1d 100644 --- a/src/third_party/wiredtiger/test/suite/test_stat05.py +++ b/src/third_party/wiredtiger/test/suite/test_stat05.py @@ -45,12 +45,12 @@ class test_stat_cursor_config(wttest.WiredTigerTestCase): conn_config = 'in_memory,statistics=(fast)')), ('table-lsm', dict(uri='table:' + pfx, dataset=SimpleDataSet, cfg='lsm=(chunk_size=1MB,merge_min=2)', - conn_config = 'statistics=(fast),eviction_dirty_target=99,eviction_dirty_trigger=99')), + conn_config = 'statistics=(fast)')), ('complex', dict(uri='table:' + pfx, dataset=ComplexDataSet, cfg='')), ('complex-lsm', dict(uri='table:' + pfx, dataset=ComplexLSMDataSet, cfg='lsm=(chunk_size=1MB,merge_min=2)', - conn_config = 'statistics=(fast),eviction_dirty_target=99,eviction_dirty_trigger=99')), + conn_config = 'statistics=(fast)')), ] scenarios = make_scenarios(uri) diff --git a/src/third_party/wiredtiger/test/suite/test_txn13.py b/src/third_party/wiredtiger/test/suite/test_txn13.py index f2b1849333a..541017804c9 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn13.py +++ b/src/third_party/wiredtiger/test/suite/test_txn13.py @@ -52,7 +52,7 @@ class test_txn13(wttest.WiredTigerTestCase, suite_subprocess): # Turn on logging for this test. def conn_config(self): return 'log=(archive=false,enabled,file_max=%s)' % self.logmax + \ - ',cache_size=20G' + ',cache_size=20G,eviction_dirty_trigger=100' @wttest.longtest('txn tests with huge values') def test_large_values(self): |