diff options
author | Luke Chen <luke.chen@mongodb.com> | 2021-03-01 16:33:28 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-03-01 06:03:15 +0000 |
commit | 6dcf69cdd37b7fd8cc86b6d5412ceab67bceddfe (patch) | |
tree | 7c1593caea8a43d0e43c994b9a7f64812ece5be6 | |
parent | f4054ff40c1c8309eefdbc87e84e8e8403e51281 (diff) | |
download | mongo-6dcf69cdd37b7fd8cc86b6d5412ceab67bceddfe.tar.gz |
Import wiredtiger: 9f6b212f1fe4a069ed18bf49ff237b31b2098c4c from branch mongodb-5.0
ref: 135a36dc0a..9f6b212f1f
for: 4.9.0
WT-6673 RTS fix inconsistent checkpoint by removing updates outside of the checkpoint snapshot
17 files changed, 444 insertions, 94 deletions
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 97c20322319..e6eab22645d 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -875,6 +875,7 @@ conn_dsrc_stats = [ TxnStat('txn_rts_hs_restore_updates', 'rollback to stable restored updates from history store'), TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'), TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'), + TxnStat('txn_rts_inconsistent_ckpt', 'rollback to stable inconsistent checkpoint'), TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'), TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'), TxnStat('txn_rts_sweep_hs_keys', 'rollback to stable sweeping history store keys'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index a721db1e45c..2335d486a79 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.0", - "commit": "135a36dc0ad05fea990dc2c02696e68ce0fb287d" + "commit": "9f6b212f1fe4a069ed18bf49ff237b31b2098c4c" } diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 2d26a1bf573..1d0f9dd115d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -524,9 +524,22 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) /* If this is the first time opening the tree this run. */ if (F_ISSET(session, WT_SESSION_IMPORT) || ckpt->run_write_gen < conn->base_write_gen) - btree->base_write_gen = btree->run_write_gen = btree->write_gen; + btree->run_write_gen = btree->write_gen; else - btree->base_write_gen = btree->run_write_gen = ckpt->run_write_gen; + btree->run_write_gen = ckpt->run_write_gen; + + /* + * In recovery use the last checkpointed run write generation number as base write generation + * number to reset the transaction ids of the pages that were modified before the restart. The + * transaction ids are retained only on the pages that are written after the restart. + * + * Rollback to stable does not operate on logged tables and metadata, so it is skipped. + */ + if (!F_ISSET(conn, WT_CONN_RECOVERING) || WT_IS_METADATA(btree->dhandle) || + __wt_btree_immediately_durable(session)) + btree->base_write_gen = btree->run_write_gen; + else + btree->base_write_gen = ckpt->run_write_gen; /* * We've just overwritten the runtime write generation based off the fact that know that we're diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 13abe891dd8..f539d44daf7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -259,10 +259,6 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) */ memset(&addr_unpack, 0, sizeof(addr_unpack)); WT_TIME_AGGREGATE_COPY(&addr_unpack.ta, &ckpt->ta); - if (ckpt->write_gen <= btree->base_write_gen) { - addr_unpack.ta.newest_txn = WT_TXN_NONE; - addr_unpack.ta.newest_stop_txn = WT_TXN_MAX; - } if (ckpt->ta.prepare) addr_unpack.ta.prepare = 1; addr_unpack.raw = WT_CELL_ADDR_INT; diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 11379c1f419..1ba4cbe30ed 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -920,6 +920,37 @@ restart: } /* + * __wt_dhandle_update_write_gens -- + * Update the open dhandles write generation, run write generation and base write generation + * number. + */ +void +__wt_dhandle_update_write_gens(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + + conn = S2C(session); + + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + break; + btree = (WT_BTREE *)dhandle->handle; + + WT_ASSERT(session, btree != NULL); + + /* + * Initialize the btree write generation numbers after rollback to stable so that the + * transaction ids of the pages will be reset when loaded from disk to memory. + */ + btree->write_gen = btree->base_write_gen = btree->run_write_gen = + WT_MAX(btree->write_gen, conn->base_write_gen); + } +} + +/* * __wt_verbose_dump_handles -- * Dump information about all data handles. */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index bf1505b2229..dfefe57ba26 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1685,6 +1685,7 @@ extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session); extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst); extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...); extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...); +extern void __wt_dhandle_update_write_gens(WT_SESSION_IMPL *session); extern void __wt_encrypt_size( WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep); extern void __wt_err_func( diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 99bcb538f59..bb3ae8c176a 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -785,6 +785,7 @@ struct __wt_connection_stats { int64_t tiered_retention; int64_t txn_read_race_prepare_update; int64_t txn_rts_hs_stop_older_than_newer_start; + int64_t txn_rts_inconsistent_ckpt; int64_t txn_rts_keys_removed; int64_t txn_rts_keys_restored; int64_t txn_rts_hs_restore_tombstones; @@ -1002,6 +1003,7 @@ struct __wt_dsrc_stats { int64_t tiered_retention; int64_t txn_read_race_prepare_update; int64_t txn_rts_hs_stop_older_than_newer_start; + int64_t txn_rts_inconsistent_ckpt; int64_t txn_rts_keys_removed; int64_t txn_rts_keys_restored; int64_t txn_rts_hs_restore_tombstones; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index d041c231680..5ccd7bab902 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -6110,22 +6110,24 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * than newer records */ #define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1471 +/*! transaction: rollback to stable inconsistent checkpoint */ +#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1472 /*! transaction: rollback to stable keys removed */ -#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1472 +#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1473 /*! transaction: rollback to stable keys restored */ -#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1473 +#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1474 /*! transaction: rollback to stable restored tombstones from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1474 +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1475 /*! transaction: rollback to stable restored updates from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1475 +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1476 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1476 +#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1477 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1477 +#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1478 /*! transaction: transaction checkpoints due to obsolete pages */ -#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1478 +#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1479 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1479 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1480 /*! * @} @@ -6727,22 +6729,24 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * than newer records */ #define WT_STAT_DSRC_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 2201 +/*! transaction: rollback to stable inconsistent checkpoint */ +#define WT_STAT_DSRC_TXN_RTS_INCONSISTENT_CKPT 2202 /*! transaction: rollback to stable keys removed */ -#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2202 +#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2203 /*! transaction: rollback to stable keys restored */ -#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2203 +#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2204 /*! transaction: rollback to stable restored tombstones from history store */ -#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2204 +#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2205 /*! transaction: rollback to stable restored updates from history store */ -#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2205 +#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2206 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2206 +#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2207 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2207 +#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2208 /*! transaction: transaction checkpoints due to obsolete pages */ -#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2208 +#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2209 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2209 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2210 /*! * @} diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index 150880625a6..9b54431a8cf 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -1001,6 +1001,7 @@ __wt_meta_sysinfo_set(WT_SESSION_IMPL *session) wt_timestamp_t oldest_timestamp; uint32_t snap_count; char hex_timestamp[WT_TS_HEX_STRING_SIZE]; + char ts_string[2][WT_TS_INT_STRING_SIZE]; txn_global = &S2C(session)->txn_global; @@ -1044,19 +1045,26 @@ __wt_meta_sysinfo_set(WT_SESSION_IMPL *session) } /* Record snapshot information in metadata for checkpoint. */ - if (txn->snapshot_count > 0) { - WT_ERR(__wt_buf_fmt(session, buf, - WT_SYSTEM_CKPT_SNAPSHOT_MIN "=%" PRIu64 "," WT_SYSTEM_CKPT_SNAPSHOT_MAX "=%" PRIu64 - "," WT_SYSTEM_CKPT_SNAPSHOT_COUNT "=%" PRIu32 - "," WT_SYSTEM_CKPT_SNAPSHOT "=[", - txn->snap_min, txn->snap_max, txn->snapshot_count)); + WT_ERR(__wt_buf_fmt(session, buf, + WT_SYSTEM_CKPT_SNAPSHOT_MIN "=%" PRIu64 "," WT_SYSTEM_CKPT_SNAPSHOT_MAX "=%" PRIu64 + "," WT_SYSTEM_CKPT_SNAPSHOT_COUNT "=%" PRIu32, + txn->snap_min, txn->snap_max, txn->snapshot_count)); + if (txn->snapshot_count > 0) { + WT_ERR(__wt_buf_catfmt(session, buf, "," WT_SYSTEM_CKPT_SNAPSHOT "=[")); for (snap_count = 0; snap_count < txn->snapshot_count - 1; ++snap_count) WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], ",")); WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], "]")); - WT_ERR(__wt_metadata_update(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, buf->data)); } + WT_ERR(__wt_metadata_update(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, buf->data)); + + __wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS, + "saving checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64 + " snapshot count: %" PRIu32 ", oldest timestamp: %s , meta checkpoint timestamp: %s", + txn->snap_min, txn->snap_max, txn->snapshot_count, + __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string[0]), + __wt_timestamp_to_string(txn_global->meta_ckpt_timestamp, ts_string[1])); /* Record the base write gen in metadata as part of checkpoint */ WT_ERR(__wt_buf_fmt( diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index ede22518c26..a472273de48 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -748,7 +748,10 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const cha f |= O_CLOEXEC; #endif WT_SYSCALL_RETRY(((pfh->fd = open(name, f, 0444)) == -1 ? -1 : 0), ret); - if (ret != 0) + /* Return error if the file not found during rollback to stable. */ + if (ret != 0 && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE)) + WT_ERR(__wt_errno()); + else if (ret != 0) WT_ERR_MSG(session, ret, "%s: handle-open: open-directory", name); WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name)); goto directory_open; @@ -800,7 +803,10 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const cha /* Create/Open the file. */ WT_SYSCALL_RETRY(((pfh->fd = open(name, f, mode)) == -1 ? -1 : 0), ret); - if (ret != 0) + /* Return error if the file not found during rollback to stable. */ + if (ret != 0 && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE)) + WT_ERR(ENOENT); + else if (ret != 0) WT_ERR_MSG(session, ret, pfh->direct_io ? "%s: handle-open: open: failed with direct I/O configured, some " "filesystem types do not support direct I/O" : diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index c0c776bde2e..47605ab42f8 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -209,6 +209,7 @@ static const char *const __stats_dsrc_desc[] = { "session: tiered storage local retention time (secs)", "transaction: race to read prepared update retry", "transaction: rollback to stable hs records with stop timestamps older than newer records", + "transaction: rollback to stable inconsistent checkpoint", "transaction: rollback to stable keys removed", "transaction: rollback to stable keys restored", "transaction: rollback to stable restored tombstones from history store", @@ -459,6 +460,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) /* not clearing tiered_retention */ stats->txn_read_race_prepare_update = 0; stats->txn_rts_hs_stop_older_than_newer_start = 0; + stats->txn_rts_inconsistent_ckpt = 0; stats->txn_rts_keys_removed = 0; stats->txn_rts_keys_restored = 0; stats->txn_rts_hs_restore_tombstones = 0; @@ -696,6 +698,7 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to) to->tiered_retention += from->tiered_retention; to->txn_read_race_prepare_update += from->txn_read_race_prepare_update; to->txn_rts_hs_stop_older_than_newer_start += from->txn_rts_hs_stop_older_than_newer_start; + to->txn_rts_inconsistent_ckpt += from->txn_rts_inconsistent_ckpt; to->txn_rts_keys_removed += from->txn_rts_keys_removed; to->txn_rts_keys_restored += from->txn_rts_keys_restored; to->txn_rts_hs_restore_tombstones += from->txn_rts_hs_restore_tombstones; @@ -939,6 +942,7 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to) to->txn_read_race_prepare_update += WT_STAT_READ(from, txn_read_race_prepare_update); to->txn_rts_hs_stop_older_than_newer_start += WT_STAT_READ(from, txn_rts_hs_stop_older_than_newer_start); + to->txn_rts_inconsistent_ckpt += WT_STAT_READ(from, txn_rts_inconsistent_ckpt); to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed); to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored); to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones); @@ -1434,6 +1438,7 @@ static const char *const __stats_connection_desc[] = { "session: tiered storage local retention time (secs)", "transaction: race to read prepared update retry", "transaction: rollback to stable hs records with stop timestamps older than newer records", + "transaction: rollback to stable inconsistent checkpoint", "transaction: rollback to stable keys removed", "transaction: rollback to stable keys restored", "transaction: rollback to stable restored tombstones from history store", @@ -1954,6 +1959,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing tiered_retention */ stats->txn_read_race_prepare_update = 0; stats->txn_rts_hs_stop_older_than_newer_start = 0; + stats->txn_rts_inconsistent_ckpt = 0; stats->txn_rts_keys_removed = 0; stats->txn_rts_keys_restored = 0; stats->txn_rts_hs_restore_tombstones = 0; @@ -2485,6 +2491,7 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS * to->txn_read_race_prepare_update += WT_STAT_READ(from, txn_read_race_prepare_update); to->txn_rts_hs_stop_older_than_newer_start += WT_STAT_READ(from, txn_rts_hs_stop_older_than_newer_start); + to->txn_rts_inconsistent_ckpt += WT_STAT_READ(from, txn_rts_inconsistent_ckpt); to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed); to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored); to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 94396fc92f2..e77b31df9f2 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -2178,7 +2178,7 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char **cfg) */ if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP)) { __wt_verbose(session, WT_VERB_RTS, - "Performing shutdown rollback to stable with stable timestamp: %s", + "performing shutdown rollback to stable with stable timestamp: %s", __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string)); WT_TRET(__wt_rollback_to_stable(session, cfg, true)); } diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index e71b686d633..59acf9894b6 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -743,12 +743,14 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[]) char *config; char ts_string[2][WT_TS_INT_STRING_SIZE]; bool do_checkpoint, eviction_started, hs_exists, needs_rec, was_backup; + bool rts_executed; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); config = NULL; do_checkpoint = hs_exists = true; + rts_executed = false; eviction_started = false; was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); @@ -761,7 +763,6 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[]) F_SET(conn, WT_CONN_RECOVERING); WT_ERR(__recovery_set_ckpt_base_write_gen(&r)); - WT_ERR(__recovery_set_checkpoint_snapshot(session)); WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); @@ -928,6 +929,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[]) done: WT_ERR(__recovery_set_checkpoint_timestamp(&r)); WT_ERR(__recovery_set_oldest_timestamp(&r)); + WT_ERR(__recovery_set_checkpoint_snapshot(session)); + /* * Perform rollback to stable only when the following conditions met. * 1. The connection is not read-only. A read-only connection expects that there shouldn't be @@ -941,17 +944,6 @@ done: eviction_started = true; } - /* - * Currently, rollback to stable only needs to make changes to tables that use timestamps. - * That is because eviction does not run in parallel with a checkpoint, so content that is - * written never uses transaction IDs newer than the checkpoint's transaction ID and thus - * never needs to be rolled back. Once eviction is allowed while a checkpoint is active, it - * will be necessary to take the page write generation number into account during rollback - * to stable. For example, a page with write generation 10 and txnid 20 is written in one - * checkpoint, and in the next restart a new page with write generation 30 and txnid 20 is - * written. The rollback to stable operation should only rollback the latest page changes - * solely based on the write generation numbers. - */ WT_ASSERT(session, conn->txn_global.has_stable_timestamp == false && conn->txn_global.stable_timestamp == WT_TS_NONE); @@ -967,19 +959,31 @@ done: conn->txn_global.has_stable_timestamp = true; __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RTS, - "Performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: " + "performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: " "%s", __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string[0]), __wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string[1])); + rts_executed = true; + WT_ERR(__wt_rollback_to_stable(session, NULL, true)); + } - WT_ERR(__wt_rollback_to_stable(session, NULL, false)); - } else if (do_checkpoint) + if (do_checkpoint || rts_executed) /* * Forcibly log a checkpoint so the next open is fast and keep the metadata up to date with * the checkpoint LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); + /* Initialize the connection's base write generation after rollback to stable. */ + WT_ERR(__wt_metadata_init_base_write_gen(session)); + + /* + * Update the open dhandles write generations and base write generation with the connection's + * base write generation because the recovery checkpoint writes the pages to disk with new write + * generation number which contains transaction ids that are needed to reset later. + */ + __wt_dhandle_update_write_gens(session); + /* * If we're downgrading and have newer log files, force an archive, no matter what the archive * setting is. diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 0001d09302b..13d9eba8d58 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -8,9 +8,14 @@ #include "wt_internal.h" +#define WT_CHECK_RECOVERY_FLAG_TS_TXNID(session, txnid, durablets) \ + (durablets == WT_TS_NONE && F_ISSET(S2C(session), WT_CONN_RECOVERING) && \ + (txnid) >= S2C(session)->recovery_ckpt_snap_min) + /* Enable rollback to stable verbose messaging during recovery. */ #define WT_VERB_RECOVERY_RTS(session) \ (F_ISSET(S2C(session), WT_CONN_RECOVERING) ? WT_VERB_RECOVERY | WT_VERB_RTS : WT_VERB_RTS) + /* * __rollback_abort_newer_update -- * Abort updates in an update change with timestamps newer than the rollback timestamp. Also, @@ -149,6 +154,53 @@ err: } /* + * __rollback_check_if_txnid_non_committed -- + * Check if the transaction id is non committed. + */ +static bool +__rollback_check_if_txnid_non_committed(WT_SESSION_IMPL *session, uint64_t txnid) +{ + WT_CONNECTION_IMPL *conn; + bool found; + + conn = S2C(session); + + /* If not recovery then assume all the data as committed. */ + if (!F_ISSET(conn, WT_CONN_RECOVERING)) + return (false); + + /* + * Only full checkpoint writes the metadata with snapshot. If the recovered checkpoint snapshot + * details are zero then return false i.e, updates are committed. + */ + if (conn->recovery_ckpt_snap_min == 0 && conn->recovery_ckpt_snap_max == 0) + return (false); + + /* + * Snapshot data: + * ids < recovery_ckpt_snap_min are committed, + * ids > recovery_ckpt_snap_max are non committed, + * everything else is committed unless it is found in the recovery_ckpt_snapshot array. + */ + if (txnid < conn->recovery_ckpt_snap_min) + return (false); + else if (txnid > conn->recovery_ckpt_snap_max) + return (true); + + /* + * Return false when the recovery snapshot count is 0, which means there is no uncommitted + * transaction ids. + */ + if (conn->recovery_ckpt_snapshot_count == 0) + return (false); + + WT_BINARY_SEARCH( + txnid, conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count, found); + + return (found); +} + +/* * __rollback_row_ondisk_fixup_key -- * Abort updates in the history store and replace the on-disk value with an update that * satisfies the given timestamp. @@ -295,40 +347,51 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW /* * Stop processing when we find the newer version value of this key is stable according to - * the current version stop timestamp when it is not appending the selected update to the - * update chain. Also it confirms that history store doesn't contains any newer version than - * the current version for the key. + * the current version stop timestamp and transaction id when it is not appending the + * selected update to the update chain. Also it confirms that history store doesn't contains + * any newer version than the current version for the key. */ - if (!replace && hs_stop_durable_ts <= rollback_timestamp) { + if (!replace && + (hs_stop_durable_ts != WT_TS_NONE || + !__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.stop_txn)) && + (hs_stop_durable_ts <= rollback_timestamp)) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "history store update valid with stop timestamp: %s, stable timestamp: %s and type: " - "%" PRIu8, + "history store update valid with stop timestamp: %s, stable timestamp: %s, txnid: " + "%" PRIu64 " and type: %" PRIu8, __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), type); + __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), + cbt->upd_value->tw.stop_txn, type); break; } - /* Stop processing when we find a stable update according to the given timestamp. */ - if (hs_durable_ts <= rollback_timestamp) { + /* + * Stop processing when we find a stable update according to the given timestamp and + * transaction id. + */ + if ((hs_durable_ts != WT_TS_NONE || + !__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.start_txn)) && + (hs_durable_ts <= rollback_timestamp)) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "history store update valid with start timestamp: %s, durable timestamp: %s, stop " - "timestamp: %s, stable timestamp: %s and type: %" PRIu8, + "timestamp: %s, stable timestamp: %s, txnid: %" PRIu64 " and type: %" PRIu8, __wt_timestamp_to_string(hs_start_ts, ts_string[0]), __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type); - WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts); + __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), + cbt->upd_value->tw.start_txn, type); valid_update_found = true; break; } __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "history store update aborted with start timestamp: %s, durable timestamp: %s, stop " - "timestamp: %s, stable timestamp: %s and type: %" PRIu8, + "timestamp: %s, stable timestamp: %s, start txnid: %" PRIu64 ", stop txnid: %" PRIu64 + " and type: %" PRIu8, __wt_timestamp_to_string(hs_start_ts, ts_string[0]), __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type); + __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), cbt->upd_value->tw.start_txn, + cbt->upd_value->tw.stop_txn, type); /* * Start time point of the current record may be used as stop time point of the previous @@ -352,14 +415,25 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * list. Otherwise remove the key by adding a tombstone. */ if (valid_update_found) { - WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts); + WT_ASSERT(session, + cbt->upd_value->tw.start_ts < unpack->tw.start_ts || + cbt->upd_value->tw.start_txn < unpack->tw.start_txn); WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL)); - upd->txnid = cbt->upd_value->tw.start_txn; + /* + * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because + * the connections write generation will be initialized after rollback to stable and the + * updates in the cache will be problematic. The transaction id of pages which are in + * disk will be automatically reset as part of unpacking cell when loaded to cache. + */ + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + upd->txnid = WT_TXN_NONE; + else + upd->txnid = cbt->upd_value->tw.start_txn; upd->durable_ts = cbt->upd_value->tw.durable_start_ts; upd->start_ts = cbt->upd_value->tw.start_ts; __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "update restored from history store (txnid: %" PRIu64 + "update restored from history store txnid: %" PRIu64 ", start_ts: %s and durable_ts: %s", upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]), __wt_timestamp_to_string(upd->durable_ts, ts_string[1])); @@ -378,11 +452,21 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW if (hs_stop_durable_ts <= rollback_timestamp && hs_stop_durable_ts < newer_hs_durable_ts) { WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL)); - tombstone->txnid = cbt->upd_value->tw.stop_txn; + /* + * Set the transaction id of updates to WT_TXN_NONE when called from recovery, + * because the connections write generation will be initialized after rollback to + * stable and the updates in the cache will be problematic. The transaction id of + * pages which are in disk will be automatically reset as part of unpacking cell + * when loaded to cache. + */ + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + tombstone->txnid = WT_TXN_NONE; + else + tombstone->txnid = cbt->upd_value->tw.stop_txn; tombstone->durable_ts = cbt->upd_value->tw.durable_stop_ts; tombstone->start_ts = cbt->upd_value->tw.stop_ts; __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "tombstone restored from history store (txnid: %" PRIu64 + "tombstone restored from history store txnid: %" PRIu64 ", start_ts: %s, durable_ts: %s", tombstone->txnid, __wt_timestamp_to_string(tombstone->start_ts, ts_string[0]), __wt_timestamp_to_string(tombstone->durable_ts, ts_string[1])); @@ -468,14 +552,16 @@ __rollback_abort_row_ondisk_kv( WT_STAT_CONN_DATA_INCR(session, txn_rts_sweep_hs_keys); } else return (0); - } else if (vpack->tw.durable_start_ts > rollback_timestamp || + } else if (((vpack->tw.durable_start_ts > rollback_timestamp) || + (vpack->tw.durable_start_ts == WT_TS_NONE && + __rollback_check_if_txnid_non_committed(session, vpack->tw.start_txn))) || (!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "on-disk update aborted with start durable timestamp: %s, commit timestamp: %s, " - "prepared: %s and stable timestamp: %s", + "prepared: %s, stable timestamp: %s and txnid : %" PRIu64, __wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]), __wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]), prepared ? "true" : "false", - __wt_timestamp_to_string(rollback_timestamp, ts_string[2])); + __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), vpack->tw.start_txn); if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) return (__rollback_row_ondisk_fixup_key(session, page, rip, rollback_timestamp, true)); else { @@ -487,7 +573,10 @@ __rollback_abort_row_ondisk_kv( WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed); } } else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && - (vpack->tw.durable_stop_ts > rollback_timestamp || prepared)) { + (((vpack->tw.durable_stop_ts > rollback_timestamp) || + (vpack->tw.durable_stop_ts == WT_TS_NONE && + __rollback_check_if_txnid_non_committed(session, vpack->tw.stop_txn))) || + prepared)) { /* * Clear the remove operation from the key by inserting the original on-disk value as a * standard update. @@ -495,19 +584,30 @@ __rollback_abort_row_ondisk_kv( WT_RET(__wt_page_cell_data_ref(session, page, vpack, &buf)); WT_ERR(__wt_upd_alloc(session, &buf, WT_UPDATE_STANDARD, &upd, NULL)); - upd->txnid = vpack->tw.start_txn; + /* + * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because the + * connections write generation will be initialized after rollback to stable and the updates + * in the cache will be problematic. The transaction id of pages which are in disk will be + * automatically reset as part of unpacking cell when loaded to cache. + */ + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + upd->txnid = WT_TXN_NONE; + else + upd->txnid = vpack->tw.start_txn; upd->durable_ts = vpack->tw.durable_start_ts; upd->start_ts = vpack->tw.start_ts; F_SET(upd, WT_UPDATE_RESTORED_FROM_DS); WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_restored); __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "key restored with commit timestamp: %s, durable timestamp: %s txnid: %" PRIu64 - "and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64 + "key restored with commit timestamp: %s, durable timestamp: %s, stable timestamp: %s, " + "txnid: %" PRIu64 + " and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64 ", prepared: %s", __wt_timestamp_to_string(upd->start_ts, ts_string[0]), - __wt_timestamp_to_string(upd->durable_ts, ts_string[1]), upd->txnid, - __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[2]), - __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[3]), vpack->tw.stop_txn, + __wt_timestamp_to_string(upd->durable_ts, ts_string[1]), + __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), upd->txnid, + __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]), + __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[4]), vpack->tw.stop_txn, prepared ? "true" : "false"); } else /* Stable version according to the timestamp. */ @@ -760,6 +860,7 @@ __rollback_page_needs_abort( WT_MULTI *multi; WT_PAGE_MODIFY *mod; wt_timestamp_t durable_ts; + uint64_t newest_txn; uint32_t i; char ts_string[WT_TS_INT_STRING_SIZE]; const char *tag; @@ -768,12 +869,14 @@ __rollback_page_needs_abort( addr = ref->addr; mod = ref->page == NULL ? NULL : ref->page->modify; durable_ts = WT_TS_NONE; + newest_txn = WT_TXN_NONE; tag = "undefined state"; prepared = result = false; /* * The rollback operation should be performed on this page when any one of the following is - * greater than the given timestamp: + * greater than the given timestamp or during recovery if the newest transaction id on the page + * is greater than or equal to recovered checkpoint snapshot min: * 1. The reconciled replace page max durable timestamp. * 2. The reconciled multi page max durable timestamp. * 3. The on page address max durable timestamp. @@ -800,17 +903,22 @@ __rollback_page_needs_abort( __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack); durable_ts = __rollback_get_ref_max_durable_timestamp(session, &vpack.ta); prepared = vpack.ta.prepare; - result = (durable_ts > rollback_timestamp) || prepared; + newest_txn = vpack.ta.newest_txn; + result = (durable_ts > rollback_timestamp) || prepared || + WT_CHECK_RECOVERY_FLAG_TS_TXNID(session, newest_txn, durable_ts); } else if (addr != NULL) { tag = "address"; durable_ts = __rollback_get_ref_max_durable_timestamp(session, &addr->ta); prepared = addr->ta.prepare; - result = (durable_ts > rollback_timestamp) || prepared; + newest_txn = addr->ta.newest_txn; + result = (durable_ts > rollback_timestamp) || prepared || + WT_CHECK_RECOVERY_FLAG_TS_TXNID(session, newest_txn, durable_ts); } __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "%p: page with %s durable timestamp: %s and prepared updates: %s", (void *)ref, tag, - __wt_timestamp_to_string(durable_ts, ts_string), prepared ? "true" : "false"); + "%p: page with %s durable timestamp: %s, newest txn: %" PRIu64 " and prepared updates: %s", + (void *)ref, tag, __wt_timestamp_to_string(durable_ts, ts_string), newest_txn, + prepared ? "true" : "false"); return (result); } @@ -929,6 +1037,9 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac WT_DECL_RET; WT_REF *child_ref, *ref; + /* Set this flag to return error instead of panic if file is corrupted. */ + F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); + /* Walk the tree, marking commits aborted where appropriate. */ ref = NULL; while ((ret = __wt_tree_walk_custom_skip(session, &ref, __wt_rts_page_skip, &rollback_timestamp, @@ -942,6 +1053,7 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac } else WT_RET(__rollback_abort_newer_updates(session, ref, rollback_timestamp)); + F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); return (ret); } @@ -1176,12 +1288,14 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts, rollback_timestamp; + uint64_t rollback_txnid; size_t addr_size; char ts_string[2][WT_TS_INT_STRING_SIZE]; const char *config, *uri; - bool durable_ts_found, prepared_updates; + bool durable_ts_found, prepared_updates, has_txn_updates_gt_than_ckpt_snap; txn_global = &S2C(session)->txn_global; + rollback_txnid = 0; addr_size = 0; /* @@ -1198,6 +1312,13 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); WT_RET(__wt_metadata_cursor(session, &cursor)); + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), + "recovered checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64 + ", snapshot count: %" PRIu32, + S2C(session)->recovery_ckpt_snap_min, S2C(session)->recovery_ckpt_snap_max, + S2C(session)->recovery_ckpt_snapshot_count); + while ((ret = cursor->next(cursor)) == 0) { WT_ERR(cursor->get_key(cursor, &uri)); @@ -1212,7 +1333,7 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) /* Find out the max durable timestamp of the object from checkpoint. */ newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE; - durable_ts_found = prepared_updates = false; + durable_ts_found = prepared_updates = has_txn_updates_gt_than_ckpt_snap = false; WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval)); __wt_config_subinit(session, &ckptconf, &cval); for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) { @@ -1235,12 +1356,22 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) prepared_updates = true; } WT_ERR_NOTFOUND_OK(ret, false); + ret = __wt_config_subgets(session, &cval, "newest_txn", &value); + if (value.len != 0) + rollback_txnid = (uint64_t)value.val; + WT_ERR_NOTFOUND_OK(ret, false); ret = __wt_config_subgets(session, &cval, "addr", &value); if (ret == 0) addr_size = value.len; WT_ERR_NOTFOUND_OK(ret, false); } max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts); + has_txn_updates_gt_than_ckpt_snap = + WT_CHECK_RECOVERY_FLAG_TS_TXNID(session, rollback_txnid, max_durable_ts); + + /* Increment the inconsistent checkpoint stats counter. */ + if (has_txn_updates_gt_than_ckpt_snap) + WT_STAT_CONN_DATA_INCR(session, txn_rts_inconsistent_ckpt); /* * The rollback to stable will skip the tables during recovery and shutdown in the following @@ -1253,7 +1384,7 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) (addr_size == 0 || (txn_global->stable_timestamp == WT_TS_NONE && max_durable_ts != WT_TS_NONE))) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "Skip rollback to stable on file %s because %s", uri, + "skip rollback to stable on file %s because %s", uri, addr_size == 0 ? "its checkpoint address length is 0" : "it has timestamped updates and the stable timestamp is 0"); continue; @@ -1271,7 +1402,7 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) if ((ret == ENOENT) || (ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION))) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "Ignore performing rollback to stable on %s because the file %s", uri, + "ignore performing rollback to stable on %s because the file %s", uri, ret == ENOENT ? "does not exist" : "is corrupted."); continue; } @@ -1282,21 +1413,24 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) * 1. The tree is modified. * 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp. * 3. There is no durable timestamp in any checkpoint. + * 4. The checkpoint newest txn is greater than snapshot min txn id */ if (S2BT(session)->modified || max_durable_ts > rollback_timestamp || prepared_updates || - !durable_ts_found) { + !durable_ts_found || has_txn_updates_gt_than_ckpt_snap) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "tree rolled back with durable timestamp: %s, or when tree is modified: %s or " - "prepared updates: %s or when durable time is not found: %s", + "prepared updates: %s or when durable time is not found: %s or txnid: %" PRIu64 + " is greater than recovery checkpoint snap min: %s", __wt_timestamp_to_string(max_durable_ts, ts_string[0]), S2BT(session)->modified ? "true" : "false", prepared_updates ? "true" : "false", - !durable_ts_found ? "true" : "false"); + !durable_ts_found ? "true" : "false", rollback_txnid, + has_txn_updates_gt_than_ckpt_snap ? "true" : "false"); WT_TRET(__rollback_to_stable_btree(session, rollback_timestamp)); } else __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "tree skipped with durable timestamp: %s and stable timestamp: %s", + "tree skipped with durable timestamp: %s and stable timestamp: %s or txnid: %" PRIu64, __wt_timestamp_to_string(max_durable_ts, ts_string[0]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[1])); + __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), rollback_txnid); /* * Truncate history store entries for the non-timestamped table. @@ -1312,6 +1446,14 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) WT_TRET(__rollback_to_stable_btree_hs_truncate(session, S2BT(session)->id)); WT_TRET(__wt_session_release_dhandle(session)); + + /* + * Continue when the table is corrupted and proceed to perform rollback to stable on other + * tables. + */ + if (ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION)) + continue; + WT_ERR(ret); } WT_ERR_NOTFOUND_OK(ret, false); diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py new file mode 100644 index 00000000000..323cc990d67 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, shutil, threading, time +from wtthread import checkpoint_thread, op_thread +from helper import copy_wiredtiger_home +import wiredtiger, wttest +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios +from wiredtiger import stat + +# test_checkpoint_snapshot02.py +# This test is to run checkpoint and eviction in parallel with timing +# stress for checkpoint and let eviction write more data than checkpoint. +# + +def timestamp_str(t): + return '%x' % t +class test_checkpoint_snapshot02(wttest.WiredTigerTestCase): + + # Create a table. + uri = "table:test_checkpoint_snapshot02" + nrows = 1000 + + def conn_config(self): + config = 'cache_size=10MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true),timing_stress_for_test=[checkpoint_slow]' + return config + + def large_updates(self, uri, value, ds, nrows): + # Update a large number of records. + session = self.session + cursor = session.open_cursor(uri) + for i in range(0, nrows): + session.begin_transaction() + cursor[ds.key(i)] = value + session.commit_transaction() + cursor.close() + + def check(self, check_value, uri, nrows): + session = self.session + session.begin_transaction() + cursor = session.open_cursor(uri) + count = 0 + for k, v in cursor: + self.assertEqual(v, check_value) + count += 1 + session.commit_transaction() + self.assertEqual(count, nrows) + + def test_checkpoint_snapshot(self): + + ds = SimpleDataSet(self, self.uri, 0, key_format="S", value_format="S",config='log=(enabled=false)') + ds.populate() + valuea = "aaaaa" * 100 + valueb = "bbbbb" * 100 + valuec = "ccccc" * 100 + valued = "ddddd" * 100 + + cursor = self.session.open_cursor(self.uri) + self.large_updates(self.uri, valuea, ds, self.nrows) + + self.check(valuea, self.uri, self.nrows) + + session1 = self.conn.open_session() + session1.begin_transaction() + cursor1 = session1.open_cursor(self.uri) + + for i in range(self.nrows, self.nrows*2): + cursor1.set_key(ds.key(i)) + cursor1.set_value(valuea) + self.assertEqual(cursor1.insert(), 0) + + # Create a checkpoint thread + done = threading.Event() + ckpt = checkpoint_thread(self.conn, done) + try: + ckpt.start() + # Sleep for sometime so that checkpoint starts before committing last transaction. + time.sleep(2) + session1.commit_transaction() + + finally: + done.set() + ckpt.join() + + #Simulate a crash by copying to a new directory(RESTART). + copy_wiredtiger_home(self, ".", "RESTART") + + # Open the new directory. + self.conn = self.setUpConnectionOpen("RESTART") + self.session = self.setUpSessionOpen(self.conn) + + # Check the table contains the last checkpointed value. + self.check(valuea, self.uri, self.nrows) + + stat_cursor = self.session.open_cursor('statistics:', None, None) + inconsistent_ckpt = stat_cursor[stat.conn.txn_rts_inconsistent_ckpt][2] + keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2] + keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2] + pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2] + upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2] + stat_cursor.close() + + self.assertGreater(inconsistent_ckpt, 0) + self.assertEqual(upd_aborted, 0) + self.assertGreaterEqual(keys_removed, 0) + self.assertEqual(keys_restored, 0) + self.assertGreaterEqual(pages_visited, 0) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py index f3c93509d63..7cfc3ba2fe7 100755 --- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py @@ -133,12 +133,11 @@ class test_rollback_to_stable05(test_rollback_to_stable_base): self.assertEqual(calls, 1) self.assertEqual(keys_removed, 0) self.assertEqual(keys_restored, 0) + self.assertGreaterEqual(pages_visited, 0) if self.in_memory: - self.assertGreaterEqual(pages_visited, 0) self.assertEqual(upd_aborted, 0) self.assertEqual(hs_removed, 0) else: - self.assertEqual(pages_visited, 0) self.assertEqual(upd_aborted, 0) self.assertEqual(hs_removed, 0) diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py index fbbd2146306..e5255cbe4fd 100755 --- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py @@ -120,7 +120,7 @@ class test_rollback_to_stable12(test_rollback_to_stable_base): self.assertGreater(pages_visited, 0) self.assertGreaterEqual(hs_removed, 0) self.assertEqual(hs_sweep, 0) - self.assertGreater(pages_walk_skipped, 0) + self.assertGreaterEqual(pages_walk_skipped, 0) if __name__ == '__main__': wttest.run() |