summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2021-03-01 16:33:28 +1100
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-03-01 06:03:15 +0000
commit 6dcf69cdd37b7fd8cc86b6d5412ceab67bceddfe (patch)
tree 7c1593caea8a43d0e43c994b9a7f64812ece5be6
parent f4054ff40c1c8309eefdbc87e84e8e8403e51281 (diff)
download mongo-6dcf69cdd37b7fd8cc86b6d5412ceab67bceddfe.tar.gz
Import wiredtiger: 9f6b212f1fe4a069ed18bf49ff237b31b2098c4c from branch mongodb-5.0
ref: 135a36dc0a..9f6b212f1f for: 4.9.0 WT-6673 RTS fix inconsistent checkpoint by removing updates outside of the checkpoint snapshot
-rw-r--r-- src/third_party/wiredtiger/dist/stat_data.py 1
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_handle.c 17
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_vrfy.c 4
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_dhandle.c 31
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 1
-rw-r--r-- src/third_party/wiredtiger/src/include/stat.h 2
-rw-r--r-- src/third_party/wiredtiger/src/include/wiredtiger.in 36
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_ckpt.c 22
-rw-r--r-- src/third_party/wiredtiger/src/os_posix/os_fs.c 10
-rw-r--r-- src/third_party/wiredtiger/src/support/stat.c 7
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c 2
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_recover.c 34
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c 228
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py 136
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py 3
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py 2
17 files changed, 444 insertions, 94 deletions
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 97c20322319..e6eab22645d 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -875,6 +875,7 @@ conn_dsrc_stats = [
TxnStat('txn_rts_hs_restore_updates', 'rollback to stable restored updates from history store'),
TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'),
TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'),
+ TxnStat('txn_rts_inconsistent_ckpt', 'rollback to stable inconsistent checkpoint'),
TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'),
TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'),
TxnStat('txn_rts_sweep_hs_keys', 'rollback to stable sweeping history store keys'),
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index a721db1e45c..2335d486a79 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
- "commit": "135a36dc0ad05fea990dc2c02696e68ce0fb287d"
+ "commit": "9f6b212f1fe4a069ed18bf49ff237b31b2098c4c"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 2d26a1bf573..1d0f9dd115d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -524,9 +524,22 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
/* If this is the first time opening the tree this run. */
if (F_ISSET(session, WT_SESSION_IMPORT) || ckpt->run_write_gen < conn->base_write_gen)
- btree->base_write_gen = btree->run_write_gen = btree->write_gen;
+ btree->run_write_gen = btree->write_gen;
else
- btree->base_write_gen = btree->run_write_gen = ckpt->run_write_gen;
+ btree->run_write_gen = ckpt->run_write_gen;
+
+ /*
+ * In recovery use the last checkpointed run write generation number as base write generation
+ * number to reset the transaction ids of the pages that were modified before the restart. The
+ * transaction ids are retained only on the pages that are written after the restart.
+ *
+ * Rollback to stable does not operate on logged tables and metadata, so it is skipped.
+ */
+ if (!F_ISSET(conn, WT_CONN_RECOVERING) || WT_IS_METADATA(btree->dhandle) ||
+ __wt_btree_immediately_durable(session))
+ btree->base_write_gen = btree->run_write_gen;
+ else
+ btree->base_write_gen = ckpt->run_write_gen;
/*
* We've just overwritten the runtime write generation based off the fact that we know that we're
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
index 13abe891dd8..f539d44daf7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -259,10 +259,6 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
*/
memset(&addr_unpack, 0, sizeof(addr_unpack));
WT_TIME_AGGREGATE_COPY(&addr_unpack.ta, &ckpt->ta);
- if (ckpt->write_gen <= btree->base_write_gen) {
- addr_unpack.ta.newest_txn = WT_TXN_NONE;
- addr_unpack.ta.newest_stop_txn = WT_TXN_MAX;
- }
if (ckpt->ta.prepare)
addr_unpack.ta.prepare = 1;
addr_unpack.raw = WT_CELL_ADDR_INT;
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 11379c1f419..1ba4cbe30ed 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -920,6 +920,37 @@ restart:
}
/*
+ * __wt_dhandle_update_write_gens --
+ * Update the open dhandles write generation, run write generation and base write generation
+ * number.
+ */
+void
+__wt_dhandle_update_write_gens(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+
+ conn = S2C(session);
+
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
+ if (dhandle == NULL)
+ break;
+ btree = (WT_BTREE *)dhandle->handle;
+
+ WT_ASSERT(session, btree != NULL);
+
+ /*
+ * Initialize the btree write generation numbers after rollback to stable so that the
+ * transaction ids of the pages will be reset when loaded from disk to memory.
+ */
+ btree->write_gen = btree->base_write_gen = btree->run_write_gen =
+ WT_MAX(btree->write_gen, conn->base_write_gen);
+ }
+}
+
+/*
* __wt_verbose_dump_handles --
* Dump information about all data handles.
*/
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index bf1505b2229..dfefe57ba26 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1685,6 +1685,7 @@ extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session);
extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst);
extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...);
extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...);
+extern void __wt_dhandle_update_write_gens(WT_SESSION_IMPL *session);
extern void __wt_encrypt_size(
WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep);
extern void __wt_err_func(
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 99bcb538f59..bb3ae8c176a 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -785,6 +785,7 @@ struct __wt_connection_stats {
int64_t tiered_retention;
int64_t txn_read_race_prepare_update;
int64_t txn_rts_hs_stop_older_than_newer_start;
+ int64_t txn_rts_inconsistent_ckpt;
int64_t txn_rts_keys_removed;
int64_t txn_rts_keys_restored;
int64_t txn_rts_hs_restore_tombstones;
@@ -1002,6 +1003,7 @@ struct __wt_dsrc_stats {
int64_t tiered_retention;
int64_t txn_read_race_prepare_update;
int64_t txn_rts_hs_stop_older_than_newer_start;
+ int64_t txn_rts_inconsistent_ckpt;
int64_t txn_rts_keys_removed;
int64_t txn_rts_keys_restored;
int64_t txn_rts_hs_restore_tombstones;
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index d041c231680..5ccd7bab902 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -6110,22 +6110,24 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
* than newer records
*/
#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1471
+/*! transaction: rollback to stable inconsistent checkpoint */
+#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1472
/*! transaction: rollback to stable keys removed */
-#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1472
+#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1473
/*! transaction: rollback to stable keys restored */
-#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1473
+#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1474
/*! transaction: rollback to stable restored tombstones from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1474
+#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1475
/*! transaction: rollback to stable restored updates from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1475
+#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1476
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1476
+#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1477
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1477
+#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1478
/*! transaction: transaction checkpoints due to obsolete pages */
-#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1478
+#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1479
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1479
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1480
/*!
* @}
@@ -6727,22 +6729,24 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
* than newer records
*/
#define WT_STAT_DSRC_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 2201
+/*! transaction: rollback to stable inconsistent checkpoint */
+#define WT_STAT_DSRC_TXN_RTS_INCONSISTENT_CKPT 2202
/*! transaction: rollback to stable keys removed */
-#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2202
+#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2203
/*! transaction: rollback to stable keys restored */
-#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2203
+#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2204
/*! transaction: rollback to stable restored tombstones from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2204
+#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2205
/*! transaction: rollback to stable restored updates from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2205
+#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2206
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2206
+#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2207
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2207
+#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2208
/*! transaction: transaction checkpoints due to obsolete pages */
-#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2208
+#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2209
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2209
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2210
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index 150880625a6..9b54431a8cf 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -1001,6 +1001,7 @@ __wt_meta_sysinfo_set(WT_SESSION_IMPL *session)
wt_timestamp_t oldest_timestamp;
uint32_t snap_count;
char hex_timestamp[WT_TS_HEX_STRING_SIZE];
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
txn_global = &S2C(session)->txn_global;
@@ -1044,19 +1045,26 @@ __wt_meta_sysinfo_set(WT_SESSION_IMPL *session)
}
/* Record snapshot information in metadata for checkpoint. */
- if (txn->snapshot_count > 0) {
- WT_ERR(__wt_buf_fmt(session, buf,
- WT_SYSTEM_CKPT_SNAPSHOT_MIN "=%" PRIu64 "," WT_SYSTEM_CKPT_SNAPSHOT_MAX "=%" PRIu64
- "," WT_SYSTEM_CKPT_SNAPSHOT_COUNT "=%" PRIu32
- "," WT_SYSTEM_CKPT_SNAPSHOT "=[",
- txn->snap_min, txn->snap_max, txn->snapshot_count));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ WT_SYSTEM_CKPT_SNAPSHOT_MIN "=%" PRIu64 "," WT_SYSTEM_CKPT_SNAPSHOT_MAX "=%" PRIu64
+ "," WT_SYSTEM_CKPT_SNAPSHOT_COUNT "=%" PRIu32,
+ txn->snap_min, txn->snap_max, txn->snapshot_count));
+ if (txn->snapshot_count > 0) {
+ WT_ERR(__wt_buf_catfmt(session, buf, "," WT_SYSTEM_CKPT_SNAPSHOT "=["));
for (snap_count = 0; snap_count < txn->snapshot_count - 1; ++snap_count)
WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], ","));
WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], "]"));
- WT_ERR(__wt_metadata_update(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, buf->data));
}
+ WT_ERR(__wt_metadata_update(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, buf->data));
+
+ __wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS,
+ "saving checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64
+ " snapshot count: %" PRIu32 ", oldest timestamp: %s , meta checkpoint timestamp: %s",
+ txn->snap_min, txn->snap_max, txn->snapshot_count,
+ __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string[0]),
+ __wt_timestamp_to_string(txn_global->meta_ckpt_timestamp, ts_string[1]));
/* Record the base write gen in metadata as part of checkpoint */
WT_ERR(__wt_buf_fmt(
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c
index ede22518c26..a472273de48 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c
@@ -748,7 +748,10 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const cha
f |= O_CLOEXEC;
#endif
WT_SYSCALL_RETRY(((pfh->fd = open(name, f, 0444)) == -1 ? -1 : 0), ret);
- if (ret != 0)
+ /* Return an error if the file is not found during rollback to stable. */
+ if (ret != 0 && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))
+ WT_ERR(__wt_errno());
+ else if (ret != 0)
WT_ERR_MSG(session, ret, "%s: handle-open: open-directory", name);
WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));
goto directory_open;
@@ -800,7 +803,10 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const cha
/* Create/Open the file. */
WT_SYSCALL_RETRY(((pfh->fd = open(name, f, mode)) == -1 ? -1 : 0), ret);
- if (ret != 0)
+ /* Return an error if the file is not found during rollback to stable. */
+ if (ret != 0 && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))
+ WT_ERR(ENOENT);
+ else if (ret != 0)
WT_ERR_MSG(session, ret,
pfh->direct_io ? "%s: handle-open: open: failed with direct I/O configured, some "
"filesystem types do not support direct I/O" :
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index c0c776bde2e..47605ab42f8 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -209,6 +209,7 @@ static const char *const __stats_dsrc_desc[] = {
"session: tiered storage local retention time (secs)",
"transaction: race to read prepared update retry",
"transaction: rollback to stable hs records with stop timestamps older than newer records",
+ "transaction: rollback to stable inconsistent checkpoint",
"transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored",
"transaction: rollback to stable restored tombstones from history store",
@@ -459,6 +460,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
/* not clearing tiered_retention */
stats->txn_read_race_prepare_update = 0;
stats->txn_rts_hs_stop_older_than_newer_start = 0;
+ stats->txn_rts_inconsistent_ckpt = 0;
stats->txn_rts_keys_removed = 0;
stats->txn_rts_keys_restored = 0;
stats->txn_rts_hs_restore_tombstones = 0;
@@ -696,6 +698,7 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to)
to->tiered_retention += from->tiered_retention;
to->txn_read_race_prepare_update += from->txn_read_race_prepare_update;
to->txn_rts_hs_stop_older_than_newer_start += from->txn_rts_hs_stop_older_than_newer_start;
+ to->txn_rts_inconsistent_ckpt += from->txn_rts_inconsistent_ckpt;
to->txn_rts_keys_removed += from->txn_rts_keys_removed;
to->txn_rts_keys_restored += from->txn_rts_keys_restored;
to->txn_rts_hs_restore_tombstones += from->txn_rts_hs_restore_tombstones;
@@ -939,6 +942,7 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to)
to->txn_read_race_prepare_update += WT_STAT_READ(from, txn_read_race_prepare_update);
to->txn_rts_hs_stop_older_than_newer_start +=
WT_STAT_READ(from, txn_rts_hs_stop_older_than_newer_start);
+ to->txn_rts_inconsistent_ckpt += WT_STAT_READ(from, txn_rts_inconsistent_ckpt);
to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones);
@@ -1434,6 +1438,7 @@ static const char *const __stats_connection_desc[] = {
"session: tiered storage local retention time (secs)",
"transaction: race to read prepared update retry",
"transaction: rollback to stable hs records with stop timestamps older than newer records",
+ "transaction: rollback to stable inconsistent checkpoint",
"transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored",
"transaction: rollback to stable restored tombstones from history store",
@@ -1954,6 +1959,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing tiered_retention */
stats->txn_read_race_prepare_update = 0;
stats->txn_rts_hs_stop_older_than_newer_start = 0;
+ stats->txn_rts_inconsistent_ckpt = 0;
stats->txn_rts_keys_removed = 0;
stats->txn_rts_keys_restored = 0;
stats->txn_rts_hs_restore_tombstones = 0;
@@ -2485,6 +2491,7 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->txn_read_race_prepare_update += WT_STAT_READ(from, txn_read_race_prepare_update);
to->txn_rts_hs_stop_older_than_newer_start +=
WT_STAT_READ(from, txn_rts_hs_stop_older_than_newer_start);
+ to->txn_rts_inconsistent_ckpt += WT_STAT_READ(from, txn_rts_inconsistent_ckpt);
to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones);
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 94396fc92f2..e77b31df9f2 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -2178,7 +2178,7 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char **cfg)
*/
if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP)) {
__wt_verbose(session, WT_VERB_RTS,
- "Performing shutdown rollback to stable with stable timestamp: %s",
+ "performing shutdown rollback to stable with stable timestamp: %s",
__wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string));
WT_TRET(__wt_rollback_to_stable(session, cfg, true));
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index e71b686d633..59acf9894b6 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -743,12 +743,14 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
char *config;
char ts_string[2][WT_TS_INT_STRING_SIZE];
bool do_checkpoint, eviction_started, hs_exists, needs_rec, was_backup;
+ bool rts_executed;
conn = S2C(session);
WT_CLEAR(r);
WT_INIT_LSN(&r.ckpt_lsn);
config = NULL;
do_checkpoint = hs_exists = true;
+ rts_executed = false;
eviction_started = false;
was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);
@@ -761,7 +763,6 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
F_SET(conn, WT_CONN_RECOVERING);
WT_ERR(__recovery_set_ckpt_base_write_gen(&r));
- WT_ERR(__recovery_set_checkpoint_snapshot(session));
WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
@@ -928,6 +929,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
done:
WT_ERR(__recovery_set_checkpoint_timestamp(&r));
WT_ERR(__recovery_set_oldest_timestamp(&r));
+ WT_ERR(__recovery_set_checkpoint_snapshot(session));
+
/*
* Perform rollback to stable only when the following conditions are met.
* 1. The connection is not read-only. A read-only connection expects that there shouldn't be
@@ -941,17 +944,6 @@ done:
eviction_started = true;
}
- /*
- * Currently, rollback to stable only needs to make changes to tables that use timestamps.
- * That is because eviction does not run in parallel with a checkpoint, so content that is
- * written never uses transaction IDs newer than the checkpoint's transaction ID and thus
- * never needs to be rolled back. Once eviction is allowed while a checkpoint is active, it
- * will be necessary to take the page write generation number into account during rollback
- * to stable. For example, a page with write generation 10 and txnid 20 is written in one
- * checkpoint, and in the next restart a new page with write generation 30 and txnid 20 is
- * written. The rollback to stable operation should only rollback the latest page changes
- * solely based on the write generation numbers.
- */
WT_ASSERT(session,
conn->txn_global.has_stable_timestamp == false &&
conn->txn_global.stable_timestamp == WT_TS_NONE);
@@ -967,19 +959,31 @@ done:
conn->txn_global.has_stable_timestamp = true;
__wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RTS,
- "Performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: "
+ "performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: "
"%s",
__wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string[0]),
__wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string[1]));
+ rts_executed = true;
+ WT_ERR(__wt_rollback_to_stable(session, NULL, true));
+ }
- WT_ERR(__wt_rollback_to_stable(session, NULL, false));
- } else if (do_checkpoint)
+ if (do_checkpoint || rts_executed)
/*
* Forcibly log a checkpoint so the next open is fast and keep the metadata up to date with
* the checkpoint LSN and archiving.
*/
WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+ /* Initialize the connection's base write generation after rollback to stable. */
+ WT_ERR(__wt_metadata_init_base_write_gen(session));
+
+ /*
+ * Update the open dhandles' write generations and base write generation with the connection's
+ * base write generation because the recovery checkpoint writes the pages to disk with a new write
+ * generation number, and these pages contain transaction ids that need to be reset later.
+ */
+ __wt_dhandle_update_write_gens(session);
+
/*
* If we're downgrading and have newer log files, force an archive, no matter what the archive
* setting is.
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 0001d09302b..13d9eba8d58 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -8,9 +8,14 @@
#include "wt_internal.h"
+#define WT_CHECK_RECOVERY_FLAG_TS_TXNID(session, txnid, durablets) \
+ (durablets == WT_TS_NONE && F_ISSET(S2C(session), WT_CONN_RECOVERING) && \
+ (txnid) >= S2C(session)->recovery_ckpt_snap_min)
+
/* Enable rollback to stable verbose messaging during recovery. */
#define WT_VERB_RECOVERY_RTS(session) \
(F_ISSET(S2C(session), WT_CONN_RECOVERING) ? WT_VERB_RECOVERY | WT_VERB_RTS : WT_VERB_RTS)
+
/*
* __rollback_abort_newer_update --
* Abort updates in an update chain with timestamps newer than the rollback timestamp. Also,
@@ -149,6 +154,53 @@ err:
}
/*
+ * __rollback_check_if_txnid_non_committed --
+ * Check if the transaction id is non committed.
+ */
+static bool
+__rollback_check_if_txnid_non_committed(WT_SESSION_IMPL *session, uint64_t txnid)
+{
+ WT_CONNECTION_IMPL *conn;
+ bool found;
+
+ conn = S2C(session);
+
+ /* If not recovery then assume all the data as committed. */
+ if (!F_ISSET(conn, WT_CONN_RECOVERING))
+ return (false);
+
+ /*
+ * Only a full checkpoint writes the metadata with a snapshot. If the recovered checkpoint snapshot
+ * details are zero then return false, i.e., the updates are committed.
+ */
+ if (conn->recovery_ckpt_snap_min == 0 && conn->recovery_ckpt_snap_max == 0)
+ return (false);
+
+ /*
+ * Snapshot data:
+ * ids < recovery_ckpt_snap_min are committed,
+ * ids > recovery_ckpt_snap_max are non committed,
+ * everything else is committed unless it is found in the recovery_ckpt_snapshot array.
+ */
+ if (txnid < conn->recovery_ckpt_snap_min)
+ return (false);
+ else if (txnid > conn->recovery_ckpt_snap_max)
+ return (true);
+
+ /*
+ * Return false when the recovery snapshot count is 0, which means there are no uncommitted
+ * transaction ids.
+ */
+ if (conn->recovery_ckpt_snapshot_count == 0)
+ return (false);
+
+ WT_BINARY_SEARCH(
+ txnid, conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count, found);
+
+ return (found);
+}
+
+/*
* __rollback_row_ondisk_fixup_key --
* Abort updates in the history store and replace the on-disk value with an update that
* satisfies the given timestamp.
@@ -295,40 +347,51 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
/*
* Stop processing when we find the newer version value of this key is stable according to
- * the current version stop timestamp when it is not appending the selected update to the
- * update chain. Also it confirms that history store doesn't contains any newer version than
- * the current version for the key.
+ * the current version stop timestamp and transaction id when it is not appending the
+ * selected update to the update chain. Also it confirms that the history store doesn't contain
+ * any newer version than the current version for the key.
*/
- if (!replace && hs_stop_durable_ts <= rollback_timestamp) {
+ if (!replace &&
+ (hs_stop_durable_ts != WT_TS_NONE ||
+ !__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.stop_txn)) &&
+ (hs_stop_durable_ts <= rollback_timestamp)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "history store update valid with stop timestamp: %s, stable timestamp: %s and type: "
- "%" PRIu8,
+ "history store update valid with stop timestamp: %s, stable timestamp: %s, txnid: "
+ "%" PRIu64 " and type: %" PRIu8,
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), type);
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]),
+ cbt->upd_value->tw.stop_txn, type);
break;
}
- /* Stop processing when we find a stable update according to the given timestamp. */
- if (hs_durable_ts <= rollback_timestamp) {
+ /*
+ * Stop processing when we find a stable update according to the given timestamp and
+ * transaction id.
+ */
+ if ((hs_durable_ts != WT_TS_NONE ||
+ !__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.start_txn)) &&
+ (hs_durable_ts <= rollback_timestamp)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update valid with start timestamp: %s, durable timestamp: %s, stop "
- "timestamp: %s, stable timestamp: %s and type: %" PRIu8,
+ "timestamp: %s, stable timestamp: %s, txnid: %" PRIu64 " and type: %" PRIu8,
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type);
- WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts);
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[3]),
+ cbt->upd_value->tw.start_txn, type);
valid_update_found = true;
break;
}
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update aborted with start timestamp: %s, durable timestamp: %s, stop "
- "timestamp: %s, stable timestamp: %s and type: %" PRIu8,
+ "timestamp: %s, stable timestamp: %s, start txnid: %" PRIu64 ", stop txnid: %" PRIu64
+ " and type: %" PRIu8,
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type);
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), cbt->upd_value->tw.start_txn,
+ cbt->upd_value->tw.stop_txn, type);
/*
* Start time point of the current record may be used as stop time point of the previous
@@ -352,14 +415,25 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* list. Otherwise remove the key by adding a tombstone.
*/
if (valid_update_found) {
- WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts);
+ WT_ASSERT(session,
+ cbt->upd_value->tw.start_ts < unpack->tw.start_ts ||
+ cbt->upd_value->tw.start_txn < unpack->tw.start_txn);
WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL));
- upd->txnid = cbt->upd_value->tw.start_txn;
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because
+ * the connection's write generation will be initialized after rollback to stable and the
+ * updates in the cache will be problematic. The transaction id of pages which are in
+ * disk will be automatically reset as part of unpacking cell when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ upd->txnid = WT_TXN_NONE;
+ else
+ upd->txnid = cbt->upd_value->tw.start_txn;
upd->durable_ts = cbt->upd_value->tw.durable_start_ts;
upd->start_ts = cbt->upd_value->tw.start_ts;
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "update restored from history store (txnid: %" PRIu64
+ "update restored from history store txnid: %" PRIu64
", start_ts: %s and durable_ts: %s",
upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
__wt_timestamp_to_string(upd->durable_ts, ts_string[1]));
@@ -378,11 +452,21 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
if (hs_stop_durable_ts <= rollback_timestamp &&
hs_stop_durable_ts < newer_hs_durable_ts) {
WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL));
- tombstone->txnid = cbt->upd_value->tw.stop_txn;
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery,
+ * because the connection's write generation will be initialized after rollback to
+ * stable and the updates in the cache will be problematic. The transaction id of
+ * pages which are in disk will be automatically reset as part of unpacking cell
+ * when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ tombstone->txnid = WT_TXN_NONE;
+ else
+ tombstone->txnid = cbt->upd_value->tw.stop_txn;
tombstone->durable_ts = cbt->upd_value->tw.durable_stop_ts;
tombstone->start_ts = cbt->upd_value->tw.stop_ts;
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "tombstone restored from history store (txnid: %" PRIu64
+ "tombstone restored from history store txnid: %" PRIu64
", start_ts: %s, durable_ts: %s",
tombstone->txnid, __wt_timestamp_to_string(tombstone->start_ts, ts_string[0]),
__wt_timestamp_to_string(tombstone->durable_ts, ts_string[1]));
@@ -468,14 +552,16 @@ __rollback_abort_row_ondisk_kv(
WT_STAT_CONN_DATA_INCR(session, txn_rts_sweep_hs_keys);
} else
return (0);
- } else if (vpack->tw.durable_start_ts > rollback_timestamp ||
+ } else if (((vpack->tw.durable_start_ts > rollback_timestamp) ||
+ (vpack->tw.durable_start_ts == WT_TS_NONE &&
+ __rollback_check_if_txnid_non_committed(session, vpack->tw.start_txn))) ||
(!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"on-disk update aborted with start durable timestamp: %s, commit timestamp: %s, "
- "prepared: %s and stable timestamp: %s",
+ "prepared: %s, stable timestamp: %s and txnid : %" PRIu64,
__wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]),
__wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]), prepared ? "true" : "false",
- __wt_timestamp_to_string(rollback_timestamp, ts_string[2]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), vpack->tw.start_txn);
if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
return (__rollback_row_ondisk_fixup_key(session, page, rip, rollback_timestamp, true));
else {
@@ -487,7 +573,10 @@ __rollback_abort_row_ondisk_kv(
WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed);
}
} else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) &&
- (vpack->tw.durable_stop_ts > rollback_timestamp || prepared)) {
+ (((vpack->tw.durable_stop_ts > rollback_timestamp) ||
+ (vpack->tw.durable_stop_ts == WT_TS_NONE &&
+ __rollback_check_if_txnid_non_committed(session, vpack->tw.stop_txn))) ||
+ prepared)) {
/*
* Clear the remove operation from the key by inserting the original on-disk value as a
* standard update.
@@ -495,19 +584,30 @@ __rollback_abort_row_ondisk_kv(
WT_RET(__wt_page_cell_data_ref(session, page, vpack, &buf));
WT_ERR(__wt_upd_alloc(session, &buf, WT_UPDATE_STANDARD, &upd, NULL));
- upd->txnid = vpack->tw.start_txn;
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because the
+ * connection's write generation will be initialized after rollback to stable and the
+ * updates in the cache will be problematic. The transaction id of pages which are on disk
+ * will be automatically reset as part of cell unpacking when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ upd->txnid = WT_TXN_NONE;
+ else
+ upd->txnid = vpack->tw.start_txn;
upd->durable_ts = vpack->tw.durable_start_ts;
upd->start_ts = vpack->tw.start_ts;
F_SET(upd, WT_UPDATE_RESTORED_FROM_DS);
WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_restored);
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "key restored with commit timestamp: %s, durable timestamp: %s txnid: %" PRIu64
- "and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64
+ "key restored with commit timestamp: %s, durable timestamp: %s, stable timestamp: %s, "
+ "txnid: %" PRIu64
+ " and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64
", prepared: %s",
__wt_timestamp_to_string(upd->start_ts, ts_string[0]),
- __wt_timestamp_to_string(upd->durable_ts, ts_string[1]), upd->txnid,
- __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[2]),
- __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[3]), vpack->tw.stop_txn,
+ __wt_timestamp_to_string(upd->durable_ts, ts_string[1]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), upd->txnid,
+ __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]),
+ __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[4]), vpack->tw.stop_txn,
prepared ? "true" : "false");
} else
/* Stable version according to the timestamp. */
@@ -760,6 +860,7 @@ __rollback_page_needs_abort(
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
wt_timestamp_t durable_ts;
+ uint64_t newest_txn;
uint32_t i;
char ts_string[WT_TS_INT_STRING_SIZE];
const char *tag;
@@ -768,12 +869,14 @@ __rollback_page_needs_abort(
addr = ref->addr;
mod = ref->page == NULL ? NULL : ref->page->modify;
durable_ts = WT_TS_NONE;
+ newest_txn = WT_TXN_NONE;
tag = "undefined state";
prepared = result = false;
/*
* The rollback operation should be performed on this page when any one of the following is
- * greater than the given timestamp:
+ * greater than the given timestamp or during recovery if the newest transaction id on the page
+ * is greater than or equal to recovered checkpoint snapshot min:
* 1. The reconciled replace page max durable timestamp.
* 2. The reconciled multi page max durable timestamp.
* 3. The on page address max durable timestamp.
@@ -800,17 +903,22 @@ __rollback_page_needs_abort(
__wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack);
durable_ts = __rollback_get_ref_max_durable_timestamp(session, &vpack.ta);
prepared = vpack.ta.prepare;
- result = (durable_ts > rollback_timestamp) || prepared;
+ newest_txn = vpack.ta.newest_txn;
+ result = (durable_ts > rollback_timestamp) || prepared ||
+ WT_CHECK_RECOVERY_FLAG_TS_TXNID(session, newest_txn, durable_ts);
} else if (addr != NULL) {
tag = "address";
durable_ts = __rollback_get_ref_max_durable_timestamp(session, &addr->ta);
prepared = addr->ta.prepare;
- result = (durable_ts > rollback_timestamp) || prepared;
+ newest_txn = addr->ta.newest_txn;
+ result = (durable_ts > rollback_timestamp) || prepared ||
+ WT_CHECK_RECOVERY_FLAG_TS_TXNID(session, newest_txn, durable_ts);
}
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "%p: page with %s durable timestamp: %s and prepared updates: %s", (void *)ref, tag,
- __wt_timestamp_to_string(durable_ts, ts_string), prepared ? "true" : "false");
+ "%p: page with %s durable timestamp: %s, newest txn: %" PRIu64 " and prepared updates: %s",
+ (void *)ref, tag, __wt_timestamp_to_string(durable_ts, ts_string), newest_txn,
+ prepared ? "true" : "false");
return (result);
}
@@ -929,6 +1037,9 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac
WT_DECL_RET;
WT_REF *child_ref, *ref;
+ /* Set this flag to return error instead of panic if file is corrupted. */
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
+
/* Walk the tree, marking commits aborted where appropriate. */
ref = NULL;
while ((ret = __wt_tree_walk_custom_skip(session, &ref, __wt_rts_page_skip, &rollback_timestamp,
@@ -942,6 +1053,7 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac
} else
WT_RET(__rollback_abort_newer_updates(session, ref, rollback_timestamp));
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
return (ret);
}
@@ -1176,12 +1288,14 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts,
rollback_timestamp;
+ uint64_t rollback_txnid;
size_t addr_size;
char ts_string[2][WT_TS_INT_STRING_SIZE];
const char *config, *uri;
- bool durable_ts_found, prepared_updates;
+ bool durable_ts_found, prepared_updates, has_txn_updates_gt_than_ckpt_snap;
txn_global = &S2C(session)->txn_global;
+ rollback_txnid = 0;
addr_size = 0;
/*
@@ -1198,6 +1312,13 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
WT_RET(__wt_metadata_cursor(session, &cursor));
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ __wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
+ "recovered checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64
+ ", snapshot count: %" PRIu32,
+ S2C(session)->recovery_ckpt_snap_min, S2C(session)->recovery_ckpt_snap_max,
+ S2C(session)->recovery_ckpt_snapshot_count);
+
while ((ret = cursor->next(cursor)) == 0) {
WT_ERR(cursor->get_key(cursor, &uri));
@@ -1212,7 +1333,7 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
/* Find out the max durable timestamp of the object from checkpoint. */
newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE;
- durable_ts_found = prepared_updates = false;
+ durable_ts_found = prepared_updates = has_txn_updates_gt_than_ckpt_snap = false;
WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval));
__wt_config_subinit(session, &ckptconf, &cval);
for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) {
@@ -1235,12 +1356,22 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
prepared_updates = true;
}
WT_ERR_NOTFOUND_OK(ret, false);
+ ret = __wt_config_subgets(session, &cval, "newest_txn", &value);
+ if (value.len != 0)
+ rollback_txnid = (uint64_t)value.val;
+ WT_ERR_NOTFOUND_OK(ret, false);
ret = __wt_config_subgets(session, &cval, "addr", &value);
if (ret == 0)
addr_size = value.len;
WT_ERR_NOTFOUND_OK(ret, false);
}
max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts);
+ has_txn_updates_gt_than_ckpt_snap =
+ WT_CHECK_RECOVERY_FLAG_TS_TXNID(session, rollback_txnid, max_durable_ts);
+
+ /* Increment the inconsistent checkpoint stats counter. */
+ if (has_txn_updates_gt_than_ckpt_snap)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_inconsistent_ckpt);
/*
* The rollback to stable will skip the tables during recovery and shutdown in the following
@@ -1253,7 +1384,7 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
(addr_size == 0 ||
(txn_global->stable_timestamp == WT_TS_NONE && max_durable_ts != WT_TS_NONE))) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "Skip rollback to stable on file %s because %s", uri,
+ "skip rollback to stable on file %s because %s", uri,
addr_size == 0 ? "its checkpoint address length is 0" :
"it has timestamped updates and the stable timestamp is 0");
continue;
@@ -1271,7 +1402,7 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
if ((ret == ENOENT) ||
(ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION))) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "Ignore performing rollback to stable on %s because the file %s", uri,
+ "ignore performing rollback to stable on %s because the file %s", uri,
ret == ENOENT ? "does not exist" : "is corrupted.");
continue;
}
@@ -1282,21 +1413,24 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
* 1. The tree is modified.
* 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp.
* 3. There is no durable timestamp in any checkpoint.
+ * 4. The checkpoint newest txn is greater than the snapshot min txn id.
*/
if (S2BT(session)->modified || max_durable_ts > rollback_timestamp || prepared_updates ||
- !durable_ts_found) {
+ !durable_ts_found || has_txn_updates_gt_than_ckpt_snap) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"tree rolled back with durable timestamp: %s, or when tree is modified: %s or "
- "prepared updates: %s or when durable time is not found: %s",
+ "prepared updates: %s or when durable time is not found: %s or txnid: %" PRIu64
+ " is greater than recovery checkpoint snap min: %s",
__wt_timestamp_to_string(max_durable_ts, ts_string[0]),
S2BT(session)->modified ? "true" : "false", prepared_updates ? "true" : "false",
- !durable_ts_found ? "true" : "false");
+ !durable_ts_found ? "true" : "false", rollback_txnid,
+ has_txn_updates_gt_than_ckpt_snap ? "true" : "false");
WT_TRET(__rollback_to_stable_btree(session, rollback_timestamp));
} else
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "tree skipped with durable timestamp: %s and stable timestamp: %s",
+ "tree skipped with durable timestamp: %s and stable timestamp: %s or txnid: %" PRIu64,
__wt_timestamp_to_string(max_durable_ts, ts_string[0]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), rollback_txnid);
/*
* Truncate history store entries for the non-timestamped table.
@@ -1312,6 +1446,14 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
WT_TRET(__rollback_to_stable_btree_hs_truncate(session, S2BT(session)->id));
WT_TRET(__wt_session_release_dhandle(session));
+
+ /*
+ * Continue when the table is corrupted and proceed to perform rollback to stable on other
+ * tables.
+ */
+ if (ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION))
+ continue;
+
WT_ERR(ret);
}
WT_ERR_NOTFOUND_OK(ret, false);
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py
new file mode 100644
index 00000000000..323cc990d67
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, threading, time
+from wtthread import checkpoint_thread, op_thread
+from helper import copy_wiredtiger_home
+import wiredtiger, wttest
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+from wiredtiger import stat
+
+# test_checkpoint_snapshot02.py
+# This test is to run checkpoint and eviction in parallel with timing
+# stress for checkpoint and let eviction write more data than checkpoint.
+#
+
+def timestamp_str(t):
+ return '%x' % t
+class test_checkpoint_snapshot02(wttest.WiredTigerTestCase):
+
+ # Create a table.
+ uri = "table:test_checkpoint_snapshot02"
+ nrows = 1000
+
+ def conn_config(self):
+ config = 'cache_size=10MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true),timing_stress_for_test=[checkpoint_slow]'
+ return config
+
+ def large_updates(self, uri, value, ds, nrows):
+ # Update a large number of records.
+ session = self.session
+ cursor = session.open_cursor(uri)
+ for i in range(0, nrows):
+ session.begin_transaction()
+ cursor[ds.key(i)] = value
+ session.commit_transaction()
+ cursor.close()
+
+ def check(self, check_value, uri, nrows):
+ session = self.session
+ session.begin_transaction()
+ cursor = session.open_cursor(uri)
+ count = 0
+ for k, v in cursor:
+ self.assertEqual(v, check_value)
+ count += 1
+ session.commit_transaction()
+ self.assertEqual(count, nrows)
+
+ def test_checkpoint_snapshot(self):
+
+ ds = SimpleDataSet(self, self.uri, 0, key_format="S", value_format="S",config='log=(enabled=false)')
+ ds.populate()
+ valuea = "aaaaa" * 100
+ valueb = "bbbbb" * 100
+ valuec = "ccccc" * 100
+ valued = "ddddd" * 100
+
+ cursor = self.session.open_cursor(self.uri)
+ self.large_updates(self.uri, valuea, ds, self.nrows)
+
+ self.check(valuea, self.uri, self.nrows)
+
+ session1 = self.conn.open_session()
+ session1.begin_transaction()
+ cursor1 = session1.open_cursor(self.uri)
+
+ for i in range(self.nrows, self.nrows*2):
+ cursor1.set_key(ds.key(i))
+ cursor1.set_value(valuea)
+ self.assertEqual(cursor1.insert(), 0)
+
+ # Create a checkpoint thread
+ done = threading.Event()
+ ckpt = checkpoint_thread(self.conn, done)
+ try:
+ ckpt.start()
+ # Sleep for some time so that the checkpoint starts before committing the last transaction.
+ time.sleep(2)
+ session1.commit_transaction()
+
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Simulate a crash by copying to a new directory (RESTART).
+ copy_wiredtiger_home(self, ".", "RESTART")
+
+ # Open the new directory.
+ self.conn = self.setUpConnectionOpen("RESTART")
+ self.session = self.setUpSessionOpen(self.conn)
+
+ # Check the table contains the last checkpointed value.
+ self.check(valuea, self.uri, self.nrows)
+
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ inconsistent_ckpt = stat_cursor[stat.conn.txn_rts_inconsistent_ckpt][2]
+ keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+ keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+ pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+ upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+ stat_cursor.close()
+
+ self.assertGreater(inconsistent_ckpt, 0)
+ self.assertEqual(upd_aborted, 0)
+ self.assertGreaterEqual(keys_removed, 0)
+ self.assertEqual(keys_restored, 0)
+ self.assertGreaterEqual(pages_visited, 0)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py
index f3c93509d63..7cfc3ba2fe7 100755
--- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py
@@ -133,12 +133,11 @@ class test_rollback_to_stable05(test_rollback_to_stable_base):
self.assertEqual(calls, 1)
self.assertEqual(keys_removed, 0)
self.assertEqual(keys_restored, 0)
+ self.assertGreaterEqual(pages_visited, 0)
if self.in_memory:
- self.assertGreaterEqual(pages_visited, 0)
self.assertEqual(upd_aborted, 0)
self.assertEqual(hs_removed, 0)
else:
- self.assertEqual(pages_visited, 0)
self.assertEqual(upd_aborted, 0)
self.assertEqual(hs_removed, 0)
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py
index fbbd2146306..e5255cbe4fd 100755
--- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py
@@ -120,7 +120,7 @@ class test_rollback_to_stable12(test_rollback_to_stable_base):
self.assertGreater(pages_visited, 0)
self.assertGreaterEqual(hs_removed, 0)
self.assertEqual(hs_sweep, 0)
- self.assertGreater(pages_walk_skipped, 0)
+ self.assertGreaterEqual(pages_walk_skipped, 0)
if __name__ == '__main__':
wttest.run()