summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2022-11-21 14:16:08 +1100
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-11-21 03:50:48 +0000
commit80003a2abe375e7512871006510e8cce4f010b05 (patch)
treeb0d1e6a9a8b7c1677de39a6058cbe3038e409b5b
parent866c9351beacb4c26b3a3083dbbefe184425f91b (diff)
downloadmongo-80003a2abe375e7512871006510e8cce4f010b05.tar.gz
Import wiredtiger: 185cb659029fc4e5213783c5d290aa0fab5ab043 from branch mongodb-6.2
ref: 47951efac1..185cb65902 for: 6.2.0-rc2 WT-10164 Split apart the rollback to stable implementation into multiple files
-rw-r--r--src/third_party/wiredtiger/dist/filelist7
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_open.c1
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox4
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h1
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h28
-rw-r--r--src/third_party/wiredtiger/src/include/rollback_to_stable.h28
-rw-r--r--src/third_party/wiredtiger/src/include/wt_internal.h3
-rw-r--r--src/third_party/wiredtiger/src/rollback_to_stable/rts.c151
-rw-r--r--src/third_party/wiredtiger/src/rollback_to_stable/rts_api.c168
-rw-r--r--src/third_party/wiredtiger/src/rollback_to_stable/rts_btree.c1083
-rw-r--r--src/third_party/wiredtiger/src/rollback_to_stable/rts_btree_walk.c320
-rw-r--r--src/third_party/wiredtiger/src/rollback_to_stable/rts_history.c230
-rw-r--r--src/third_party/wiredtiger/src/rollback_to_stable/rts_visibility.c156
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c3
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c2
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c2
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c2056
19 files changed, 2179 insertions, 2068 deletions
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist
index de5ac7fc1f1..b5de826bcf1 100644
--- a/src/third_party/wiredtiger/dist/filelist
+++ b/src/third_party/wiredtiger/dist/filelist
@@ -194,6 +194,12 @@ src/schema/schema_worker.c
src/session/session_api.c
src/session/session_compact.c
src/session/session_dhandle.c
+src/rollback_to_stable/rts_api.c
+src/rollback_to_stable/rts_btree.c
+src/rollback_to_stable/rts_btree_walk.c
+src/rollback_to_stable/rts.c
+src/rollback_to_stable/rts_history.c
+src/rollback_to_stable/rts_visibility.c
src/support/cond_auto.c
src/support/crypto.c
src/support/err.c
@@ -221,5 +227,4 @@ src/txn/txn.c
src/txn/txn_ckpt.c
src/txn/txn_log.c
src/txn/txn_recover.c
-src/txn/txn_rollback_to_stable.c
src/txn/txn_timestamp.c
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index f3450f2d4ee..51fdceb998e 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-6.2",
- "commit": "47951efac16fbf8e5d44230ad18fce1b18858cda"
+ "commit": "185cb659029fc4e5213783c5d290aa0fab5ab043"
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 38bd79e6230..e6c8f0893fb 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1361,7 +1361,7 @@ __conn_rollback_to_stable(WT_CONNECTION *wt_conn, const char *config)
CONNECTION_API_CALL(conn, session, rollback_to_stable, config, cfg);
WT_STAT_CONN_INCR(session, txn_rts);
- ret = __wt_rollback_to_stable(session, cfg, false);
+ ret = conn->rts->rollback_to_stable(session, cfg, false);
err:
API_END_RET(session, ret);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index 322c7ba6b50..1ca0d683278 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -51,6 +51,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
/* Initialize transaction support. */
WT_RET(__wt_txn_global_init(session, cfg));
+ __wt_rollback_to_stable_init(conn);
WT_STAT_CONN_SET(session, dh_conn_handle_size, sizeof(WT_DATA_HANDLE));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox b/src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox
index fba8e415089..a775c35008b 100644
--- a/src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox
@@ -829,10 +829,10 @@ Call \c __wt_txn_op_delete_apply_prepare_state.}
@row{txn.c, Hook, in \c __wt_txn_rollback,
Call \c __wt_delete_page_rollback.}
-@row{txn_rollback_to_stable.c, Hook, in \c __rollback_page_needs_abort,
+@row{rts_visibility.c, Hook, in \c __wt_rts_visibility_page_needs_abort,
Check \c page_del when deciding whether the page contains unstable values that need to be
examined.}
-@row{txn_rollback_to_stable.c, Hook, in \c __rollback_to_stable_page_skip,
+@row{rts_btree_walk.c, Hook, in \c __rollback_to_stable_page_skip,
Check \c page del when deciding whether to skip over the page.}
</table>
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 629088c2aab..e68db066f72 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -497,6 +497,7 @@ struct __wt_connection_impl {
uint16_t log_req_min; /* Min required log version */
uint32_t txn_logsync; /* Log sync configuration */
+ WT_ROLLBACK_TO_STABLE *rts, _rts; /* Rollback to stable subsystem */
WT_SESSION_IMPL *meta_ckpt_session; /* Metadata checkpoint session */
/*
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 3d88ffc24ca..6d466ba2ca3 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -33,6 +33,12 @@ extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_read_cell_time_window(WT_CURSOR_BTREE *cbt, WT_TIME_WINDOW *tw)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern bool __wt_rts_visibility_has_stable_update(WT_UPDATE *upd)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern bool __wt_rts_visibility_page_needs_abort(WT_SESSION_IMPL *session, WT_REF *ref,
+ wt_timestamp_t rollback_timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern bool __wt_rts_visibility_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_txn_active(WT_SESSION_IMPL *session, uint64_t txnid)
@@ -1284,10 +1290,6 @@ extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name, boo
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_reset_blkmod(WT_SESSION_IMPL *session, const char *orig_config, WT_ITEM *buf)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_rollback_to_stable_one(WT_SESSION_IMPL *session, const char *uri, bool *skipp)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key,
size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key,
@@ -1309,6 +1311,23 @@ extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_IT
) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *leaf,
bool leaf_safe, bool *leaf_foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_btree_abort_updates(WT_SESSION_IMPL *session, WT_REF *ref,
+ wt_timestamp_t rollback_timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_btree_apply_all(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_btree_walk_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_btree_walk_btree_apply(
+ WT_SESSION_IMPL *session, const char *uri, const char *config, wt_timestamp_t rollback_timestamp)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_check(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_history_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_history_delete_hs(WT_SESSION_IMPL *session, WT_ITEM *key, wt_timestamp_t ts)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_history_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
@@ -1861,6 +1880,7 @@ extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r);
extern void __wt_rec_dictionary_reset(WT_RECONCILE *r);
extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
+extern void __wt_rollback_to_stable_init(WT_CONNECTION_IMPL *conn);
extern void __wt_root_ref_init(
WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno);
extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l);
diff --git a/src/third_party/wiredtiger/src/include/rollback_to_stable.h b/src/third_party/wiredtiger/src/include/rollback_to_stable.h
new file mode 100644
index 00000000000..5c1fd555e43
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/rollback_to_stable.h
@@ -0,0 +1,28 @@
+/*-
+ * Copyright (c) 2014-present MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_CHECK_RECOVERY_FLAG_TXNID(session, txnid) \
+ (F_ISSET(S2C(session), WT_CONN_RECOVERING) && S2C(session)->recovery_ckpt_snap_min != 0 && \
+ (txnid) >= S2C(session)->recovery_ckpt_snap_min)
+
+/* Rollback to stable verbose categories: WT_VERB_RTS, plus WT_VERB_RECOVERY while recovering. */
+#define WT_VERB_RECOVERY_RTS(session) \
+ (F_ISSET(S2C(session), WT_CONN_RECOVERING) ? \
+ WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RECOVERY, WT_VERB_RTS})) : \
+ WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RTS})))
+
+/*
+ * WT_ROLLBACK_TO_STABLE --
+ * Rollback to stable singleton, contains the interface to rollback to stable along
+ * with context used by rollback to stable.
+ */
+struct __wt_rollback_to_stable {
+ /* Methods, installed by __wt_rollback_to_stable_init. */
+ int (*rollback_to_stable_one)(WT_SESSION_IMPL *, const char *, bool *);
+ int (*rollback_to_stable)(WT_SESSION_IMPL *, const char *[], bool);
+};
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 53072c2df5e..c5e53d12d36 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -313,6 +313,8 @@ struct __wt_ref;
typedef struct __wt_ref WT_REF;
struct __wt_ref_hist;
typedef struct __wt_ref_hist WT_REF_HIST;
+struct __wt_rollback_to_stable;
+typedef struct __wt_rollback_to_stable WT_ROLLBACK_TO_STABLE;
struct __wt_row;
typedef struct __wt_row WT_ROW;
struct __wt_rwlock;
@@ -445,6 +447,7 @@ typedef uint64_t wt_timestamp_t;
#include "optrack.h"
#include "os.h"
#include "reconcile.h"
+#include "rollback_to_stable.h"
#include "schema.h"
#include "thread_group.h"
#include "tiered.h"
diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts.c
new file mode 100644
index 00000000000..7a0f35f5164
--- /dev/null
+++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts.c
@@ -0,0 +1,151 @@
+/*-
+ * Copyright (c) 2014-present MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rts_check --
+ * Best-effort check that no user session has a running transaction or an active file cursor.
+ */
+int
+__wt_rts_check(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session_in_list;
+ uint32_t i, session_cnt;
+ bool cursor_active, txn_active;
+
+ conn = S2C(session);
+ cursor_active = txn_active = false;
+
+ WT_STAT_CONN_INCR(session, txn_walk_sessions);
+
+ /*
+ * Help the user comply with the requirement there be no concurrent user operations. It is okay
+ * to have a transaction in the prepared state.
+ *
+ * WT_TXN structures are allocated and freed as sessions are activated and closed. Lock the
+ * session open/close to ensure we don't race. This call is a rarely used RTS-only function,
+ * acquiring the lock shouldn't be an issue.
+ */
+ __wt_spin_lock(session, &conn->api_lock);
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, session_in_list = conn->sessions; i < session_cnt; i++, session_in_list++) {
+
+ /* Skip inactive or internal sessions. */
+ if (!session_in_list->active || F_ISSET(session_in_list, WT_SESSION_INTERNAL))
+ continue;
+
+ /* Check if a user session has a running transaction; stop at the first one found. */
+ if (F_ISSET(session_in_list->txn, WT_TXN_RUNNING)) {
+ txn_active = true;
+ break;
+ }
+
+ /* Check if a user session has an active file cursor; stop at the first one found. */
+ if (session_in_list->ncursors != 0) {
+ cursor_active = true;
+ break;
+ }
+ }
+ __wt_spin_unlock(session, &conn->api_lock);
+
+ /*
+ * A new cursor may be positioned or a transaction may start after we return from this call and
+ * callers should be aware of this limitation.
+ */
+ if (cursor_active)
+ WT_RET_MSG(session, EBUSY, "rollback_to_stable illegal with active file cursors");
+ if (txn_active) {
+ ret = EBUSY;
+ WT_TRET(__wt_verbose_dump_txn(session));
+ WT_RET_MSG(session, ret, "rollback_to_stable illegal with active transactions");
+ }
+ return (0);
+}
+
+/*
+ * __rts_progress_msg --
+ * Log a verbose progress message once the elapsed periods exceed the messages already logged.
+ */
+static void
+__rts_progress_msg(WT_SESSION_IMPL *session, struct timespec rollback_start,
+ uint64_t rollback_count, uint64_t *rollback_msg_count)
+{
+ struct timespec cur_time;
+ uint64_t time_diff;
+
+ __wt_epoch(session, &cur_time);
+
+ /* Seconds elapsed since the rollback started. */
+ time_diff = WT_TIMEDIFF_SEC(cur_time, rollback_start);
+
+ if ((time_diff / WT_PROGRESS_MSG_PERIOD) > *rollback_msg_count) {
+ __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS,
+ "Rollback to stable has been running for %" PRIu64 " seconds and has inspected %" PRIu64
+ " files. For more detailed logging, enable WT_VERB_RTS",
+ time_diff, rollback_count);
+ ++(*rollback_msg_count);
+ }
+}
+
+/*
+ * __wt_rts_btree_apply_all --
+ * Perform rollback to stable on all files listed in the metadata, apart from the metadata and
+ * history store files. When recovering, also run the history final pass afterwards.
+ */
+int
+__wt_rts_btree_apply_all(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
+{
+ struct timespec rollback_timer;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint64_t rollback_count, rollback_msg_count;
+ const char *config, *uri;
+
+ /* Initialize the verbose tracking timer. */
+ __wt_epoch(session, &rollback_timer);
+ rollback_count = 0;
+ rollback_msg_count = 0;
+
+ WT_RET(__wt_metadata_cursor(session, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ /* Log a progress message if one is due. */
+ __rts_progress_msg(session, rollback_timer, rollback_count, &rollback_msg_count);
+ ++rollback_count;
+
+ WT_ERR(cursor->get_key(cursor, &uri));
+ WT_ERR(cursor->get_value(cursor, &config));
+
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
+ ret = __wt_rts_btree_walk_btree_apply(session, uri, config, rollback_timestamp);
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
+
+ /*
+ * Ignore rollback to stable failures on files that don't exist or files where corruption is
+ * detected.
+ */
+ if (ret == ENOENT || (ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION))) {
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "%s: skipped performing rollback to stable because the file %s", uri,
+ ret == ENOENT ? "does not exist" : "is corrupted.");
+ continue;
+ }
+ WT_ERR(ret);
+ }
+ WT_ERR_NOTFOUND_OK(ret, false);
+
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ WT_ERR(__wt_rts_history_final_pass(session, rollback_timestamp));
+
+err:
+ WT_TRET(__wt_metadata_cursor_release(session, &cursor));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_api.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_api.c
new file mode 100644
index 00000000000..8a484070f79
--- /dev/null
+++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_api.c
@@ -0,0 +1,168 @@
+/*-
+ * Copyright (c) 2014-present MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rollback_to_stable_int --
+ * Roll back all modifications with timestamps more recent than the global stable timestamp.
+ */
+static int
+__rollback_to_stable_int(WT_SESSION_IMPL *session, bool no_ckpt)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t rollback_timestamp;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ /*
+ * Rollback to stable should ignore tombstones in the history store since it needs to scan the
+ * entire table sequentially.
+ */
+ F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE);
+
+ WT_ERR(__wt_rts_check(session));
+
+ /*
+ * Update the global time window state to have consistent view from global visibility rules for
+ * the rollback to stable to bring back the database into a consistent state.
+ *
+ * As part of the below function call, the oldest transaction id and pinned timestamps are
+ * updated.
+ */
+ WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
+
+ WT_ASSERT_ALWAYS(session,
+ (txn_global->has_pinned_timestamp || !txn_global->has_oldest_timestamp),
+ "Database has no pinned timestamp but an oldest timestamp. Pinned timestamp is required to "
+ "find out the global visibility/obsolete of an update.");
+
+ /*
+ * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
+ * though the stable timestamp isn't supposed to be updated while rolling back, accessing it
+ * without a lock would violate protocol.
+ */
+ WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp);
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "performing rollback to stable with stable timestamp: %s and oldest timestamp: %s",
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[0]),
+ __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string[1]));
+
+ if (F_ISSET(conn, WT_CONN_RECOVERING))
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "recovered checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64
+ ", snapshot count: %" PRIu32,
+ conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max,
+ conn->recovery_ckpt_snapshot_count);
+
+ WT_ERR(__wt_rts_btree_apply_all(session, rollback_timestamp));
+
+ /* Roll back the global durable timestamp to the stable timestamp. */
+ txn_global->has_durable_timestamp = txn_global->has_stable_timestamp;
+ txn_global->durable_timestamp = txn_global->stable_timestamp;
+
+ /*
+ * If the configuration is not in-memory, forcibly log a checkpoint after rollback to stable to
+ * ensure that both in-memory and on-disk versions are the same unless caller requested for no
+ * checkpoint.
+ */
+ if (!F_ISSET(conn, WT_CONN_IN_MEMORY) && !no_ckpt)
+ WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+
+err:
+ F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE);
+ return (ret);
+}
+
+/*
+ * __rollback_to_stable_one --
+ * Perform rollback to stable on a single object; a no-op unless the URI matches WT_BTREE_PREFIX.
+ */
+static int
+__rollback_to_stable_one(WT_SESSION_IMPL *session, const char *uri, bool *skipp)
+{
+ WT_DECL_RET;
+ wt_timestamp_t rollback_timestamp;
+ char *config;
+
+ /*
+ * This is confusing: the caller's boolean argument "skip" stops the schema-worker loop from
+ * processing this object and any underlying objects it may have (for example, a table with
+ * multiple underlying file objects). We rollback-to-stable all of the file objects an object
+ * may contain, so set the caller's skip argument to true on all file objects, else set the
+ * caller's skip argument to false so our caller continues down the tree of objects.
+ */
+ *skipp = WT_BTREE_PREFIX(uri);
+ if (!*skipp)
+ return (0);
+
+ WT_RET(__wt_metadata_search(session, uri, &config));
+
+ /* Read the stable timestamp once, before walking the object. */
+ WT_ORDERED_READ(rollback_timestamp, S2C(session)->txn_global.stable_timestamp);
+
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
+ ret = __wt_rts_btree_walk_btree_apply(session, uri, config, rollback_timestamp);
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
+
+ __wt_free(session, config);
+
+ return (ret);
+}
+
+/*
+ * __rollback_to_stable --
+ * Roll back the database to the stable timestamp using a temporary internal session.
+ */
+static int
+__rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt)
+{
+ WT_DECL_RET;
+
+ WT_UNUSED(cfg);
+
+ /*
+ * Don't use the connection's default session: we are working on data handles and (a) don't want
+ * to cache all of them forever, plus (b) can't guarantee that no other method will be called
+ * concurrently. Copy parent session no logging option to the internal session to make sure that
+ * rollback to stable doesn't generate log records.
+ */
+ WT_RET(
+ __wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, 0, 0, &session));
+
+ WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 1);
+ WT_WITH_CHECKPOINT_LOCK(
+ session, WT_WITH_SCHEMA_LOCK(session, ret = __rollback_to_stable_int(session, no_ckpt)));
+ WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 0);
+
+ WT_TRET(__wt_session_close_internal(session));
+
+ return (ret);
+}
+
+/*
+ * __wt_rollback_to_stable_init --
+ * Initialize the data structures for the rollback to stable subsystem.
+ */
+void
+__wt_rollback_to_stable_init(WT_CONNECTION_IMPL *conn)
+{
+ /*
+ * Set up the pointer so the data structure can be accessed easily while avoiding the need to do
+ * explicit memory management.
+ */
+ conn->rts = &conn->_rts;
+
+ /* Set up the function pointers. */
+ conn->rts->rollback_to_stable = __rollback_to_stable;
+ conn->rts->rollback_to_stable_one = __rollback_to_stable_one;
+}
diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree.c
new file mode 100644
index 00000000000..34a2971939d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree.c
@@ -0,0 +1,1083 @@
+/*-
+ * Copyright (c) 2014-present MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rts_btree_abort_update --
+ * Abort updates in an update chain with timestamps newer than the rollback timestamp. Also,
+ * clear the history store flag for the first stable update in the update chain.
+ */
+static int
+__rts_btree_abort_update(WT_SESSION_IMPL *session, WT_ITEM *key, WT_UPDATE *first_upd,
+ wt_timestamp_t rollback_timestamp, bool *stable_update_found)
+{
+ WT_UPDATE *stable_upd, *tombstone, *upd;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+ bool txn_id_visible;
+
+ stable_upd = tombstone = NULL;
+ txn_id_visible = false;
+ if (stable_update_found != NULL)
+ *stable_update_found = false;
+ for (upd = first_upd; upd != NULL; upd = upd->next) {
+ /* Skip the updates that are aborted. */
+ if (upd->txnid == WT_TXN_ABORTED)
+ continue;
+
+ /*
+ * An unstable update needs to be aborted if any of the following are true:
+ * 1. An update is invisible based on the checkpoint snapshot during recovery.
+ * 2. The update durable timestamp is greater than the stable timestamp.
+ * 3. The update is a prepared update.
+ *
+ * Usually during recovery, there are no in memory updates present on the page. But
+ * whenever an unstable fast truncate operation is written to the disk, as part
+ * of the rollback to stable page read, it instantiates the tombstones on the page.
+ * The transaction id validation is ignored in all scenarios except recovery.
+ */
+ txn_id_visible = __wt_rts_visibility_txn_visible_id(session, upd->txnid);
+ if (!txn_id_visible || rollback_timestamp < upd->durable_ts ||
+ upd->prepare_state == WT_PREPARE_INPROGRESS) {
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "rollback to stable update aborted with txnid: %" PRIu64
+ ", txnid not visible: %s, or stable timestamp (%s) < durable timestamp (%s): %s, or "
+ "prepare state (%d) is in progress: %s",
+ upd->txnid, !txn_id_visible ? "true" : "false",
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]),
+ __wt_timestamp_to_string(upd->durable_ts, ts_string[0]),
+ rollback_timestamp < upd->durable_ts ? "true" : "false", upd->prepare_state,
+ upd->prepare_state == WT_PREPARE_INPROGRESS ? "true" : "false");
+
+ upd->txnid = WT_TXN_ABORTED;
+ WT_STAT_CONN_INCR(session, txn_rts_upd_aborted);
+ } else {
+ /* Valid update is found. */
+ stable_upd = upd;
+ break;
+ }
+ }
+
+ /*
+ * Clear the history store flags for the stable update to indicate that this update should be
+ * written to the history store later. The next time when this update is moved into the history
+ * store, it will have a different stop time point.
+ */
+ if (stable_upd != NULL) {
+ if (F_ISSET(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS)) {
+ /* Find the update following a stable tombstone. */
+ if (stable_upd->type == WT_UPDATE_TOMBSTONE) {
+ tombstone = stable_upd;
+ for (stable_upd = stable_upd->next; stable_upd != NULL;
+ stable_upd = stable_upd->next) {
+ if (stable_upd->txnid != WT_TXN_ABORTED) {
+ WT_ASSERT(session,
+ stable_upd->type != WT_UPDATE_TOMBSTONE &&
+ F_ISSET(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS));
+ break;
+ }
+ }
+ }
+
+ /*
+ * Delete the first stable update and any newer update from the history store. If the
+ * update following the stable tombstone is removed by obsolete check, no need to remove
+ * that update from the history store as it has a globally visible tombstone. In that
+ * case, it is enough to delete everything up to the tombstone timestamp.
+ */
+ WT_RET(__wt_rts_history_delete_hs(
+ session, key, stable_upd == NULL ? tombstone->start_ts : stable_upd->start_ts));
+
+ /*
+ * Clear the history store flags for the first stable update. Otherwise, it will not be
+ * moved to history store again.
+ */
+ if (stable_upd != NULL)
+ F_CLR(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS);
+ if (tombstone != NULL)
+ F_CLR(tombstone, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS);
+ }
+ if (stable_update_found != NULL)
+ *stable_update_found = true;
+ }
+
+ return (0);
+}
+
+/*
+ * __rts_btree_abort_insert_list --
+ * Apply the update abort check to each entry in an insert skip list. If stable_updates_count is
+ * non-NULL, increment it once for every entry that had a stable update.
+ */
+static int
+__rts_btree_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *head,
+ wt_timestamp_t rollback_timestamp, uint32_t *stable_updates_count)
+{
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ uint64_t recno;
+ uint8_t *memp;
+ bool stable_update_found;
+
+ WT_ERR(
+ __wt_scr_alloc(session, page->type == WT_PAGE_ROW_LEAF ? 0 : WT_INTPACK64_MAXSIZE, &key));
+
+ WT_SKIP_FOREACH (ins, head)
+ if (ins->upd != NULL) {
+ if (page->type == WT_PAGE_ROW_LEAF) {
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ } else {
+ recno = WT_INSERT_RECNO(ins);
+ memp = key->mem;
+ WT_ERR(__wt_vpack_uint(&memp, 0, recno));
+ key->size = WT_PTRDIFF(memp, key->data);
+ }
+ WT_ERR(__rts_btree_abort_update(
+ session, key, ins->upd, rollback_timestamp, &stable_update_found));
+ if (stable_update_found && stable_updates_count != NULL)
+ (*stable_updates_count)++;
+ if (!stable_update_found && page->type == WT_PAGE_ROW_LEAF &&
+ !F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ /*
+ * When a new key is added to a page and the page is then checkpointed, updates for
+ * that key can be present in the History Store while the key isn't present in the
+ * disk image. RTS will then only remove these updates when there is a stable update
+ * on-chain. These updates still need removing when no stable updates are on-chain,
+ * so do so here explicitly. Pass in rollback_timestamp + 1 as history store cleanup
+ * removes updates inclusive of the provided timestamp, but we only want to remove
+ * unstable updates.
+ *
+ * FIXME-WT-10017: WT-9846 is an interim fix only for row-store while we investigate
+ * the impacts of a long term correction in WT-10017. Once completed this change can
+ * be reverted.
+ */
+ WT_ERR(__wt_rts_history_delete_hs(session, key, rollback_timestamp + 1));
+ }
+
+err:
+ __wt_scr_free(session, &key);
+ return (ret);
+}
+
+/*
+ * __rts_btree_col_modify --
+ * Search the page for the record number, then add the provided update to the head of the update list.
+ */
+static inline int
+__rts_btree_col_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, uint64_t recno)
+{
+ WT_CURSOR_BTREE cbt;
+ WT_DECL_RET;
+
+ __wt_btcur_init(session, &cbt);
+ __wt_btcur_open(&cbt);
+
+ /* Search the page. */
+ WT_ERR(__wt_col_search(&cbt, recno, ref, true, NULL));
+
+ /* Apply the modification; diagnostic builds of __wt_col_modify take one extra argument. */
+#ifdef HAVE_DIAGNOSTIC
+ WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true, false));
+#else
+ WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true));
+#endif
+
+err:
+ /* Free any resources that may have been cached in the cursor. */
+ WT_TRET(__wt_btcur_close(&cbt, true));
+
+ return (ret);
+}
+
+/*
+ * __rts_btree_row_modify --
+ * Search the page for the row key, then add the provided update to the head of the update list.
+ */
+static inline int
+__rts_btree_row_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, WT_ITEM *key)
+{
+ WT_CURSOR_BTREE cbt;
+ WT_DECL_RET;
+
+ __wt_btcur_init(session, &cbt);
+ __wt_btcur_open(&cbt);
+
+ /* Search the page. */
+ WT_ERR(__wt_row_search(&cbt, key, true, ref, true, NULL));
+
+ /* Apply the modification; diagnostic builds of __wt_row_modify take one extra argument. */
+#ifdef HAVE_DIAGNOSTIC
+ WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true, false));
+#else
+ WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true));
+#endif
+
+err:
+ /* Free any resources that may have been cached in the cursor. */
+ WT_TRET(__wt_btcur_close(&cbt, true));
+
+ return (ret);
+}
+
+/*
+ * __rts_btree_ondisk_fixup_key --
+ * Abort updates in the history store and replace the on-disk value with an update that
+ * satisfies the given timestamp.
+ *
+ * The key is identified by rip for row-store pages (row_key, when not NULL, is used as the
+ * already unpacked key) or by recno when rip is NULL. The history store records for the key
+ * are walked from newest to oldest: records that are not stable are removed, and the first
+ * stable record is restored onto the page as a standard update (preceded by a tombstone when
+ * its stop point is stable as well) and then removed from the history store. When no stable
+ * record exists, a tombstone is added for the key instead. Returns 0 on success and an error
+ * code otherwise; updates allocated here are freed on the error path.
+ */
+static int
+__rts_btree_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, uint64_t recno,
+ WT_ITEM *row_key, WT_CELL_UNPACK_KV *unpack, wt_timestamp_t rollback_timestamp)
+{
+ WT_CURSOR *hs_cursor;
+ WT_DECL_ITEM(full_value);
+ WT_DECL_ITEM(hs_key);
+ WT_DECL_ITEM(hs_value);
+ WT_DECL_ITEM(key);
+ WT_DECL_ITEM(key_string);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_TIME_WINDOW *hs_tw;
+ WT_UPDATE *tombstone, *upd;
+ wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts, pinned_ts;
+ uint64_t hs_counter, type_full;
+ uint32_t hs_btree_id;
+ uint8_t *memp;
+ uint8_t type;
+ char ts_string[4][WT_TS_INT_STRING_SIZE];
+ char tw_string[WT_TIME_STRING_SIZE];
+ bool valid_update_found;
+#ifdef HAVE_DIAGNOSTIC
+ bool first_record;
+#endif
+
+ page = ref->page;
+
+ hs_cursor = NULL;
+ tombstone = upd = NULL;
+ hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE;
+ hs_btree_id = S2BT(session)->id;
+ valid_update_found = false;
+#ifdef HAVE_DIAGNOSTIC
+ first_record = true;
+#endif
+
+ /* Allocate buffers for the data store and history store key. */
+ WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
+
+ if (rip != NULL) {
+ if (row_key != NULL)
+ key = row_key;
+ else {
+ /* Unpack a row key. */
+ WT_ERR(__wt_scr_alloc(session, 0, &key));
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, false));
+ }
+ } else {
+ /* Manufacture a column key. */
+ WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
+ memp = key->mem;
+ WT_ERR(__wt_vpack_uint(&memp, 0, recno));
+ key->size = WT_PTRDIFF(memp, key->data);
+ }
+
+ WT_ERR(__wt_scr_alloc(session, 0, &key_string));
+ __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
+ "rolling back the on-disk key: %s",
+ __wt_key_string(session, key->data, key->size, S2BT(session)->key_format, key_string));
+
+ WT_ERR(__wt_scr_alloc(session, 0, &full_value));
+ WT_ERR(__wt_page_cell_data_ref_kv(session, page, unpack, full_value));
+ /*
+ * We can read overflow removed value if checkpoint has run before rollback to stable. In this
+ * case, we have already appended the on page value to the update chain. At this point, we have
+ * visited the update chain and decided the value is not stable. In addition, checkpoint must
+ * have moved this value to the history store as a full value. Therefore, we can safely ignore
+ * the on page value if it is overflow removed.
+ */
+ if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM)
+ ret = 0;
+ else
+ WT_ERR(__wt_buf_set(session, full_value, full_value->data, full_value->size));
+
+ newer_hs_durable_ts = unpack->tw.durable_start_ts;
+
+ __wt_txn_pinned_timestamp(session, &pinned_ts);
+
+ /* Open a history store table cursor. */
+ WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
+ /*
+ * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system)
+ * outside the constraints of transactions. Therefore, there is no need for snapshot based
+ * visibility checks.
+ */
+ F_SET(hs_cursor, WT_CURSTD_HS_READ_ALL);
+
+ /*
+ * Scan the history store for the given btree and key with maximum start timestamp to let the
+ * search point to the last version of the key and start traversing backwards to find out the
+ * satisfying record according to the given timestamp. Any satisfying history store record is
+ * moved into data store and removed from history store. If none of the history store records
+ * satisfy the given timestamp, the key is removed from data store.
+ */
+ hs_cursor->set_key(hs_cursor, 4, hs_btree_id, key, WT_TS_MAX, UINT64_MAX);
+ ret = __wt_curhs_search_near_before(session, hs_cursor);
+ for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
+
+ /* Get current value and convert to full update if it is a modify. */
+ WT_ERR(hs_cursor->get_value(
+ hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value));
+ type = (uint8_t)type_full;
+
+ /* Retrieve the time window from the history cursor. */
+ __wt_hs_upd_time_window(hs_cursor, &hs_tw);
+
+ /*
+ * We have a tombstone on the history update and it is obsolete according to the timestamp
+ * and txnid, so no need to restore it. These obsolete updates are written to the disk when
+ * they are not obsolete at the time of reconciliation by an eviction thread and later they
+ * become obsolete according to the checkpoint.
+ */
+ if (__wt_rts_visibility_txn_visible_id(session, hs_tw->stop_txn) &&
+ hs_tw->durable_stop_ts <= pinned_ts) {
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "history store stop is obsolete with time window: %s and pinned timestamp: %s",
+ __wt_time_window_to_string(hs_tw, tw_string),
+ __wt_timestamp_to_string(pinned_ts, ts_string[0]));
+ WT_ERR(hs_cursor->remove(hs_cursor));
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
+ continue;
+ }
+
+ /*
+ * Do not include history store updates greater than on-disk data store version to construct
+ * a full update to restore except when the on-disk update is prepared. Including more
+ * recent updates than the on-disk version shouldn't be problem as the on-disk version in
+ * history store is always a full update. It is better to not to include those updates as it
+ * unnecessarily increases the rollback to stable time.
+ *
+ * Comparing with timestamps here has no problem unlike in search flow where the timestamps
+ * may be reset during reconciliation. RTS detects an on-disk update is unstable based on
+ * the written proper timestamp, so comparing against it with history store shouldn't have
+ * any problem.
+ */
+ if (hs_tw->start_ts <= unpack->tw.start_ts || unpack->tw.prepare) {
+ if (type == WT_UPDATE_MODIFY)
+ WT_ERR(__wt_modify_apply_item(
+ session, S2BT(session)->value_format, full_value, hs_value->data));
+ else {
+ WT_ASSERT(session, type == WT_UPDATE_STANDARD);
+ WT_ERR(__wt_buf_set(session, full_value, hs_value->data, hs_value->size));
+ }
+ } else
+ __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
+ "history store update more recent than on-disk update with time window: %s and type: "
+ "%" PRIu8,
+ __wt_time_window_to_string(hs_tw, tw_string), type);
+
+ /*
+ * Verify the history store timestamps are in order. The start timestamp may be equal to the
+ * stop timestamp if the original update's commit timestamp is in order. We may see records
+ * newer than or equal to the onpage value if eviction runs concurrently with checkpoint. In
+ * that case, don't verify the first record.
+ *
+ * It is possible during a prepared transaction rollback, the history store update that have
+ * its own stop timestamp doesn't get removed leads to duplicate records in history store
+ * after further operations on that same key. Rollback to stable should ignore such records
+ * for timestamp ordering verification.
+ *
+ * If we have fixed the missing timestamps, then the newer update reinserted with an older
+ * timestamp may have a durable timestamp that is smaller than the current stop durable
+ * timestamp.
+ *
+ * It is possible that there can be an update in the history store with a max stop timestamp
+ * in the middle of the same key updates. This occurs when the checkpoint writes the
+ * committed prepared update and further updates on that key including the history store
+ * changes before the transaction fixes the history store update to have a proper stop
+ * timestamp. It is a rare scenario.
+ */
+ WT_ASSERT(session,
+ hs_stop_durable_ts <= newer_hs_durable_ts || hs_start_ts == hs_stop_durable_ts ||
+ hs_start_ts == newer_hs_durable_ts || newer_hs_durable_ts == hs_durable_ts ||
+ first_record || hs_stop_durable_ts == WT_TS_MAX);
+
+ if (hs_stop_durable_ts < newer_hs_durable_ts)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_stop_older_than_newer_start);
+
+ /*
+ * Validate the timestamps in the key and the cell are same. This must be validated only
+ * after verifying it's stop time window is not globally visible. The start timestamps of
+ * the time window are cleared when they are globally visible and there will be no stop
+ * timestamp in the history store whenever a prepared update is written to the data store.
+ */
+ WT_ASSERT(session,
+ (hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == hs_start_ts) &&
+ (hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == hs_durable_ts) &&
+ ((hs_tw->durable_stop_ts == 0 && hs_stop_durable_ts == WT_TS_MAX) ||
+ hs_tw->durable_stop_ts == hs_stop_durable_ts));
+
+ /*
+ * Stop processing when we find a stable update according to the given timestamp and
+ * transaction id.
+ */
+ if (__wt_rts_visibility_txn_visible_id(session, hs_tw->start_txn) &&
+ hs_tw->durable_start_ts <= rollback_timestamp) {
+ __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
+ "history store update valid with time window: %s, type: %" PRIu8
+ " and stable timestamp: %s",
+ __wt_time_window_to_string(hs_tw, tw_string), type,
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[0]));
+ WT_ASSERT(session, unpack->tw.prepare || hs_tw->start_ts <= unpack->tw.start_ts);
+ valid_update_found = true;
+ break;
+ }
+
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "history store update aborted with time window: %s, type: %" PRIu8
+ " and stable timestamp: %s",
+ __wt_time_window_to_string(hs_tw, tw_string), type,
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
+
+ /*
+ * Start time point of the current record may be used as stop time point of the previous
+ * record. Save it to verify against the previous record and check if we need to append the
+ * stop time point as a tombstone when we rollback the history store record.
+ */
+ newer_hs_durable_ts = hs_durable_ts;
+#ifdef HAVE_DIAGNOSTIC
+ first_record = false;
+#endif
+
+ WT_ERR(hs_cursor->remove(hs_cursor));
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
+ WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable);
+ }
+
+ /*
+ * The scan ends with ret set to zero when a stable record was found, or with the value returned
+ * by the failed search or cursor step otherwise. Running out of records (WT_NOTFOUND) is the
+ * expected way to end without a match; any other value is a genuine failure. Stop here in that
+ * case, otherwise the next successful call would overwrite the error and the key would be
+ * removed on the strength of an incomplete scan.
+ */
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * If we found a history value that satisfied the given timestamp, add it to the update list.
+ * Otherwise remove the key by adding a tombstone.
+ */
+ if (valid_update_found) {
+ /* Retrieve the time window from the history cursor. */
+ __wt_hs_upd_time_window(hs_cursor, &hs_tw);
+ WT_ASSERT(session,
+ hs_tw->start_ts < unpack->tw.start_ts || hs_tw->start_txn < unpack->tw.start_txn);
+ WT_ERR(__wt_upd_alloc(session, full_value, WT_UPDATE_STANDARD, &upd, NULL));
+
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because the
+ * connections write generation will be initialized after rollback to stable and the updates
+ * in the cache will be problematic. The transaction id of pages which are in disk will be
+ * automatically reset as part of unpacking cell when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ upd->txnid = WT_TXN_NONE;
+ else
+ upd->txnid = hs_tw->start_txn;
+ upd->durable_ts = hs_tw->durable_start_ts;
+ upd->start_ts = hs_tw->start_ts;
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "history store update restored txnid: %" PRIu64 ", start_ts: %s and durable_ts: %s",
+ upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
+ __wt_timestamp_to_string(upd->durable_ts, ts_string[1]));
+
+ /*
+ * Set the flag to indicate that this update has been restored from history store for the
+ * rollback to stable operation.
+ */
+ F_SET(upd, WT_UPDATE_RESTORED_FROM_HS);
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_updates);
+
+ /*
+ * We have a tombstone on the original update chain and it is stable according to the
+ * timestamp and txnid, we need to restore that as well.
+ */
+ if (__wt_rts_visibility_txn_visible_id(session, hs_tw->stop_txn) &&
+ hs_tw->durable_stop_ts <= rollback_timestamp) {
+ /*
+ * The restoring tombstone timestamp must be zero or less than previous update start
+ * timestamp.
+ */
+ WT_ASSERT(session,
+ hs_stop_durable_ts == WT_TS_NONE || hs_stop_durable_ts < newer_hs_durable_ts ||
+ unpack->tw.prepare);
+
+ WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL));
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because
+ * the connections write generation will be initialized after rollback to stable and the
+ * updates in the cache will be problematic. The transaction id of pages which are in
+ * disk will be automatically reset as part of unpacking cell when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ tombstone->txnid = WT_TXN_NONE;
+ else
+ tombstone->txnid = hs_tw->stop_txn;
+ tombstone->durable_ts = hs_tw->durable_stop_ts;
+ tombstone->start_ts = hs_tw->stop_ts;
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "history store tombstone restored txnid: %" PRIu64
+ ", start_ts: %s and durable_ts: %s",
+ tombstone->txnid, __wt_timestamp_to_string(tombstone->start_ts, ts_string[0]),
+ __wt_timestamp_to_string(tombstone->durable_ts, ts_string[1]));
+
+ /*
+ * Set the flag to indicate that this update has been restored from history store for
+ * the rollback to stable operation.
+ */
+ F_SET(tombstone, WT_UPDATE_RESTORED_FROM_HS);
+
+ tombstone->next = upd;
+ upd = tombstone;
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_tombstones);
+ }
+ } else {
+ WT_ERR(__wt_upd_alloc_tombstone(session, &upd, NULL));
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed);
+ __wt_verbose_level_multi(
+ session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_3, "%s", "key removed");
+ }
+
+ if (rip != NULL)
+ WT_ERR(__rts_btree_row_modify(session, ref, upd, key));
+ else
+ WT_ERR(__rts_btree_col_modify(session, ref, upd, recno));
+
+ /* Finally remove that update from history store. */
+ if (valid_update_found) {
+ /* Avoid freeing the updates while still in use if hs_cursor->remove fails. */
+ upd = tombstone = NULL;
+
+ WT_ERR(hs_cursor->remove(hs_cursor));
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
+ WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
+ }
+
+ if (0) {
+err:
+ WT_ASSERT(session, tombstone == NULL || upd == tombstone);
+ __wt_free_update_list(session, &upd);
+ }
+ __wt_scr_free(session, &full_value);
+ __wt_scr_free(session, &hs_key);
+ __wt_scr_free(session, &hs_value);
+ if (rip == NULL || row_key == NULL)
+ __wt_scr_free(session, &key);
+ __wt_scr_free(session, &key_string);
+ if (hs_cursor != NULL)
+ WT_TRET(hs_cursor->close(hs_cursor));
+ return (ret);
+}
+
+/*
+ * __rts_btree_abort_ondisk_kv --
+ * Fix the on-disk K/V version according to the given timestamp.
+ *
+ * The key is identified by rip for row-store pages (row_key, when not NULL, is used as the
+ * already unpacked key) or by recno when rip is NULL. vpack is the unpacked on-disk cell whose
+ * time window is examined. When is_ondisk_stable is not NULL it is set to false on entry and
+ * to true only when the on-disk version is found to need no change.
+ *
+ * Depending on the time window the function either returns without changes, hands the key to
+ * __rts_btree_ondisk_fixup_key (not for in-memory connections), or allocates an update itself
+ * (a tombstone, or a standard update carrying the on-disk value) and applies it to the page.
+ * Returns 0 on success and an error code otherwise.
+ */
+static int
+__rts_btree_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, uint64_t recno,
+ WT_ITEM *row_key, WT_CELL_UNPACK_KV *vpack, wt_timestamp_t rollback_timestamp,
+ bool *is_ondisk_stable)
+{
+ WT_DECL_ITEM(key);
+ WT_DECL_ITEM(key_string);
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_UPDATE *upd;
+ uint8_t *memp;
+ char time_string[WT_TIME_STRING_SIZE];
+ char ts_string[5][WT_TS_INT_STRING_SIZE];
+ bool prepared;
+
+ page = ref->page;
+ upd = NULL;
+
+ /* Initialize the on-disk stable version flag. The caller may pass NULL if it doesn't care. */
+ if (is_ondisk_stable != NULL)
+ *is_ondisk_stable = false;
+
+ prepared = vpack->tw.prepare;
+ if (WT_IS_HS(session->dhandle)) {
+ /*
+ * Abort the history store update with stop durable timestamp greater than the stable
+ * timestamp or the updates with max stop timestamp which implies that they are associated
+ * with prepared transactions.
+ */
+ if (vpack->tw.durable_stop_ts > rollback_timestamp || vpack->tw.stop_ts == WT_TS_MAX) {
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "history store update aborted with start durable/commit timestamp: %s, %s, stop "
+ "durable/commit timestamp: %s, %s and stable timestamp: %s",
+ __wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]),
+ __wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]),
+ __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[2]),
+ __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[4]));
+ WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL));
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_sweep_hs_keys);
+ } else
+ return (0);
+ } else if (vpack->tw.durable_start_ts > rollback_timestamp ||
+ !__wt_rts_visibility_txn_visible_id(session, vpack->tw.start_txn) ||
+ (!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) {
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "on-disk update aborted with time window %s. Start durable timestamp > stable timestamp: "
+ "%s, or txnid is not visible: %s, or tw has not stop and is prepared: %s",
+ __wt_time_point_to_string(
+ vpack->tw.start_ts, vpack->tw.durable_start_ts, vpack->tw.start_txn, time_string),
+ vpack->tw.durable_start_ts > rollback_timestamp ? "true" : "false",
+ !__wt_rts_visibility_txn_visible_id(session, vpack->tw.start_txn) ? "true" : "false",
+ !WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared ? "true" : "false");
+ if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ return (__rts_btree_ondisk_fixup_key(
+ session, ref, rip, recno, row_key, vpack, rollback_timestamp));
+ else {
+ /*
+ * In-memory database don't have a history store to provide a stable update, so remove
+ * the key. Note that an in-memory database will have saved old values in the update
+ * chain, so we should only get here for a key/value that never existed at all as of the
+ * rollback timestamp; thus, deleting it is the correct response.
+ */
+ WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL));
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed);
+ }
+ } else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) &&
+ (vpack->tw.durable_stop_ts > rollback_timestamp ||
+ !__wt_rts_visibility_txn_visible_id(session, vpack->tw.stop_txn) || prepared)) {
+ /*
+ * For prepared transactions, it is possible that both the on-disk key start and stop time
+ * windows can be the same. To abort these updates, check for any stable update from history
+ * store or remove the key.
+ */
+ if (vpack->tw.start_ts == vpack->tw.stop_ts &&
+ vpack->tw.durable_start_ts == vpack->tw.durable_stop_ts &&
+ vpack->tw.start_txn == vpack->tw.stop_txn) {
+ WT_ASSERT(session, prepared == true);
+ if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ return (__rts_btree_ondisk_fixup_key(
+ session, ref, rip, recno, row_key, vpack, rollback_timestamp));
+ else {
+ /*
+ * In-memory database don't have a history store to provide a stable update, so
+ * remove the key.
+ */
+ WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL));
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed);
+ }
+ } else {
+ /*
+ * Clear the remove operation from the key by inserting the original on-disk value as a
+ * standard update. The scratch buffer is released before the result is checked, so it
+ * is freed whether or not reading the value and allocating the update worked.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ if ((ret = __wt_page_cell_data_ref_kv(session, page, vpack, tmp)) == 0)
+ ret = __wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &upd, NULL);
+ __wt_scr_free(session, &tmp);
+ WT_RET(ret);
+
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because
+ * the connections write generation will be initialized after rollback to stable and the
+ * updates in the cache will be problematic. The transaction id of pages which are in
+ * disk will be automatically reset as part of unpacking cell when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ upd->txnid = WT_TXN_NONE;
+ else
+ upd->txnid = vpack->tw.start_txn;
+ upd->durable_ts = vpack->tw.durable_start_ts;
+ upd->start_ts = vpack->tw.start_ts;
+ F_SET(upd, WT_UPDATE_RESTORED_FROM_DS);
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_restored);
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "key restored with commit timestamp: %s, durable timestamp: %s, stable timestamp: "
+ "%s, "
+ "txnid: %" PRIu64
+ " and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64
+ ", prepared: %s",
+ __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
+ __wt_timestamp_to_string(upd->durable_ts, ts_string[1]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), upd->txnid,
+ __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]),
+ __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[4]), vpack->tw.stop_txn,
+ prepared ? "true" : "false");
+ }
+ } else {
+ /* Stable version according to the timestamp. Nothing is changed on the page. */
+ if (is_ondisk_stable != NULL)
+ *is_ondisk_stable = true;
+ return (0);
+ }
+
+ if (rip != NULL) {
+ if (row_key != NULL)
+ key = row_key;
+ else {
+ /* Unpack a row key. */
+ WT_ERR(__wt_scr_alloc(session, 0, &key));
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, false));
+ }
+ } else {
+ /* Manufacture a column key. */
+ WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
+ memp = key->mem;
+ WT_ERR(__wt_vpack_uint(&memp, 0, recno));
+ key->size = WT_PTRDIFF(memp, key->data);
+ }
+
+ WT_ERR(__wt_scr_alloc(session, 0, &key_string));
+ __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
+ "removing the key%s: %s", upd->type == WT_UPDATE_TOMBSTONE ? "" : " tombstone",
+ __wt_key_string(session, key->data, key->size, S2BT(session)->key_format, key_string));
+
+ if (rip != NULL)
+ WT_ERR(__rts_btree_row_modify(session, ref, upd, key));
+ else
+ WT_ERR(__rts_btree_col_modify(session, ref, upd, recno));
+
+ if (0) {
+err:
+ __wt_free(session, upd); /* Only reached through the error label. */
+ }
+ if (rip == NULL || row_key == NULL) /* A caller-supplied row_key is not ours to free. */
+ __wt_scr_free(session, &key);
+ __wt_scr_free(session, &key_string);
+ return (ret);
+}
+
+/*
+ * __rts_btree_abort_col_var --
+ * Abort updates on a variable length col leaf page with timestamps newer than the rollback
+ * timestamp.
+ *
+ * For every on-page entry the update list (if any) is processed first, which also reports how
+ * many stable updates it holds. When the page has a disk image, the entry's cell is unpacked
+ * and, unless the cell is deleted or the stable update count equals the cell's RLE count, the
+ * record numbers covered by the cell are passed to __rts_btree_abort_ondisk_kv one by one.
+ * The append list is processed last. Returns 0 on success and the first error otherwise.
+ */
+static int
+__rts_btree_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
+{
+ WT_CELL *kcell;
+ WT_CELL_UNPACK_KV unpack;
+ WT_COL *cip;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *inshead;
+ WT_PAGE *page;
+ uint64_t ins_recno, recno, rle;
+ uint32_t i, j, stable_updates_count;
+ bool is_ondisk_stable;
+
+ page = ref->page;
+ /*
+ * If a disk image exists, start from the provided recno; or else start from 0.
+ */
+ if (page->dsk != NULL)
+ recno = page->dsk->recno;
+ else
+ recno = 0;
+
+ /* Review the changes to the original on-page data items. */
+ WT_COL_FOREACH (page, cip, i) {
+ stable_updates_count = 0;
+
+ if ((inshead = WT_COL_UPDATE(page, cip)) != NULL)
+ WT_RET(__rts_btree_abort_insert_list(
+ session, page, inshead, rollback_timestamp, &stable_updates_count));
+
+ if (page->dsk != NULL) {
+ /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */
+ kcell = WT_COL_PTR(page, cip);
+ __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack);
+ rle = __wt_cell_rle(&unpack);
+
+ /*
+ * Each key whose on-disk value is not stable and has no stable update on the update
+ * list must be processed downstream.
+ *
+ * If we can determine that the cell's on-disk value is stable, we can skip iterating
+ * over the cell; likewise, if we can determine that every key in the cell has a stable
+ * update on the update list, we can skip the iteration. Otherwise we have to try each
+ * key.
+ *
+ * If the on-disk cell is deleted, it is stable, because cells only appear as deleted
+ * when there is no older value that might need to be restored.
+ *
+ * Note that in a purely timestamped world, the presence of any stable update for any
+ * key in the cell means the on-disk value must be stable, because the update must be
+ * newer than the on-disk value. However, this is no longer true if the stable update
+ * has no timestamp. It may also not be true if the on-disk value is prepared, or other
+ * corner cases. Therefore, we must iterate the cell unless _every_ key has a stable
+ * update.
+ *
+ * We can, however, stop iterating as soon as the downstream code reports back that the
+ * on-disk value is actually stable.
+ */
+ if (unpack.type == WT_CELL_DEL)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped);
+ else if (stable_updates_count == rle)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
+ else {
+ j = 0; /* Offset, from recno, of the next key in this cell still to be handled. */
+ if (inshead != NULL) {
+ WT_SKIP_FOREACH (ins, inshead) {
+ /* If the update list goes past the end of the cell, something's wrong. */
+ WT_ASSERT(session, j < rle);
+ ins_recno = WT_INSERT_RECNO(ins);
+ /* Process all the keys before this update. */
+ while (recno + j < ins_recno) {
+ WT_RET(__rts_btree_abort_ondisk_kv(session, ref, NULL, recno + j, NULL,
+ &unpack, rollback_timestamp, &is_ondisk_stable));
+ /* We can stop right away if the on-disk version is stable. */
+ if (is_ondisk_stable) {
+ if (rle > 1)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
+ goto stop;
+ }
+ j++;
+ }
+ /* If this key has a stable update, skip over it. */
+ if (recno + j == ins_recno &&
+ __wt_rts_visibility_has_stable_update(ins->upd))
+ j++;
+ }
+ }
+ /* Process the rest of the keys. */
+ while (j < rle) {
+ WT_RET(__rts_btree_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, &unpack,
+ rollback_timestamp, &is_ondisk_stable));
+ /* We can stop right away if the on-disk version is stable. */
+ if (is_ondisk_stable) {
+ if (rle > 1)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
+ goto stop;
+ }
+ j++;
+ }
+ }
+stop:
+ recno += rle; /* Step past every record number covered by this cell. */
+ }
+ }
+
+ /* Review the append list */
+ if ((inshead = WT_COL_APPEND(page)) != NULL)
+ WT_RET(__rts_btree_abort_insert_list(session, page, inshead, rollback_timestamp, NULL));
+
+ return (0);
+}
+
+/*
+ * __rts_btree_abort_col_fix_one --
+ * Process a single fixed-length column-store key that has an on-disk time window: unpack the
+ * time window cell at index tw, attach the key's bitfield value to the unpacked cell and pass
+ * it on to the on-disk K/V abort code.
+ */
+static int
+__rts_btree_abort_col_fix_one(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t tw,
+ uint32_t recno_offset, wt_timestamp_t rollback_timestamp)
+{
+ WT_CELL_UNPACK_KV tw_unpack;
+ WT_PAGE *leaf;
+ uint64_t recno;
+ uint8_t bits;
+
+ leaf = ref->page;
+
+ /* The time window lives in its own cell; unpack it first. */
+ __wt_cell_unpack_kv(
+ session, leaf->dsk, WT_COL_FIX_TW_CELL(leaf, &leaf->pg_fix_tws[tw]), &tw_unpack);
+
+ /* The value is not stored in that cell, so read it from the bitfield and hook it in. */
+ bits = __bit_getv(leaf->pg_fix_bitf, recno_offset, S2BT(session)->bitcnt);
+ tw_unpack.data = &bits;
+ tw_unpack.size = 1;
+
+ /* The record number is the page's starting record number plus the key's offset. */
+ recno = leaf->dsk->recno + recno_offset;
+ return (__rts_btree_abort_ondisk_kv(
+ session, ref, NULL, recno, NULL, &tw_unpack, rollback_timestamp, NULL));
+}
+
+/*
+ * __rts_btree_abort_col_fix --
+ * Abort updates on a fixed length col leaf page with timestamps newer than the rollback
+ * timestamp.
+ *
+ * The page's single update list is processed first, then every on-disk time window entry is
+ * handed to __rts_btree_abort_col_fix_one except those whose key has a stable update left on
+ * the update list, and finally the append list is processed. Returns 0 on success and the
+ * first error otherwise.
+ */
+static int
+__rts_btree_abort_col_fix(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
+{
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *inshead;
+ WT_PAGE *page;
+ uint32_t ins_recno_offset, recno_offset, numtws, tw;
+
+ page = ref->page;
+
+ /*
+ * Review the changes to the original on-page data items. Note that while this can report back
+ * to us whether it saw a stable update, that information doesn't do us any good -- unlike in
+ * VLCS where the uniformity of cells lets us reason about the timestamps of all of them based
+ * on the timestamp of an update to any of them, in FLCS everything is just thrown together, so
+ * we'll need to iterate over all the keys anyway.
+ */
+ if ((inshead = WT_COL_UPDATE_SINGLE(page)) != NULL)
+ WT_RET(__rts_btree_abort_insert_list(session, page, inshead, rollback_timestamp, NULL));
+
+ /*
+ * Iterate over all the keys, stopping only on keys that (a) have a time window on disk, and
+ * also (b) do not have a stable update remaining in the update list. Keys with no on-disk time
+ * window are stable. And we must not try to adjust the on-disk value for keys with stable
+ * updates, because the downstream code assumes that has already been checked and in some cases
+ * (e.g. in-memory databases) the wrong thing will happen.
+ *
+ * Iterate over the update list and carry along the iteration over the time window list in
+ * parallel, even though the code would perhaps make more sense the other way around, because
+ * this allows using the skiplist iterator macro instead of an open-coded mess.
+ */
+ numtws = WT_COL_FIX_TWS_SET(page) ? page->pg_fix_numtws : 0;
+ WT_ASSERT(session, numtws == 0 || page->dsk != NULL);
+ tw = 0; /* Index of the next time window entry that has not been handled yet. */
+ if (inshead != NULL) {
+ WT_SKIP_FOREACH (ins, inshead) {
+ /* Process all the keys before this update entry. */
+ ins_recno_offset = (uint32_t)(WT_INSERT_RECNO(ins) - ref->ref_recno);
+ while (tw < numtws &&
+ (recno_offset = page->pg_fix_tws[tw].recno_offset) < ins_recno_offset) {
+
+ WT_RET(__rts_btree_abort_col_fix_one(
+ session, ref, tw, recno_offset, rollback_timestamp));
+ tw++;
+ }
+ /* If this key has a stable update, skip over it. */
+ if (tw < numtws && page->pg_fix_tws[tw].recno_offset == ins_recno_offset &&
+ ins->upd != NULL && __wt_rts_visibility_has_stable_update(ins->upd))
+ tw++;
+ }
+ }
+ /* Process the rest of the keys with time windows. */
+ while (tw < numtws) {
+ recno_offset = page->pg_fix_tws[tw].recno_offset;
+ WT_RET(__rts_btree_abort_col_fix_one(session, ref, tw, recno_offset, rollback_timestamp));
+ tw++;
+ }
+
+ /* Review the append list. */
+ if ((inshead = WT_COL_APPEND(page)) != NULL)
+ WT_RET(__rts_btree_abort_insert_list(session, page, inshead, rollback_timestamp, NULL));
+
+ return (0);
+}
+
+/*
+ * __rts_btree_abort_row_leaf --
+ * Abort updates on a row leaf page with timestamps newer than the rollback timestamp.
+ *
+ * The insert list ahead of the first on-page entry is processed first. Then, for each on-page
+ * row, its update chain and its insert list are processed, and when the update chain did not
+ * report a stable update the row's on-disk value is passed to __rts_btree_abort_ondisk_kv.
+ * Returns 0 on success and the first error otherwise; the scratch key buffer is always freed.
+ */
+static int
+__rts_btree_abort_row_leaf(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
+{
+ WT_CELL_UNPACK_KV *vpack, _vpack;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_INSERT_HEAD *insert;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ uint32_t i;
+ bool have_key, stable_update_found;
+
+ page = ref->page;
+
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /*
+ * Review the insert list for keys before the first entry on the disk page.
+ */
+ if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
+ WT_ERR(__rts_btree_abort_insert_list(session, page, insert, rollback_timestamp, NULL));
+
+ /*
+ * Review updates that belong to keys that are on the disk image, as well as for keys inserted
+ * since the page was read from disk.
+ */
+ WT_ROW_FOREACH (page, rip, i) {
+ stable_update_found = false;
+ if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, false));
+ WT_ERR(__rts_btree_abort_update(
+ session, key, upd, rollback_timestamp, &stable_update_found));
+ have_key = true; /* The scratch buffer now holds this row's key. */
+ } else
+ have_key = false; /* The scratch buffer was not filled in for this row. */
+
+ if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
+ WT_ERR(__rts_btree_abort_insert_list(session, page, insert, rollback_timestamp, NULL));
+
+ /*
+ * If there is no stable update found in the update list, abort any on-disk value. The key
+ * is only passed along when it was unpacked for this row above; otherwise NULL is passed.
+ */
+ if (!stable_update_found) {
+ vpack = &_vpack;
+ __wt_row_leaf_value_cell(session, page, rip, vpack);
+ WT_ERR(__rts_btree_abort_ondisk_kv(
+ session, ref, rip, 0, have_key ? key : NULL, vpack, rollback_timestamp, NULL));
+ }
+ }
+
+err:
+ __wt_scr_free(session, &key);
+ return (ret);
+}
+
+/*
+ * __wt_rts_btree_abort_updates --
+ * Roll back the content of one leaf page that is newer than the given timestamp, dispatching
+ * on the page type, and mark the page dirty afterwards if it has a modify structure.
+ */
+int
+__wt_rts_btree_abort_updates(
+ WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
+{
+ WT_DECL_RET;
+ WT_PAGE *leaf;
+ bool is_dirty;
+
+ leaf = ref->page;
+ is_dirty = __wt_page_is_modified(leaf);
+
+ /*
+ * A clean page is not automatically stable: eviction writes the newest version to the page, so
+ * the page may still hold content newer than the timestamp. Only a clean page that the
+ * visibility check says needs no abort is skipped.
+ */
+ if (!(is_dirty || __wt_rts_visibility_page_needs_abort(session, ref, rollback_timestamp))) {
+ __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_3,
+ "%p: unmodified stable page skipped", (void *)ref);
+ return (0);
+ }
+
+ WT_STAT_CONN_INCR(session, txn_rts_pages_visited);
+ __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
+ "%p: roll back %s page", (void *)ref, is_dirty ? "modified" : "clean");
+
+ /* Pick the handler for this kind of leaf page; any failure is returned to the caller. */
+ switch (leaf->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __rts_btree_abort_col_fix(session, ref, rollback_timestamp);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __rts_btree_abort_col_var(session, ref, rollback_timestamp);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __rts_btree_abort_row_leaf(session, ref, rollback_timestamp);
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ /* Internal pages are never expected here. */
+ WT_ASSERT(session, false);
+ /* Fall through. */
+ default:
+ ret = __wt_illegal_value(session, leaf->type);
+ break;
+ }
+ WT_RET(ret);
+
+ /* Flag the page dirty so that it gets reconciled. */
+ if (leaf->modify)
+ __wt_page_modify_set(session, leaf);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree_walk.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree_walk.c
new file mode 100644
index 00000000000..7045e2d2950
--- /dev/null
+++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree_walk.c
@@ -0,0 +1,320 @@
+/*-
+ * Copyright (c) 2014-present MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rts_btree_walk_check_btree_modified --
+ * Find the handle for the URI and report whether its btree is flagged as modified; the flag is
+ * reported as false whenever the handle lookup fails, and the lookup's error is returned.
+ */
+static int
+__rts_btree_walk_check_btree_modified(WT_SESSION_IMPL *session, const char *uri, bool *modified)
+{
+ *modified = false;
+ WT_RET(__wt_conn_dhandle_find(session, uri, NULL));
+ *modified = S2BT(session)->modified;
+ return (0);
+}
+
+/*
+ * __rts_btree_walk_page_skip --
+ * Tree walk skip callback (context is the rollback timestamp): skip pages RTS needn't read.
+ */
+static int
+__rts_btree_walk_page_skip(
+ WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool visible_all, bool *skipp)
+{
+ WT_PAGE_DELETED *page_del;
+ wt_timestamp_t rollback_timestamp;
+ char time_string[WT_TIME_STRING_SIZE];
+
+ rollback_timestamp = *(wt_timestamp_t *)context;
+ WT_UNUSED(visible_all);
+
+ *skipp = false; /* Default to reading */
+
+ /*
+ * Skip pages truncated at or before the RTS timestamp. (We could read the page, but that would
+ * unnecessarily instantiate it). If the page has no fast-delete information, that means either
+ * it was discarded because the delete is globally visible, or the internal page holding the
+ * cell was an old format page so none was loaded. In the latter case we should skip the page as
+ * there's no way to get correct behavior and skipping matches the historic behavior. Note that
+ * eviction is running; we must lock the WT_REF before examining the fast-delete information.
+ */
+ if (ref->state == WT_REF_DELETED &&
+ WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) {
+ page_del = ref->page_del;
+ if (page_del == NULL ||
+ (__wt_rts_visibility_txn_visible_id(session, page_del->txnid) &&
+ page_del->durable_timestamp <= rollback_timestamp)) {
+ /*
+ * We should never see a prepared truncate here; not at recovery time because prepared
+ * truncates can't be written to disk, and not during a runtime RTS either because it
+ * should not be possible to do that with an unresolved prepared transaction.
+ */
+ WT_ASSERT(session,
+ page_del == NULL || page_del->prepare_state == WT_PREPARE_INIT ||
+ page_del->prepare_state == WT_PREPARE_RESOLVED);
+
+ if (page_del == NULL)
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "%p: deleted page walk skipped", (void *)ref);
+ else {
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "%p: deleted page walk skipped page_del %s", (void *)ref,
+ __wt_time_point_to_string(page_del->timestamp, page_del->durable_timestamp,
+ page_del->txnid, time_string));
+ }
+ WT_STAT_CONN_INCR(session, txn_rts_tree_walk_skip_pages);
+ *skipp = true;
+ }
+ WT_REF_SET_STATE(ref, WT_REF_DELETED);
+ return (0);
+ }
+
+ /* Otherwise, if the page state is anything other than on disk, we want to look at it. */
+ if (ref->state != WT_REF_DISK)
+ return (0);
+
+ /*
+ * Check whether this on-disk page has any updates to be aborted. We are not holding a hazard
+ * reference on the page and so we rely on there being no other threads of control in the tree:
+ * because eviction ignores WT_REF_DISK pages and no other thread is reading pages, this page
+ * cannot change state from on-disk to something else.
+ */
+ if (!__wt_rts_visibility_page_needs_abort(session, ref, rollback_timestamp)) {
+ *skipp = true;
+ __wt_verbose_multi(
+ session, WT_VERB_RECOVERY_RTS(session), "%p: stable page walk skipped", (void *)ref);
+ WT_STAT_CONN_INCR(session, txn_rts_tree_walk_skip_pages);
+ }
+
+ return (0);
+}
+
+/*
+ * __rts_btree_walk --
+ * Walk the current btree and abort unstable updates on each leaf page the walk returns.
+ */
+static int
+__rts_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
+{
+ WT_DECL_RET;
+ WT_REF *ref;
+
+ /* Walk the tree (pages are filtered by __rts_btree_walk_page_skip), aborting on leaf pages. */
+ ref = NULL;
+ while (
+ (ret = __wt_tree_walk_custom_skip(session, &ref, __rts_btree_walk_page_skip,
+ &rollback_timestamp, WT_READ_NO_EVICT | WT_READ_VISIBLE_ALL | WT_READ_WONT_NEED)) == 0 &&
+ ref != NULL)
+ if (F_ISSET(ref, WT_REF_FLAG_LEAF))
+ WT_RET(__wt_rts_btree_abort_updates(session, ref, rollback_timestamp));
+
+ return (ret);
+}
+
+/*
+ * __wt_rts_btree_walk_btree_apply --
+ * Perform rollback to stable on a single file, using its metadata config to decide what to do.
+ */
+int
+__wt_rts_btree_walk_btree_apply(
+ WT_SESSION_IMPL *session, const char *uri, const char *config, wt_timestamp_t rollback_timestamp)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM cval, value, key;
+ WT_DECL_RET;
+ wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts;
+ size_t addr_size;
+ uint64_t rollback_txnid, write_gen;
+ uint32_t btree_id;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+ bool dhandle_allocated, durable_ts_found, has_txn_updates_gt_than_ckpt_snap, modified;
+ bool prepared_updates;
+
+ /* Ignore non-btree objects as well as the metadata and history store files. */
+ if (!WT_BTREE_PREFIX(uri) || strcmp(uri, WT_HS_URI) == 0 || strcmp(uri, WT_METAFILE_URI) == 0)
+ return (0);
+
+ addr_size = 0;
+ rollback_txnid = 0;
+ write_gen = 0;
+ dhandle_allocated = false;
+
+ /* Find out the max durable timestamp of the object from checkpoint. */
+ newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE;
+ durable_ts_found = prepared_updates = has_txn_updates_gt_than_ckpt_snap = false;
+
+ WT_RET(__wt_config_getones(session, config, "checkpoint", &cval));
+ __wt_config_subinit(session, &ckptconf, &cval);
+ for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) {
+ ret = __wt_config_subgets(session, &cval, "newest_start_durable_ts", &value);
+ if (ret == 0) {
+ newest_start_durable_ts = WT_MAX(newest_start_durable_ts, (wt_timestamp_t)value.val);
+ durable_ts_found = true;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+ ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &value);
+ if (ret == 0) {
+ newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)value.val);
+ durable_ts_found = true;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+ ret = __wt_config_subgets(session, &cval, "prepare", &value);
+ if (ret == 0) {
+ if (value.val)
+ prepared_updates = true;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+ /* Check ret before reading value: a failed lookup gives no guarantee value was filled in. */
+ ret = __wt_config_subgets(session, &cval, "newest_txn", &value);
+ if (ret == 0 && value.len != 0)
+ rollback_txnid = (uint64_t)value.val;
+ WT_RET_NOTFOUND_OK(ret);
+ ret = __wt_config_subgets(session, &cval, "addr", &value);
+ if (ret == 0)
+ addr_size = value.len;
+ WT_RET_NOTFOUND_OK(ret);
+ ret = __wt_config_subgets(session, &cval, "write_gen", &value);
+ if (ret == 0)
+ write_gen = (uint64_t)value.val;
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts);
+
+ /*
+ * Perform rollback to stable when the newest written transaction of the btree is greater than
+ * or equal to the checkpoint snapshot. The snapshot comparison is valid only when the btree
+ * write generation number is greater than the last checkpoint connection base write generation
+ * to confirm that the btree is modified in the previous restart cycle.
+ */
+ if (WT_CHECK_RECOVERY_FLAG_TXNID(session, rollback_txnid) &&
+ (write_gen >= S2C(session)->last_ckpt_base_write_gen)) {
+ has_txn_updates_gt_than_ckpt_snap = true;
+ /* Increment the inconsistent checkpoint stats counter. */
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_inconsistent_ckpt);
+ }
+
+ /*
+ * The rollback to stable will skip the tables during recovery and shutdown in the following
+ * conditions.
+ * 1. Empty table.
+ * 2. Table has timestamped updates without a stable timestamp.
+ */
+ if ((F_ISSET(S2C(session), WT_CONN_RECOVERING) ||
+ F_ISSET(S2C(session), WT_CONN_CLOSING_CHECKPOINT)) &&
+ (addr_size == 0 || (rollback_timestamp == WT_TS_NONE && max_durable_ts != WT_TS_NONE))) {
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "skip rollback to stable on file %s because %s", uri,
+ addr_size == 0 ? "its checkpoint address length is 0" :
+ "it has timestamped updates and the stable timestamp is 0");
+ return (0);
+ }
+
+ /*
+ * The rollback operation should be performed on this file based on the following:
+ * 1. The dhandle is present in the cache and tree is modified.
+ * 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp.
+ * 3. The checkpoint has prepared updates written to disk.
+ * 4. There is no durable timestamp in any checkpoint.
+ * 5. The checkpoint newest txn is greater than or equal to the snapshot min txn id.
+ */
+ WT_WITHOUT_DHANDLE(session,
+ WT_WITH_HANDLE_LIST_READ_LOCK(
+ session, (ret = __rts_btree_walk_check_btree_modified(session, uri, &modified))));
+
+ WT_ERR_NOTFOUND_OK(ret, false);
+
+ if (modified || max_durable_ts > rollback_timestamp || prepared_updates || !durable_ts_found ||
+ has_txn_updates_gt_than_ckpt_snap) {
+ /*
+ * Open a handle; we're potentially opening a lot of handles and there's no reason to cache
+ * all of them for future unknown use, discard on close.
+ */
+ ret = __wt_session_get_dhandle(session, uri, NULL, NULL, WT_DHANDLE_DISCARD);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "%s: unable to open handle%s", uri,
+ ret == EBUSY ? ", error indicates handle is unavailable due to concurrent use" : "");
+ dhandle_allocated = true;
+
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "tree rolled back because it is modified: %s, or its durable timestamp (%s) > stable "
+ "timestamp (%s): "
+ "%s, or it has prepared updates: %s, or durable "
+ "timestamp is not found: %s, or txnid (%" PRIu64
+ ") > recovery checkpoint snap min (%" PRIu64 "): %s",
+ S2BT(session)->modified ? "true" : "false",
+ __wt_timestamp_to_string(max_durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]),
+ max_durable_ts > rollback_timestamp ? "true" : "false",
+ prepared_updates ? "true" : "false", !durable_ts_found ? "true" : "false", rollback_txnid,
+ S2C(session)->recovery_ckpt_snap_min,
+ has_txn_updates_gt_than_ckpt_snap ? "true" : "false");
+
+ WT_ERR(__wt_rts_btree_walk_btree(session, rollback_timestamp));
+ } else
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "%s: tree skipped with durable timestamp: %s and stable timestamp: %s or txnid: %" PRIu64,
+ uri, __wt_timestamp_to_string(max_durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), rollback_txnid);
+
+ /*
+ * Truncate history store entries for the non-timestamped table.
+ * Exceptions:
+ * 1. Modified tree - a tree that was never checkpointed has a zero durable timestamp even if
+ * it is a timestamped table. Until the table type can be told apart in some way other than
+ * comparing the checkpointed durable timestamp to WT_TS_NONE, we need this exception.
+ * 2. In-memory database - In this scenario, there is no history store to truncate.
+ */
+ if ((!dhandle_allocated || !S2BT(session)->modified) && max_durable_ts == WT_TS_NONE &&
+ !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
+ WT_ERR(__wt_config_getones(session, config, "id", &cval));
+ btree_id = (uint32_t)cval.val;
+ WT_ERR(__wt_rts_history_btree_hs_truncate(session, btree_id));
+ }
+
+err:
+ if (dhandle_allocated)
+ WT_TRET(__wt_session_release_dhandle(session));
+ return (ret);
+}
+
+/*
+ * __wt_rts_btree_walk_btree --
+ * Roll back the session's current btree unless it is logged, a checkpoint handle or empty.
+ */
+int
+__wt_rts_btree_walk_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+
+ btree = S2BT(session);
+ conn = S2C(session);
+
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "rollback to stable connection logging enabled: %s and btree logging enabled: %s",
+ FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ? "true" : "false",
+ F_ISSET(btree, WT_BTREE_LOGGED) ? "true" : "false");
+
+ /* Files with commit-level durability (without timestamps) don't get their commits wiped. */
+ if (F_ISSET(btree, WT_BTREE_LOGGED))
+ return (0);
+
+ /* There is never anything to do for checkpoint handles. */
+ if (WT_READING_CHECKPOINT(session))
+ return (0);
+
+ /* There is nothing to do on an empty tree, that is, one without a root page. */
+ if (btree->root.page == NULL)
+ return (0);
+
+ return (__rts_btree_walk(session, rollback_timestamp));
+}
diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_history.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_history.c
new file mode 100644
index 00000000000..2d1d4ecd292
--- /dev/null
+++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_history.c
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 2014-present MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rts_history_delete_hs --
+ * Walk a key's history store versions backwards from the last one, deleting each version until
+ * reaching one whose stop timestamp is less than or equal to the specified timestamp.
+ */
+int
+__wt_rts_history_delete_hs(WT_SESSION_IMPL *session, WT_ITEM *key, wt_timestamp_t ts)
+{
+ WT_CURSOR *hs_cursor;
+ WT_DECL_ITEM(hs_key);
+ WT_DECL_RET;
+ WT_TIME_WINDOW *hs_tw;
+
+ /* Open a history store table cursor. */
+ WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
+ /*
+ * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system)
+ * outside the constraints of transactions. Therefore, there is no need for snapshot based
+ * visibility checks.
+ */
+ F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
+
+ WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
+
+ /*
+ * Scan the history store for the given btree and key with maximum start timestamp to let the
+ * search point to the last version of the key and start traversing backwards to delete all the
+ * records until the first one with a stop timestamp less than or equal to the specified
+ * timestamp.
+ */
+ hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX);
+ ret = __wt_curhs_search_near_before(session, hs_cursor);
+ for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
+ /* Retrieve the time window from the history cursor. */
+ __wt_hs_upd_time_window(hs_cursor, &hs_tw);
+
+ /*
+ * Remove all history store versions with a stop timestamp greater than the start/stop
+ * timestamp of a stable update in the data store; stop at the first version that is not.
+ */
+ if (hs_tw->stop_ts <= ts)
+ break;
+
+ WT_ERR(hs_cursor->remove(hs_cursor));
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
+
+ /*
+ * The globally visible start time windows are cleared during history store reconciliation.
+ * Treat them also as a stable entry removal from the history store.
+ */
+ if (hs_tw->start_ts == ts || hs_tw->start_ts == WT_TS_NONE)
+ WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
+ else
+ WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable);
+ }
+ WT_ERR_NOTFOUND_OK(ret, false);
+
+err:
+ __wt_scr_free(session, &hs_key);
+ WT_TRET(hs_cursor->close(hs_cursor));
+ return (ret);
+}
+
+/*
+ * __wt_rts_history_btree_hs_truncate --
+ * Wipe all history store updates for a btree id (non-timestamped tables, removed backup ids).
+ */
+int
+__wt_rts_history_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id)
+{
+ WT_CURSOR *hs_cursor_start, *hs_cursor_stop;
+ WT_DECL_ITEM(hs_key);
+ WT_DECL_RET;
+ WT_SESSION *truncate_session;
+ wt_timestamp_t hs_start_ts;
+ uint64_t hs_counter;
+ uint32_t hs_btree_id;
+
+ hs_cursor_start = hs_cursor_stop = NULL;
+ hs_btree_id = 0;
+ truncate_session = (WT_SESSION *)session;
+
+ WT_RET(__wt_scr_alloc(session, 0, &hs_key));
+
+ /* Open a history store start cursor; if the search finds nothing there is nothing to do. */
+ WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor_start));
+ F_SET(hs_cursor_start, WT_CURSTD_HS_READ_COMMITTED);
+
+ hs_cursor_start->set_key(hs_cursor_start, 1, btree_id);
+ WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor_start), true);
+ if (ret == WT_NOTFOUND) {
+ ret = 0;
+ goto done;
+ }
+
+ /* Open a history store stop cursor, allowed to read across btrees, keyed on the next id. */
+ WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor_stop));
+ F_SET(hs_cursor_stop, WT_CURSTD_HS_READ_COMMITTED | WT_CURSTD_HS_READ_ACROSS_BTREE);
+
+ hs_cursor_stop->set_key(hs_cursor_stop, 1, btree_id + 1);
+ WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor_stop), true);
+
+#ifdef HAVE_DIAGNOSTIC
+ /* If we get not found, we are at the largest btree id in the history store. */
+ if (ret == 0) {
+ hs_cursor_stop->get_key(hs_cursor_stop, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter);
+ WT_ASSERT(session, hs_btree_id > btree_id);
+ }
+#endif
+
+ do {
+ WT_ASSERT(session, ret == WT_NOTFOUND || hs_btree_id > btree_id);
+
+ WT_ERR_NOTFOUND_OK(hs_cursor_stop->prev(hs_cursor_stop), true);
+ /* Since we could find the start point, we must be able to find the stop point. */
+ if (ret == WT_NOTFOUND)
+ WT_ERR_PANIC(
+ session, ret, "cannot locate the stop point to truncate the history store.");
+ hs_cursor_stop->get_key(hs_cursor_stop, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter);
+ } while (hs_btree_id != btree_id);
+
+ WT_ERR(
+ truncate_session->truncate(truncate_session, NULL, hs_cursor_start, hs_cursor_stop, NULL));
+
+ WT_STAT_CONN_DATA_INCR(session, cache_hs_btree_truncate);
+
+ __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS,
+ "Rollback to stable has truncated records for btree %u from the history store", btree_id);
+
+done:
+err:
+ __wt_scr_free(session, &hs_key);
+ if (hs_cursor_start != NULL)
+ WT_TRET(hs_cursor_start->close(hs_cursor_start));
+ if (hs_cursor_stop != NULL)
+ WT_TRET(hs_cursor_stop->close(hs_cursor_stop));
+
+ return (ret);
+}
+
+/*
+ * __wt_rts_history_final_pass --
+ * Perform rollback to stable on the history store to remove any entries newer than the stable
+ * timestamp, then truncate the entries of any btree ids on the partial backup remove list.
+ */
+int
+__wt_rts_history_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM cval, durableval, key;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ wt_timestamp_t max_durable_ts, newest_stop_durable_ts, newest_stop_ts;
+ size_t i;
+ char *config;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+
+ config = NULL;
+ conn = S2C(session);
+
+ WT_RET(__wt_metadata_search(session, WT_HS_URI, &config));
+
+ /*
+ * Find out the max durable timestamp of the history store from checkpoint. Most of the history
+ * store updates have stop timestamp either greater or equal to the start timestamp except for
+ * the updates written for the prepared updates on the data store. To abort the updates with no
+ * stop timestamp, we must include the newest stop timestamp also into the calculation of
+ * maximum timestamp of the history store.
+ */
+ newest_stop_durable_ts = newest_stop_ts = WT_TS_NONE;
+ WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval));
+ __wt_config_subinit(session, &ckptconf, &cval);
+ for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) {
+ ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &durableval);
+ if (ret == 0)
+ newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)durableval.val);
+ WT_ERR_NOTFOUND_OK(ret, false);
+ ret = __wt_config_subgets(session, &cval, "newest_stop_ts", &durableval);
+ if (ret == 0)
+ newest_stop_ts = WT_MAX(newest_stop_ts, (wt_timestamp_t)durableval.val);
+ WT_ERR_NOTFOUND_OK(ret, false);
+ }
+ max_durable_ts = WT_MAX(newest_stop_ts, newest_stop_durable_ts);
+ WT_ERR(__wt_session_get_dhandle(session, WT_HS_URI, NULL, NULL, 0));
+
+ /*
+ * The rollback operation should be performed on the history store file when the checkpoint
+ * max of the newest stop and stop durable timestamps is greater than the rollback timestamp.
+ * But skip if there is no stable timestamp.
+ *
+ * Note that the corresponding code for RTS btree apply also checks whether there _are_
+ * timestamped updates by checking max_durable_ts; that check is redundant here for several
+ * reasons, the most immediate being that max_durable_ts cannot be none (zero) because it's
+ * greater than rollback_timestamp, which is itself greater than zero.
+ */
+ if (max_durable_ts > rollback_timestamp && rollback_timestamp != WT_TS_NONE) {
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "tree rolled back with durable timestamp: %s",
+ __wt_timestamp_to_string(max_durable_ts, ts_string[0]));
+ WT_TRET(__wt_rts_btree_walk_btree(session, rollback_timestamp));
+ } else
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "tree skipped with durable timestamp: %s and stable timestamp: %s",
+ __wt_timestamp_to_string(max_durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
+
+ /*
+ * Truncate history store entries from the partial backup remove list. The list holds all of the
+ * btree ids that do not exist as part of the database anymore due to performing a selective
+ * restore from backup. The list is walked until its terminating zero id.
+ */
+ if (F_ISSET(conn, WT_CONN_BACKUP_PARTIAL_RESTORE) && conn->partial_backup_remove_ids != NULL)
+ for (i = 0; conn->partial_backup_remove_ids[i] != 0; ++i)
+ WT_ERR(__wt_rts_history_btree_hs_truncate(session, conn->partial_backup_remove_ids[i]));
+err:
+ if (session->dhandle != NULL)
+ WT_TRET(__wt_session_release_dhandle(session));
+ __wt_free(session, config);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_visibility.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_visibility.c
new file mode 100644
index 00000000000..11bd6ac7a5e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_visibility.c
@@ -0,0 +1,156 @@
+/*-
+ * Copyright (c) 2014-present MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rts_visibility_has_stable_update --
+ * Report whether an update chain, assumed already processed, has a valid, non-aborted entry.
+ */
+bool
+__wt_rts_visibility_has_stable_update(WT_UPDATE *upd)
+{
+ for (; upd != NULL; upd = upd->next)
+ if (upd->type != WT_UPDATE_INVALID && upd->txnid != WT_TXN_ABORTED)
+ return (true);
+ return (false);
+}
+
+/*
+ * __wt_rts_visibility_txn_visible_id --
+ * Decide whether a transaction id counts as visible to rollback to stable.
+ */
+bool
+__wt_rts_visibility_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * Outside of recovery all the data is assumed to be visible. During recovery, only a full
+ * checkpoint writes the metadata with a snapshot: if the recovered checkpoint snapshot min and
+ * max are both none, return true, i.e., the updates are visible as well.
+ * The connection's recovery snapshot fields are only examined when recovering.
+ */
+ if (!F_ISSET(conn, WT_CONN_RECOVERING) ||
+ (conn->recovery_ckpt_snap_min == WT_TXN_NONE && conn->recovery_ckpt_snap_max == WT_TXN_NONE))
+ return (true);
+
+ /* Otherwise check the id against the snapshot recovered from the checkpoint. */
+ return (
+ __wt_txn_visible_id_snapshot(id, conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max,
+ conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count));
+}
+
+/*
+ * __rts_visibility_get_ref_max_durable_timestamp --
+ * Returns the ref aggregated max durable timestamp. Outside the history store this is the max
+ * of the start and stop durable timestamps. Most history store updates have a stop timestamp
+ * greater than or equal to the start timestamp, except for those written for prepared updates
+ * on the data store; to abort updates with no stop timestamp, the history store calculation
+ * instead takes the max of the newest stop durable timestamp and the newest stop timestamp.
+ */
+static wt_timestamp_t
+__rts_visibility_get_ref_max_durable_timestamp(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta)
+{
+ wt_timestamp_t other_ts;
+
+ other_ts = WT_IS_HS(session->dhandle) ? ta->newest_stop_ts : ta->newest_start_durable_ts;
+ return (WT_MAX(ta->newest_stop_durable_ts, other_ts));
+}
+
+/*
+ * __wt_rts_visibility_page_needs_abort --
+ * Check whether the page needs rollback: true if its max durable timestamp is newer than the
+ * given timestamp, it is prepared or (page_del and address cases) its newest txn needs it.
+ */
+bool
+__wt_rts_visibility_page_needs_abort(
+ WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
+{
+ WT_ADDR *addr;
+ WT_CELL_UNPACK_ADDR vpack;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ wt_timestamp_t durable_ts;
+ uint64_t newest_txn;
+ uint32_t i;
+ char ts_string[WT_TS_INT_STRING_SIZE];
+ const char *tag;
+ bool prepared, result;
+
+ addr = ref->addr;
+ mod = ref->page == NULL ? NULL : ref->page->modify;
+ durable_ts = WT_TS_NONE;
+ newest_txn = WT_TXN_NONE;
+ tag = "undefined state";
+ prepared = result = false;
+
+ /*
+ * The rollback operation should be performed on this page when any one of the following is
+ * greater than the given timestamp or during recovery if the newest transaction id on the page
+ * is greater than or equal to recovered checkpoint snapshot min:
+ * 1. The reconciled replace page max durable timestamp.
+ * 2. The reconciled multi page max durable timestamp.
+ * 3. For just-instantiated deleted pages that have not otherwise been modified, the durable
+ * timestamp in the page delete information. This timestamp isn't reflected in the address's
+ * time aggregate.
+ * 4. The on page address max durable timestamp.
+ * 5. The off page address max durable timestamp.
+ */
+ if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) {
+ tag = "reconciled replace block";
+ durable_ts = __rts_visibility_get_ref_max_durable_timestamp(session, &mod->mod_replace.ta);
+ prepared = mod->mod_replace.ta.prepare;
+ result = (durable_ts > rollback_timestamp) || prepared;
+ } else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) {
+ tag = "reconciled multi block";
+ /* Calculate the max durable timestamp by traversing all multi addresses. */
+ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ durable_ts = WT_MAX(
+ durable_ts, __rts_visibility_get_ref_max_durable_timestamp(session, &multi->addr.ta));
+ if (multi->addr.ta.prepare)
+ prepared = true;
+ }
+ result = (durable_ts > rollback_timestamp) || prepared;
+ } else if (mod != NULL && mod->instantiated && !__wt_page_is_modified(ref->page) &&
+ ref->page_del != NULL) {
+ tag = "page_del info";
+ durable_ts = ref->page_del->durable_timestamp;
+ prepared = ref->page_del->prepare_state == WT_PREPARE_INPROGRESS ||
+ ref->page_del->prepare_state == WT_PREPARE_LOCKED;
+ newest_txn = ref->page_del->txnid;
+ result = (durable_ts > rollback_timestamp) || prepared ||
+ WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn);
+ } else if (!__wt_off_page(ref->home, addr)) {
+ tag = "on page cell";
+ /* Unpack the on-page address cell to get at the time aggregate it carries. */
+ __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack);
+ durable_ts = __rts_visibility_get_ref_max_durable_timestamp(session, &vpack.ta);
+ prepared = vpack.ta.prepare;
+ newest_txn = vpack.ta.newest_txn;
+ result = (durable_ts > rollback_timestamp) || prepared ||
+ WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn);
+ } else if (addr != NULL) {
+ tag = "address";
+ durable_ts = __rts_visibility_get_ref_max_durable_timestamp(session, &addr->ta);
+ prepared = addr->ta.prepare;
+ newest_txn = addr->ta.newest_txn;
+ result = (durable_ts > rollback_timestamp) || prepared ||
+ WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn);
+ }
+
+ __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
+ "%p: page with %s durable timestamp: %s, newest txn: %" PRIu64
+ " and prepared updates: %s needs abort: %s",
+ (void *)ref, tag, __wt_timestamp_to_string(durable_ts, ts_string), newest_txn,
+ prepared ? "true" : "false", result ? "true" : "false");
+
+ return (result);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 16e475bff58..574101c266f 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -1396,7 +1396,8 @@ __session_salvage_worker(WT_SESSION_IMPL *session, const char *uri, const char *
{
WT_RET(__wt_schema_worker(
session, uri, __wt_salvage, NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE));
- WT_RET(__wt_schema_worker(session, uri, NULL, __wt_rollback_to_stable_one, cfg, 0));
+ WT_RET(
+ __wt_schema_worker(session, uri, NULL, S2C(session)->rts->rollback_to_stable_one, cfg, 0));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 0aca6b79756..8ab3e75bf5a 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -2432,7 +2432,7 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char **cfg)
__wt_verbose(session, WT_VERB_RTS,
"performing shutdown rollback to stable with stable timestamp: %s",
__wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string));
- WT_TRET(__wt_rollback_to_stable(session, cfg, true));
+ WT_TRET(conn->rts->rollback_to_stable(session, cfg, true));
}
s = NULL;
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 300798f3b31..3e95d24e600 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -1014,7 +1014,7 @@ done:
__wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string[0]),
__wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string[1]));
rts_executed = true;
- WT_ERR(__wt_rollback_to_stable(session, NULL, true));
+ WT_ERR(conn->rts->rollback_to_stable(session, NULL, true));
}
/*
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
deleted file mode 100644
index 02a05cbd0bb..00000000000
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ /dev/null
@@ -1,2056 +0,0 @@
-/*-
- * Copyright (c) 2014-present MongoDB, Inc.
- * Copyright (c) 2008-2014 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-#define WT_CHECK_RECOVERY_FLAG_TXNID(session, txnid) \
- (F_ISSET(S2C(session), WT_CONN_RECOVERING) && S2C(session)->recovery_ckpt_snap_min != 0 && \
- (txnid) >= S2C(session)->recovery_ckpt_snap_min)
-
-/* Enable rollback to stable verbose messaging during recovery. */
-#define WT_VERB_RECOVERY_RTS(session) \
- (F_ISSET(S2C(session), WT_CONN_RECOVERING) ? \
- WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RECOVERY, WT_VERB_RTS})) : \
- WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RTS})))
-
-static bool __rollback_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id);
-
-/*
- * __rollback_delete_hs --
- * Delete the updates for a key in the history store until the first update (including) that is
- * larger than or equal to the specified timestamp.
- */
-static int
-__rollback_delete_hs(WT_SESSION_IMPL *session, WT_ITEM *key, wt_timestamp_t ts)
-{
- WT_CURSOR *hs_cursor;
- WT_DECL_ITEM(hs_key);
- WT_DECL_RET;
- WT_TIME_WINDOW *hs_tw;
-
- /* Open a history store table cursor. */
- WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
- /*
- * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system)
- * outside the constraints of transactions. Therefore, there is no need for snapshot based
- * visibility checks.
- */
- F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
-
- WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
-
- /*
- * Scan the history store for the given btree and key with maximum start timestamp to let the
- * search point to the last version of the key and start traversing backwards to delete all the
- * records until the first update with the start timestamp larger than or equal to the specified
- * timestamp.
- */
- hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX);
- ret = __wt_curhs_search_near_before(session, hs_cursor);
- for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
- /* Retrieve the time window from the history cursor. */
- __wt_hs_upd_time_window(hs_cursor, &hs_tw);
-
- /*
- * Remove all history store versions with a stop timestamp greater than the start/stop
- * timestamp of a stable update in the data store.
- */
- if (hs_tw->stop_ts <= ts)
- break;
-
- WT_ERR(hs_cursor->remove(hs_cursor));
- WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
-
- /*
- * The globally visible start time window's are cleared during history store reconciliation.
- * Treat them also as a stable entry removal from the history store.
- */
- if (hs_tw->start_ts == ts || hs_tw->start_ts == WT_TS_NONE)
- WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
- else
- WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable);
- }
- WT_ERR_NOTFOUND_OK(ret, false);
-
-err:
- __wt_scr_free(session, &hs_key);
- WT_TRET(hs_cursor->close(hs_cursor));
- return (ret);
-}
-
-/*
- * __rollback_abort_update --
- * Abort updates in an update change with timestamps newer than the rollback timestamp. Also,
- * clear the history store flag for the first stable update in the update.
- */
-static int
-__rollback_abort_update(WT_SESSION_IMPL *session, WT_ITEM *key, WT_UPDATE *first_upd,
- wt_timestamp_t rollback_timestamp, bool *stable_update_found)
-{
- WT_UPDATE *stable_upd, *tombstone, *upd;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
- bool txn_id_visible;
-
- stable_upd = tombstone = NULL;
- txn_id_visible = false;
- if (stable_update_found != NULL)
- *stable_update_found = false;
- for (upd = first_upd; upd != NULL; upd = upd->next) {
- /* Skip the updates that are aborted. */
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- /*
- * An unstable update needs to be aborted if any of the following are true:
- * 1. An update is invisible based on the checkpoint snapshot during recovery.
- * 2. The update durable timestamp is greater than the stable timestamp.
- * 3. The update is a prepared update.
- *
- * Usually during recovery, there are no in memory updates present on the page. But
- * whenever an unstable fast truncate operation is written to the disk, as part
- * of the rollback to stable page read, it instantiates the tombstones on the page.
- * The transaction id validation is ignored in all scenarios except recovery.
- */
- txn_id_visible = __rollback_txn_visible_id(session, upd->txnid);
- if (!txn_id_visible || rollback_timestamp < upd->durable_ts ||
- upd->prepare_state == WT_PREPARE_INPROGRESS) {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "rollback to stable update aborted with txnid: %" PRIu64
- ", txnid not visible: %s, or stable timestamp (%s) < durable timestamp (%s): %s, or "
- "prepare state (%d) is in progress: %s",
- upd->txnid, !txn_id_visible ? "true" : "false",
- __wt_timestamp_to_string(rollback_timestamp, ts_string[1]),
- __wt_timestamp_to_string(upd->durable_ts, ts_string[0]),
- rollback_timestamp < upd->durable_ts ? "true" : "false", upd->prepare_state,
- upd->prepare_state == WT_PREPARE_INPROGRESS ? "true" : "false");
-
- upd->txnid = WT_TXN_ABORTED;
- WT_STAT_CONN_INCR(session, txn_rts_upd_aborted);
- } else {
- /* Valid update is found. */
- stable_upd = upd;
- break;
- }
- }
-
- /*
- * Clear the history store flags for the stable update to indicate that this update should be
- * written to the history store later. The next time when this update is moved into the history
- * store, it will have a different stop time point.
- */
- if (stable_upd != NULL) {
- if (F_ISSET(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS)) {
- /* Find the update following a stable tombstone. */
- if (stable_upd->type == WT_UPDATE_TOMBSTONE) {
- tombstone = stable_upd;
- for (stable_upd = stable_upd->next; stable_upd != NULL;
- stable_upd = stable_upd->next) {
- if (stable_upd->txnid != WT_TXN_ABORTED) {
- WT_ASSERT(session,
- stable_upd->type != WT_UPDATE_TOMBSTONE &&
- F_ISSET(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS));
- break;
- }
- }
- }
-
- /*
- * Delete the first stable update and any newer update from the history store. If the
- * update following the stable tombstone is removed by obsolete check, no need to remove
- * that update from the history store as it has a globally visible tombstone. In that
- * case, it is enough to delete everything up until to the tombstone timestamp.
- */
- WT_RET(__rollback_delete_hs(
- session, key, stable_upd == NULL ? tombstone->start_ts : stable_upd->start_ts));
-
- /*
- * Clear the history store flags for the first stable update. Otherwise, it will not be
- * moved to history store again.
- */
- if (stable_upd != NULL)
- F_CLR(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS);
- if (tombstone != NULL)
- F_CLR(tombstone, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS);
- }
- if (stable_update_found != NULL)
- *stable_update_found = true;
- }
-
- return (0);
-}
-
-/*
- * __rollback_abort_insert_list --
- * Apply the update abort check to each entry in an insert skip list. Return how many entries
- * had stable updates.
- */
-static int
-__rollback_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *head,
- wt_timestamp_t rollback_timestamp, uint32_t *stable_updates_count)
-{
- WT_DECL_ITEM(key);
- WT_DECL_RET;
- WT_INSERT *ins;
- uint64_t recno;
- uint8_t *memp;
- bool stable_update_found;
-
- WT_ERR(
- __wt_scr_alloc(session, page->type == WT_PAGE_ROW_LEAF ? 0 : WT_INTPACK64_MAXSIZE, &key));
-
- WT_SKIP_FOREACH (ins, head)
- if (ins->upd != NULL) {
- if (page->type == WT_PAGE_ROW_LEAF) {
- key->data = WT_INSERT_KEY(ins);
- key->size = WT_INSERT_KEY_SIZE(ins);
- } else {
- recno = WT_INSERT_RECNO(ins);
- memp = key->mem;
- WT_ERR(__wt_vpack_uint(&memp, 0, recno));
- key->size = WT_PTRDIFF(memp, key->data);
- }
- WT_ERR(__rollback_abort_update(
- session, key, ins->upd, rollback_timestamp, &stable_update_found));
- if (stable_update_found && stable_updates_count != NULL)
- (*stable_updates_count)++;
- if (!stable_update_found && page->type == WT_PAGE_ROW_LEAF &&
- !F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
- /*
- * When a new key is added to a page and the page is then checkpointed, updates for
- * that key can be present in the History Store while the key isn't present in the
- * disk image. RTS will then only remove these updates when there is a stable update
- * on-chain. These updates still need removing when no stable updates are on-chain,
- * so do so here explicitly. Pass in rollback_timestamp + 1 as __rollback_delete_hs
- * removes updates inclusive of the provided timestamp, but we only want to remove
- * unstable updates.
- *
- * FIXME-WT-10017: WT-9846 is an interim fix only for row-store while we investigate
- * the impacts of a long term correction in WT-10017. Once completed this change can
- * be reverted.
- */
- WT_ERR(__rollback_delete_hs(session, key, rollback_timestamp + 1));
- }
-
-err:
- __wt_scr_free(session, &key);
- return (ret);
-}
-
-/*
- * __rollback_has_stable_update --
- * Check if an update chain has a stable update on it. Assume the update chain has already been
- * processed so all we need to do is look for a valid, non-aborted entry.
- */
-static bool
-__rollback_has_stable_update(WT_UPDATE *upd)
-{
- while (upd != NULL && (upd->type == WT_UPDATE_INVALID || upd->txnid == WT_TXN_ABORTED))
- upd = upd->next;
- return (upd != NULL);
-}
-
-/*
- * __rollback_col_modify --
- * Add the provided update to the head of the update list.
- */
-static inline int
-__rollback_col_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, uint64_t recno)
-{
- WT_CURSOR_BTREE cbt;
- WT_DECL_RET;
-
- __wt_btcur_init(session, &cbt);
- __wt_btcur_open(&cbt);
-
- /* Search the page. */
- WT_ERR(__wt_col_search(&cbt, recno, ref, true, NULL));
-
- /* Apply the modification. */
-#ifdef HAVE_DIAGNOSTIC
- WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true, false));
-#else
- WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true));
-#endif
-
-err:
- /* Free any resources that may have been cached in the cursor. */
- WT_TRET(__wt_btcur_close(&cbt, true));
-
- return (ret);
-}
-
-/*
- * __rollback_row_modify --
- * Add the provided update to the head of the update list.
- */
-static inline int
-__rollback_row_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, WT_ITEM *key)
-{
- WT_CURSOR_BTREE cbt;
- WT_DECL_RET;
-
- __wt_btcur_init(session, &cbt);
- __wt_btcur_open(&cbt);
-
- /* Search the page. */
- WT_ERR(__wt_row_search(&cbt, key, true, ref, true, NULL));
-
- /* Apply the modification. */
-#ifdef HAVE_DIAGNOSTIC
- WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true, false));
-#else
- WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true));
-#endif
-
-err:
- /* Free any resources that may have been cached in the cursor. */
- WT_TRET(__wt_btcur_close(&cbt, true));
-
- return (ret);
-}
-
-/*
- * __rollback_txn_visible_id --
- * Check if the transaction id is visible or not.
- */
-static bool
-__rollback_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
-{
- WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
-
- /* If not recovery then assume all the data as visible. */
- if (!F_ISSET(conn, WT_CONN_RECOVERING))
- return (true);
-
- /*
- * Only full checkpoint writes the metadata with snapshot. If the recovered checkpoint snapshot
- * details are none then return false i.e, updates are visible.
- */
- if (conn->recovery_ckpt_snap_min == WT_TXN_NONE && conn->recovery_ckpt_snap_max == WT_TXN_NONE)
- return (true);
-
- return (
- __wt_txn_visible_id_snapshot(id, conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max,
- conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count));
-}
-
-/*
- * __rollback_ondisk_fixup_key --
- * Abort updates in the history store and replace the on-disk value with an update that
- * satisfies the given timestamp.
- */
-static int
-__rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, uint64_t recno,
- WT_ITEM *row_key, WT_CELL_UNPACK_KV *unpack, wt_timestamp_t rollback_timestamp)
-{
- WT_CURSOR *hs_cursor;
- WT_DECL_ITEM(full_value);
- WT_DECL_ITEM(hs_key);
- WT_DECL_ITEM(hs_value);
- WT_DECL_ITEM(key);
- WT_DECL_ITEM(key_string);
- WT_DECL_RET;
- WT_PAGE *page;
- WT_TIME_WINDOW *hs_tw;
- WT_UPDATE *tombstone, *upd;
- wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts, pinned_ts;
- uint64_t hs_counter, type_full;
- uint32_t hs_btree_id;
- uint8_t *memp;
- uint8_t type;
- char ts_string[4][WT_TS_INT_STRING_SIZE];
- char tw_string[WT_TIME_STRING_SIZE];
- bool valid_update_found;
-#ifdef HAVE_DIAGNOSTIC
- bool first_record;
-#endif
-
- page = ref->page;
-
- hs_cursor = NULL;
- tombstone = upd = NULL;
- hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE;
- hs_btree_id = S2BT(session)->id;
- valid_update_found = false;
-#ifdef HAVE_DIAGNOSTIC
- first_record = true;
-#endif
-
- /* Allocate buffers for the data store and history store key. */
- WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
- WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
-
- if (rip != NULL) {
- if (row_key != NULL)
- key = row_key;
- else {
- /* Unpack a row key. */
- WT_ERR(__wt_scr_alloc(session, 0, &key));
- WT_ERR(__wt_row_leaf_key(session, page, rip, key, false));
- }
- } else {
- /* Manufacture a column key. */
- WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
- memp = key->mem;
- WT_ERR(__wt_vpack_uint(&memp, 0, recno));
- key->size = WT_PTRDIFF(memp, key->data);
- }
-
- WT_ERR(__wt_scr_alloc(session, 0, &key_string));
- __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
- "rolling back the on-disk key: %s",
- __wt_key_string(session, key->data, key->size, S2BT(session)->key_format, key_string));
-
- WT_ERR(__wt_scr_alloc(session, 0, &full_value));
- WT_ERR(__wt_page_cell_data_ref_kv(session, page, unpack, full_value));
- /*
- * We can read overflow removed value if checkpoint has run before rollback to stable. In this
- * case, we have already appended the on page value to the update chain. At this point, we have
- * visited the update chain and decided the value is not stable. In addition, checkpoint must
- * have moved this value to the history store as a full value. Therefore, we can safely ignore
- * the on page value if it is overflow removed.
- */
- if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM)
- ret = 0;
- else
- WT_ERR(__wt_buf_set(session, full_value, full_value->data, full_value->size));
-
- newer_hs_durable_ts = unpack->tw.durable_start_ts;
-
- __wt_txn_pinned_timestamp(session, &pinned_ts);
-
- /* Open a history store table cursor. */
- WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
- /*
- * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system)
- * outside the constraints of transactions. Therefore, there is no need for snapshot based
- * visibility checks.
- */
- F_SET(hs_cursor, WT_CURSTD_HS_READ_ALL);
-
- /*
- * Scan the history store for the given btree and key with maximum start timestamp to let the
- * search point to the last version of the key and start traversing backwards to find out the
- * satisfying record according the given timestamp. Any satisfying history store record is moved
- * into data store and removed from history store. If none of the history store records satisfy
- * the given timestamp, the key is removed from data store.
- */
- hs_cursor->set_key(hs_cursor, 4, hs_btree_id, key, WT_TS_MAX, UINT64_MAX);
- ret = __wt_curhs_search_near_before(session, hs_cursor);
- for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
-
- /* Get current value and convert to full update if it is a modify. */
- WT_ERR(hs_cursor->get_value(
- hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value));
- type = (uint8_t)type_full;
-
- /* Retrieve the time window from the history cursor. */
- __wt_hs_upd_time_window(hs_cursor, &hs_tw);
-
- /*
- * We have a tombstone on the history update and it is obsolete according to the timestamp
- * and txnid, so no need to restore it. These obsolete updates are written to the disk when
- * they are not obsolete at the time of reconciliation by an eviction thread and later they
- * become obsolete according to the checkpoint.
- */
- if (__rollback_txn_visible_id(session, hs_tw->stop_txn) &&
- hs_tw->durable_stop_ts <= pinned_ts) {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "history store stop is obsolete with time window: %s and pinned timestamp: %s",
- __wt_time_window_to_string(hs_tw, tw_string),
- __wt_timestamp_to_string(pinned_ts, ts_string[0]));
- WT_ERR(hs_cursor->remove(hs_cursor));
- WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
- continue;
- }
-
- /*
- * Do not include history store updates greater than on-disk data store version to construct
- * a full update to restore except when the on-disk update is prepared. Including more
- * recent updates than the on-disk version shouldn't be problem as the on-disk version in
- * history store is always a full update. It is better to not to include those updates as it
- * unnecessarily increases the rollback to stable time.
- *
- * Comparing with timestamps here has no problem unlike in search flow where the timestamps
- * may be reset during reconciliation. RTS detects an on-disk update is unstable based on
- * the written proper timestamp, so comparing against it with history store shouldn't have
- * any problem.
- */
- if (hs_tw->start_ts <= unpack->tw.start_ts || unpack->tw.prepare) {
- if (type == WT_UPDATE_MODIFY)
- WT_ERR(__wt_modify_apply_item(
- session, S2BT(session)->value_format, full_value, hs_value->data));
- else {
- WT_ASSERT(session, type == WT_UPDATE_STANDARD);
- WT_ERR(__wt_buf_set(session, full_value, hs_value->data, hs_value->size));
- }
- } else
- __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
- "history store update more recent than on-disk update with time window: %s and type: "
- "%" PRIu8,
- __wt_time_window_to_string(hs_tw, tw_string), type);
-
- /*
- * Verify the history store timestamps are in order. The start timestamp may be equal to the
- * stop timestamp if the original update's commit timestamp is in order. We may see records
- * newer than or equal to the onpage value if eviction runs concurrently with checkpoint. In
- * that case, don't verify the first record.
- *
- * It is possible during a prepared transaction rollback, the history store update that have
- * its own stop timestamp doesn't get removed leads to duplicate records in history store
- * after further operations on that same key. Rollback to stable should ignore such records
- * for timestamp ordering verification.
- *
- * If we have fixed the missing timestamps, then the newer update reinserted with an older
- * timestamp may have a durable timestamp that is smaller than the current stop durable
- * timestamp.
- *
- * It is possible that there can be an update in the history store with a max stop timestamp
- * in the middle of the same key updates. This occurs when the checkpoint writes the
- * committed prepared update and further updates on that key including the history store
- * changes before the transaction fixes the history store update to have a proper stop
- * timestamp. It is a rare scenario.
- */
- WT_ASSERT(session,
- hs_stop_durable_ts <= newer_hs_durable_ts || hs_start_ts == hs_stop_durable_ts ||
- hs_start_ts == newer_hs_durable_ts || newer_hs_durable_ts == hs_durable_ts ||
- first_record || hs_stop_durable_ts == WT_TS_MAX);
-
- if (hs_stop_durable_ts < newer_hs_durable_ts)
- WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_stop_older_than_newer_start);
-
- /*
- * Validate the timestamps in the key and the cell are same. This must be validated only
- * after verifying it's stop time window is not globally visible. The start timestamps of
- * the time window are cleared when they are globally visible and there will be no stop
- * timestamp in the history store whenever a prepared update is written to the data store.
- */
- WT_ASSERT(session,
- (hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == hs_start_ts) &&
- (hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == hs_durable_ts) &&
- ((hs_tw->durable_stop_ts == 0 && hs_stop_durable_ts == WT_TS_MAX) ||
- hs_tw->durable_stop_ts == hs_stop_durable_ts));
-
- /*
- * Stop processing when we find a stable update according to the given timestamp and
- * transaction id.
- */
- if (__rollback_txn_visible_id(session, hs_tw->start_txn) &&
- hs_tw->durable_start_ts <= rollback_timestamp) {
- __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
- "history store update valid with time window: %s, type: %" PRIu8
- " and stable timestamp: %s",
- __wt_time_window_to_string(hs_tw, tw_string), type,
- __wt_timestamp_to_string(rollback_timestamp, ts_string[0]));
- WT_ASSERT(session, unpack->tw.prepare || hs_tw->start_ts <= unpack->tw.start_ts);
- valid_update_found = true;
- break;
- }
-
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "history store update aborted with time window: %s, type: %" PRIu8
- " and stable timestamp: %s",
- __wt_time_window_to_string(hs_tw, tw_string), type,
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
-
- /*
- * Start time point of the current record may be used as stop time point of the previous
- * record. Save it to verify against the previous record and check if we need to append the
- * stop time point as a tombstone when we rollback the history store record.
- */
- newer_hs_durable_ts = hs_durable_ts;
-#ifdef HAVE_DIAGNOSTIC
- first_record = false;
-#endif
-
- WT_ERR(hs_cursor->remove(hs_cursor));
- WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
- WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable);
- }
-
- /*
- * If we found a history value that satisfied the given timestamp, add it to the update list.
- * Otherwise remove the key by adding a tombstone.
- */
- if (valid_update_found) {
- /* Retrieve the time window from the history cursor. */
- __wt_hs_upd_time_window(hs_cursor, &hs_tw);
- WT_ASSERT(session,
- hs_tw->start_ts < unpack->tw.start_ts || hs_tw->start_txn < unpack->tw.start_txn);
- WT_ERR(__wt_upd_alloc(session, full_value, WT_UPDATE_STANDARD, &upd, NULL));
-
- /*
- * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because the
- * connections write generation will be initialized after rollback to stable and the updates
- * in the cache will be problematic. The transaction id of pages which are in disk will be
- * automatically reset as part of unpacking cell when loaded to cache.
- */
- if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
- upd->txnid = WT_TXN_NONE;
- else
- upd->txnid = hs_tw->start_txn;
- upd->durable_ts = hs_tw->durable_start_ts;
- upd->start_ts = hs_tw->start_ts;
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "history store update restored txnid: %" PRIu64 ", start_ts: %s and durable_ts: %s",
- upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
- __wt_timestamp_to_string(upd->durable_ts, ts_string[1]));
-
- /*
- * Set the flag to indicate that this update has been restored from history store for the
- * rollback to stable operation.
- */
- F_SET(upd, WT_UPDATE_RESTORED_FROM_HS);
- WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_updates);
-
- /*
- * We have a tombstone on the original update chain and it is stable according to the
- * timestamp and txnid, we need to restore that as well.
- */
- if (__rollback_txn_visible_id(session, hs_tw->stop_txn) &&
- hs_tw->durable_stop_ts <= rollback_timestamp) {
- /*
- * The restoring tombstone timestamp must be zero or less than previous update start
- * timestamp.
- */
- WT_ASSERT(session,
- hs_stop_durable_ts == WT_TS_NONE || hs_stop_durable_ts < newer_hs_durable_ts ||
- unpack->tw.prepare);
-
- WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL));
- /*
- * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because
- * the connections write generation will be initialized after rollback to stable and the
- * updates in the cache will be problematic. The transaction id of pages which are in
- * disk will be automatically reset as part of unpacking cell when loaded to cache.
- */
- if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
- tombstone->txnid = WT_TXN_NONE;
- else
- tombstone->txnid = hs_tw->stop_txn;
- tombstone->durable_ts = hs_tw->durable_stop_ts;
- tombstone->start_ts = hs_tw->stop_ts;
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "history store tombstone restored txnid: %" PRIu64
- ", start_ts: %s and durable_ts: %s",
- tombstone->txnid, __wt_timestamp_to_string(tombstone->start_ts, ts_string[0]),
- __wt_timestamp_to_string(tombstone->durable_ts, ts_string[1]));
-
- /*
- * Set the flag to indicate that this update has been restored from history store for
- * the rollback to stable operation.
- */
- F_SET(tombstone, WT_UPDATE_RESTORED_FROM_HS);
-
- tombstone->next = upd;
- upd = tombstone;
- WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_tombstones);
- }
- } else {
- WT_ERR(__wt_upd_alloc_tombstone(session, &upd, NULL));
- WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed);
- __wt_verbose_level_multi(
- session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_3, "%s", "key removed");
- }
-
- if (rip != NULL)
- WT_ERR(__rollback_row_modify(session, ref, upd, key));
- else
- WT_ERR(__rollback_col_modify(session, ref, upd, recno));
-
- /* Finally remove that update from history store. */
- if (valid_update_found) {
- /* Avoid freeing the updates while still in use if hs_cursor->remove fails. */
- upd = tombstone = NULL;
-
- WT_ERR(hs_cursor->remove(hs_cursor));
- WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
- WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
- }
-
- if (0) {
-err:
- WT_ASSERT(session, tombstone == NULL || upd == tombstone);
- __wt_free_update_list(session, &upd);
- }
- __wt_scr_free(session, &full_value);
- __wt_scr_free(session, &hs_key);
- __wt_scr_free(session, &hs_value);
- if (rip == NULL || row_key == NULL)
- __wt_scr_free(session, &key);
- __wt_scr_free(session, &key_string);
- if (hs_cursor != NULL)
- WT_TRET(hs_cursor->close(hs_cursor));
- return (ret);
-}
-
-/*
- * __rollback_abort_ondisk_kv --
- * Fix the on-disk K/V version according to the given timestamp.
- */
-static int
-__rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, uint64_t recno,
- WT_ITEM *row_key, WT_CELL_UNPACK_KV *vpack, wt_timestamp_t rollback_timestamp,
- bool *is_ondisk_stable)
-{
- WT_DECL_ITEM(key);
- WT_DECL_ITEM(key_string);
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- WT_PAGE *page;
- WT_UPDATE *upd;
- uint8_t *memp;
- char time_string[WT_TIME_STRING_SIZE];
- char ts_string[5][WT_TS_INT_STRING_SIZE];
- bool prepared;
-
- page = ref->page;
- upd = NULL;
-
- /* Initialize the on-disk stable version flag. */
- if (is_ondisk_stable != NULL)
- *is_ondisk_stable = false;
-
- prepared = vpack->tw.prepare;
- if (WT_IS_HS(session->dhandle)) {
- /*
- * Abort the history store update with stop durable timestamp greater than the stable
- * timestamp or the updates with max stop timestamp which implies that they are associated
- * with prepared transactions.
- */
- if (vpack->tw.durable_stop_ts > rollback_timestamp || vpack->tw.stop_ts == WT_TS_MAX) {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "history store update aborted with start durable/commit timestamp: %s, %s, stop "
- "durable/commit timestamp: %s, %s and stable timestamp: %s",
- __wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]),
- __wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]),
- __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[2]),
- __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[4]));
- WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL));
- WT_STAT_CONN_DATA_INCR(session, txn_rts_sweep_hs_keys);
- } else
- return (0);
- } else if (vpack->tw.durable_start_ts > rollback_timestamp ||
- !__rollback_txn_visible_id(session, vpack->tw.start_txn) ||
- (!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "on-disk update aborted with time window %s. Start durable timestamp > stable timestamp: "
- "%s, or txnid is not visible: %s, or tw has not stop and is prepared: %s",
- __wt_time_point_to_string(
- vpack->tw.start_ts, vpack->tw.durable_start_ts, vpack->tw.start_txn, time_string),
- vpack->tw.durable_start_ts > rollback_timestamp ? "true" : "false",
- !__rollback_txn_visible_id(session, vpack->tw.start_txn) ? "true" : "false",
- !WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared ? "true" : "false");
- if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
- return (__rollback_ondisk_fixup_key(
- session, ref, rip, recno, row_key, vpack, rollback_timestamp));
- else {
- /*
- * In-memory database don't have a history store to provide a stable update, so remove
- * the key. Note that an in-memory database will have saved old values in the update
- * chain, so we should only get here for a key/value that never existed at all as of the
- * rollback timestamp; thus, deleting it is the correct response.
- */
- WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL));
- WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed);
- }
- } else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) &&
- (vpack->tw.durable_stop_ts > rollback_timestamp ||
- !__rollback_txn_visible_id(session, vpack->tw.stop_txn) || prepared)) {
- /*
- * For prepared transactions, it is possible that both the on-disk key start and stop time
- * windows can be the same. To abort these updates, check for any stable update from history
- * store or remove the key.
- */
- if (vpack->tw.start_ts == vpack->tw.stop_ts &&
- vpack->tw.durable_start_ts == vpack->tw.durable_stop_ts &&
- vpack->tw.start_txn == vpack->tw.stop_txn) {
- WT_ASSERT(session, prepared == true);
- if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
- return (__rollback_ondisk_fixup_key(
- session, ref, rip, recno, row_key, vpack, rollback_timestamp));
- else {
- /*
- * In-memory database don't have a history store to provide a stable update, so
- * remove the key.
- */
- WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL));
- WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed);
- }
- } else {
- /*
- * Clear the remove operation from the key by inserting the original on-disk value as a
- * standard update.
- */
- WT_RET(__wt_scr_alloc(session, 0, &tmp));
- if ((ret = __wt_page_cell_data_ref_kv(session, page, vpack, tmp)) == 0)
- ret = __wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &upd, NULL);
- __wt_scr_free(session, &tmp);
- WT_RET(ret);
-
- /*
- * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because
- * the connections write generation will be initialized after rollback to stable and the
- * updates in the cache will be problematic. The transaction id of pages which are in
- * disk will be automatically reset as part of unpacking cell when loaded to cache.
- */
- if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
- upd->txnid = WT_TXN_NONE;
- else
- upd->txnid = vpack->tw.start_txn;
- upd->durable_ts = vpack->tw.durable_start_ts;
- upd->start_ts = vpack->tw.start_ts;
- F_SET(upd, WT_UPDATE_RESTORED_FROM_DS);
- WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_restored);
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "key restored with commit timestamp: %s, durable timestamp: %s, stable timestamp: "
- "%s, "
- "txnid: %" PRIu64
- " and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64
- ", prepared: %s",
- __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
- __wt_timestamp_to_string(upd->durable_ts, ts_string[1]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), upd->txnid,
- __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]),
- __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[4]), vpack->tw.stop_txn,
- prepared ? "true" : "false");
- }
- } else {
- /* Stable version according to the timestamp. */
- if (is_ondisk_stable != NULL)
- *is_ondisk_stable = true;
- return (0);
- }
-
- if (rip != NULL) {
- if (row_key != NULL)
- key = row_key;
- else {
- /* Unpack a row key. */
- WT_ERR(__wt_scr_alloc(session, 0, &key));
- WT_ERR(__wt_row_leaf_key(session, page, rip, key, false));
- }
- } else {
- /* Manufacture a column key. */
- WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
- memp = key->mem;
- WT_ERR(__wt_vpack_uint(&memp, 0, recno));
- key->size = WT_PTRDIFF(memp, key->data);
- }
-
- WT_ERR(__wt_scr_alloc(session, 0, &key_string));
- __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
- "removing the key%s: %s", upd->type == WT_UPDATE_TOMBSTONE ? "" : " tombstone",
- __wt_key_string(session, key->data, key->size, S2BT(session)->key_format, key_string));
-
- if (rip != NULL)
- WT_ERR(__rollback_row_modify(session, ref, upd, key));
- else
- WT_ERR(__rollback_col_modify(session, ref, upd, recno));
-
- if (0) {
-err:
- __wt_free(session, upd);
- }
- if (rip == NULL || row_key == NULL)
- __wt_scr_free(session, &key);
- __wt_scr_free(session, &key_string);
- return (ret);
-}
-
-/*
- * __rollback_abort_col_var --
- * Abort updates on a variable length col leaf page with timestamps newer than the rollback
- * timestamp.
- *
- * Each on-page cell is handled in turn: its update list is processed first, then (when the page
- * has a disk image) the on-disk value is checked for the keys of the cell's run that have no
- * stable update. The append list is processed last. Returns 0 on success, otherwise the first
- * error reported by a helper.
- */
-static int
-__rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
-{
- WT_CELL *kcell;
- WT_CELL_UNPACK_KV unpack;
- WT_COL *cip;
- WT_INSERT *ins;
- WT_INSERT_HEAD *inshead;
- WT_PAGE *page;
- uint64_t ins_recno, recno, rle;
- uint32_t i, j, stable_updates_count;
- bool is_ondisk_stable;
-
- page = ref->page;
- /*
- * If a disk image exists, start from the provided recno; or else start from 0.
- *
- * From here on recno is the record number of the first key in the cell being examined; when a
- * disk image exists it is advanced by the cell's RLE count after each cell.
- */
- if (page->dsk != NULL)
- recno = page->dsk->recno;
- else
- recno = 0;
-
- /* Review the changes to the original on-page data items. */
- WT_COL_FOREACH (page, cip, i) {
- stable_updates_count = 0; /* Reset per cell; compared against the cell's RLE count below. */
-
- if ((inshead = WT_COL_UPDATE(page, cip)) != NULL)
- WT_RET(__rollback_abort_insert_list(
- session, page, inshead, rollback_timestamp, &stable_updates_count));
-
- if (page->dsk != NULL) {
- /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */
- kcell = WT_COL_PTR(page, cip);
- __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack);
- rle = __wt_cell_rle(&unpack);
-
- /*
- * Each key whose on-disk value is not stable and has no stable update on the update
- * list must be processed downstream.
- *
- * If we can determine that the cell's on-disk value is stable, we can skip iterating
- * over the cell; likewise, if we can determine that every key in the cell has a stable
- * update on the update list, we can skip the iteration. Otherwise we have to try each
- * key.
- *
- * If the on-disk cell is deleted, it is stable, because cells only appear as deleted
- * when there is no older value that might need to be restored.
- *
- * Note that in a purely timestamped world, the presence of any stable update for any
- * key in the cell means the on-disk value must be stable, because the update must be
- * newer than the on-disk value. However, this is no longer true if the stable update
- * has no timestamp. It may also not be true if the on-disk value is prepared, or other
- * corner cases. Therefore, we must iterate the cell unless _every_ key has a stable
- * update.
- *
- * We can, however, stop iterating as soon as the downstream code reports back that the
- * on-disk value is actually stable.
- */
- if (unpack.type == WT_CELL_DEL)
- WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped);
- else if (stable_updates_count == rle)
- WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
- else {
- j = 0; /* Offset, within this cell's run, of the next key still to be handled. */
- if (inshead != NULL) {
- WT_SKIP_FOREACH (ins, inshead) {
- /* If the update list goes past the end of the cell, something's wrong. */
- WT_ASSERT(session, j < rle);
- ins_recno = WT_INSERT_RECNO(ins);
- /*
- * Process all the keys before this update. These keys have no entry on the
- * update list, so only their on-disk value can need fixing.
- */
- while (recno + j < ins_recno) {
- WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL,
- &unpack, rollback_timestamp, &is_ondisk_stable));
- /* We can stop right away if the on-disk version is stable. */
- if (is_ondisk_stable) {
- if (rle > 1)
- WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
- goto stop;
- }
- j++;
- }
- /*
- * If this key has a stable update, skip over it. Otherwise j is left alone
- * and the key is handled by a later on-disk check.
- */
- if (recno + j == ins_recno && __rollback_has_stable_update(ins->upd))
- j++;
- }
- }
- /* Process the rest of the keys. */
- while (j < rle) {
- WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, &unpack,
- rollback_timestamp, &is_ondisk_stable));
- /* We can stop right away if the on-disk version is stable. */
- if (is_ondisk_stable) {
- if (rle > 1)
- WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
- goto stop;
- }
- j++;
- }
- }
-stop:
- recno += rle; /* Move to the first record number of the next cell, iterated or not. */
- }
- }
-
- /*
- * Review the append list. No stable-update count is requested for it.
- */
- if ((inshead = WT_COL_APPEND(page)) != NULL)
- WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL));
-
- return (0);
-}
-
-/*
- * __rollback_abort_col_fix_one --
- * Handle one possibly unstable on-disk time window.
- *
- * The tw argument indexes the page's time window array (pg_fix_tws) and recno_offset is the
- * key's offset from the record number at the start of the page's disk image. The check itself
- * is delegated to __rollback_abort_ondisk_kv, whose return value is passed back unchanged.
- */
-static int
-__rollback_abort_col_fix_one(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t tw,
- uint32_t recno_offset, wt_timestamp_t rollback_timestamp)
-{
- WT_BTREE *btree;
- WT_CELL *cell;
- WT_CELL_UNPACK_KV unpack;
- WT_PAGE *page;
- uint8_t value;
-
- btree = S2BT(session);
- page = ref->page;
-
- /* Unpack the cell to get the time window. */
- cell = WT_COL_FIX_TW_CELL(page, &page->pg_fix_tws[tw]);
- __wt_cell_unpack_kv(session, page->dsk, cell, &unpack);
-
- /*
- * Fake up the value (which is not physically in the cell) in case it's wanted. The value is
- * read from the page's bitfield and stored in a local, so the unpacked cell's data pointer is
- * only good for the call below.
- */
- value = __bit_getv(page->pg_fix_bitf, recno_offset, btree->bitcnt);
- unpack.data = &value;
- unpack.size = 1;
-
- return (__rollback_abort_ondisk_kv(session, ref, NULL, page->dsk->recno + recno_offset, NULL,
- &unpack, rollback_timestamp, NULL)); /* Final NULL: no stability report is requested. */
-}
-
-/*
- * __rollback_abort_col_fix --
- * Abort updates on a fixed length col leaf page with timestamps newer than the rollback
- * timestamp.
- *
- * The page's single update list is processed first, then every on-disk time window whose key
- * has no stable update is handed to __rollback_abort_col_fix_one, and finally the append list
- * is processed. Returns 0 on success, otherwise the first error reported by a helper.
- */
-static int
-__rollback_abort_col_fix(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
-{
- WT_INSERT *ins;
- WT_INSERT_HEAD *inshead;
- WT_PAGE *page;
- uint32_t ins_recno_offset, recno_offset, numtws, tw;
-
- page = ref->page;
-
- /*
- * Review the changes to the original on-page data items. Note that while this can report back
- * to us whether it saw a stable update, that information doesn't do us any good -- unlike in
- * VLCS where the uniformity of cells lets us reason about the timestamps of all of them based
- * on the timestamp of an update to any of them, in FLCS everything is just thrown together, so
- * we'll need to iterate over all the keys anyway.
- */
- if ((inshead = WT_COL_UPDATE_SINGLE(page)) != NULL)
- WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL));
-
- /*
- * Iterate over all the keys, stopping only on keys that (a) have a time window on disk, and
- * also (b) do not have a stable update remaining in the update list. Keys with no on-disk time
- * window are stable. And we must not try to adjust the on-disk value for keys with stable
- * updates, because the downstream code assumes that has already been checked and in some cases
- * (e.g. in-memory databases) the wrong thing will happen.
- *
- * Iterate over the update list and carry along the iteration over the time window list in
- * parallel, even though the code would perhaps make more sense the other way around, because
- * this allows using the skiplist iterator macro instead of an open-coded mess.
- *
- * In the code below tw is the index of the next unprocessed time window; it only moves forward.
- */
- numtws = WT_COL_FIX_TWS_SET(page) ? page->pg_fix_numtws : 0;
- WT_ASSERT(session, numtws == 0 || page->dsk != NULL);
- tw = 0;
- if (inshead != NULL) {
- WT_SKIP_FOREACH (ins, inshead) {
- /*
- * Process all the keys before this update entry. The update's record number is first
- * converted to an offset from ref->ref_recno so it can be compared with the time
- * windows' recno offsets.
- */
- ins_recno_offset = (uint32_t)(WT_INSERT_RECNO(ins) - ref->ref_recno);
- while (tw < numtws &&
- (recno_offset = page->pg_fix_tws[tw].recno_offset) < ins_recno_offset) {
-
- WT_RET(
- __rollback_abort_col_fix_one(session, ref, tw, recno_offset, rollback_timestamp));
- tw++;
- }
- /*
- * If this key has a stable update, skip over it. Otherwise tw is left alone and the
- * time window is processed on a later pass through the loops.
- */
- if (tw < numtws && page->pg_fix_tws[tw].recno_offset == ins_recno_offset &&
- ins->upd != NULL && __rollback_has_stable_update(ins->upd))
- tw++;
- }
- }
- /* Process the rest of the keys with time windows. */
- while (tw < numtws) {
- recno_offset = page->pg_fix_tws[tw].recno_offset;
- WT_RET(__rollback_abort_col_fix_one(session, ref, tw, recno_offset, rollback_timestamp));
- tw++;
- }
-
- /* Review the append list. */
- if ((inshead = WT_COL_APPEND(page)) != NULL)
- WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL));
-
- return (0);
-}
-
-/*
- * __rollback_abort_row_leaf --
- * Abort updates on a row leaf page with timestamps newer than the rollback timestamp.
- *
- * A single scratch buffer is allocated for row keys and released at the err label, which is
- * reached on both the success and the failure paths. Returns 0 on success, otherwise the first
- * error reported by a helper.
- */
-static int
-__rollback_abort_row_leaf(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
-{
- WT_CELL_UNPACK_KV *vpack, _vpack;
- WT_DECL_ITEM(key);
- WT_DECL_RET;
- WT_INSERT_HEAD *insert;
- WT_PAGE *page;
- WT_ROW *rip;
- WT_UPDATE *upd;
- uint32_t i;
- bool have_key, stable_update_found;
-
- page = ref->page;
-
- WT_RET(__wt_scr_alloc(session, 0, &key));
-
- /*
- * Review the insert list for keys before the first entry on the disk page.
- */
- if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
- WT_ERR(__rollback_abort_insert_list(session, page, insert, rollback_timestamp, NULL));
-
- /*
- * Review updates that belong to keys that are on the disk image, as well as for keys inserted
- * since the page was read from disk.
- *
- * The row's key is only built when the row has an update chain; have_key records whether the
- * scratch buffer holds the current row's key so it can be passed on to the on-disk check.
- */
- WT_ROW_FOREACH (page, rip, i) {
- stable_update_found = false;
- if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) {
- WT_ERR(__wt_row_leaf_key(session, page, rip, key, false));
- WT_ERR(
- __rollback_abort_update(session, key, upd, rollback_timestamp, &stable_update_found));
- have_key = true;
- } else
- have_key = false;
-
- if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
- WT_ERR(__rollback_abort_insert_list(session, page, insert, rollback_timestamp, NULL));
-
- /*
- * If there is no stable update found in the update list, abort any on-disk value.
- */
- if (!stable_update_found) {
- vpack = &_vpack;
- __wt_row_leaf_value_cell(session, page, rip, vpack);
- WT_ERR(__rollback_abort_ondisk_kv(
- session, ref, rip, 0, have_key ? key : NULL, vpack, rollback_timestamp, NULL));
- }
- }
-
-err:
- __wt_scr_free(session, &key);
- return (ret);
-}
-
-/*
- * __rollback_get_ref_max_durable_timestamp --
- * Returns the ref aggregated max durable timestamp. The max durable timestamp is calculated
- * between both start and stop durable timestamps except for history store, because most of the
- * history store updates have stop timestamp either greater or equal to the start timestamp
- * except for the updates written for the prepared updates on the data store. To abort the
- * updates with no stop timestamp, we must include the newest stop timestamp also into the
- * calculation of maximum durable timestamp of the history store.
- *
- * Whether the history store rule applies is decided from session->dhandle; the time aggregate
- * passed in is only read.
- */
-static wt_timestamp_t
-__rollback_get_ref_max_durable_timestamp(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta)
-{
- if (WT_IS_HS(session->dhandle))
- return (WT_MAX(ta->newest_stop_durable_ts, ta->newest_stop_ts));
- return (WT_MAX(ta->newest_start_durable_ts, ta->newest_stop_durable_ts));
-}
-
-/*
- * __rollback_page_needs_abort --
- * Check whether the page needs rollback, returning true if the page has modifications newer
- * than the given timestamp.
- *
- * Exactly one source of time information is consulted, chosen by the first matching branch
- * below. If none matches, the result stays false and the verbose message reports the
- * "undefined state" tag. The decision is always logged before returning.
- */
-static bool
-__rollback_page_needs_abort(
- WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
-{
- WT_ADDR *addr;
- WT_CELL_UNPACK_ADDR vpack;
- WT_MULTI *multi;
- WT_PAGE_MODIFY *mod;
- wt_timestamp_t durable_ts;
- uint64_t newest_txn;
- uint32_t i;
- char ts_string[WT_TS_INT_STRING_SIZE];
- const char *tag;
- bool prepared, result;
-
- addr = ref->addr;
- mod = ref->page == NULL ? NULL : ref->page->modify;
- durable_ts = WT_TS_NONE;
- newest_txn = WT_TXN_NONE;
- tag = "undefined state";
- prepared = result = false;
-
- /*
- * The rollback operation should be performed on this page when any one of the following is
- * greater than the given timestamp or during recovery if the newest transaction id on the page
- * is greater than or equal to recovered checkpoint snapshot min:
- * 1. The reconciled replace page max durable timestamp.
- * 2. The reconciled multi page max durable timestamp.
- * 3. For just-instantiated deleted pages that have not otherwise been modified, the durable
- * timestamp in the page delete information. This timestamp isn't reflected in the address's
- * time aggregate.
- * 4. The on page address max durable timestamp.
- * 5. The off page address max durable timestamp.
- *
- * Note that the two reconciled cases (1 and 2) do not consult a transaction id: newest_txn is
- * left at WT_TXN_NONE and only the durable timestamp and prepared state decide the result.
- */
- if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) {
- tag = "reconciled replace block";
- durable_ts = __rollback_get_ref_max_durable_timestamp(session, &mod->mod_replace.ta);
- prepared = mod->mod_replace.ta.prepare;
- result = (durable_ts > rollback_timestamp) || prepared;
- } else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) {
- tag = "reconciled multi block";
- /* Calculate the max durable timestamp by traversing all multi addresses. */
- for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
- durable_ts = WT_MAX(
- durable_ts, __rollback_get_ref_max_durable_timestamp(session, &multi->addr.ta));
- if (multi->addr.ta.prepare)
- prepared = true;
- }
- result = (durable_ts > rollback_timestamp) || prepared;
- } else if (mod != NULL && mod->instantiated && !__wt_page_is_modified(ref->page) &&
- ref->page_del != NULL) {
- tag = "page_del info";
- durable_ts = ref->page_del->durable_timestamp;
- prepared = ref->page_del->prepare_state == WT_PREPARE_INPROGRESS ||
- ref->page_del->prepare_state == WT_PREPARE_LOCKED;
- newest_txn = ref->page_del->txnid;
- result = (durable_ts > rollback_timestamp) || prepared ||
- WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn);
- } else if (!__wt_off_page(ref->home, addr)) {
- tag = "on page cell";
- /*
- * Check if the page is obsolete using the page disk address. The address is a cell in
- * the parent's disk image here, so it is unpacked to obtain its time aggregate.
- */
- __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack);
- durable_ts = __rollback_get_ref_max_durable_timestamp(session, &vpack.ta);
- prepared = vpack.ta.prepare;
- newest_txn = vpack.ta.newest_txn;
- result = (durable_ts > rollback_timestamp) || prepared ||
- WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn);
- } else if (addr != NULL) {
- tag = "address";
- durable_ts = __rollback_get_ref_max_durable_timestamp(session, &addr->ta);
- prepared = addr->ta.prepare;
- newest_txn = addr->ta.newest_txn;
- result = (durable_ts > rollback_timestamp) || prepared ||
- WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn);
- }
-
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "%p: page with %s durable timestamp: %s, newest txn: %" PRIu64
- " and prepared updates: %s needs abort: %s",
- (void *)ref, tag, __wt_timestamp_to_string(durable_ts, ts_string), newest_txn,
- prepared ? "true" : "false", result ? "true" : "false");
-
- return (result);
-}
-
-/*
- * __rollback_abort_updates --
- * Abort updates on this page newer than the timestamp.
- *
- * Clean pages that do not need rollback are skipped. Otherwise the work is dispatched on the
- * leaf page type; an internal page type asserts and then returns the illegal-value error.
- * Returns 0 on success, otherwise the error from the page-specific routine.
- */
-static int
-__rollback_abort_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
-{
- WT_PAGE *page;
- bool modified;
-
- /*
- * If we have a ref with clean page, find out whether the page has any modifications that are
- * newer than the given timestamp. As eviction writes the newest version to page, even a clean
- * page may also contain modifications that need rollback.
- */
- page = ref->page;
- modified = __wt_page_is_modified(page);
- if (!modified && !__rollback_page_needs_abort(session, ref, rollback_timestamp)) {
- __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_3,
- "%p: unmodified stable page skipped", (void *)ref);
- return (0);
- }
-
- WT_STAT_CONN_INCR(session, txn_rts_pages_visited);
- __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2,
- "%p: roll back %s page", (void *)ref, modified ? "modified" : "clean");
-
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- WT_RET(__rollback_abort_col_fix(session, ref, rollback_timestamp));
- break;
- case WT_PAGE_COL_VAR:
- WT_RET(__rollback_abort_col_var(session, ref, rollback_timestamp));
- break;
- case WT_PAGE_ROW_LEAF:
- WT_RET(__rollback_abort_row_leaf(session, ref, rollback_timestamp));
- break;
- case WT_PAGE_COL_INT:
- case WT_PAGE_ROW_INT:
- /* This function is not called for internal pages. */
- WT_ASSERT(session, false);
- /* Fall through. */
- default:
- WT_RET(__wt_illegal_value(session, page->type));
- }
-
- /*
- * Mark the page as dirty to reconcile the page. This is only done when the page already has a
- * modify structure.
- */
- if (page->modify)
- __wt_page_modify_set(session, page);
- return (0);
-}
-
-/*
- * __rollback_to_stable_page_skip --
- * Skip if rollback to stable doesn't require reading this page.
- *
- * This is the skip callback handed to the tree walk. The context argument points to the
- * rollback timestamp, visible_all is unused, and the decision is returned through skipp. The
- * function itself always returns 0.
- */
-static int
-__rollback_to_stable_page_skip(
- WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool visible_all, bool *skipp)
-{
- WT_PAGE_DELETED *page_del;
- wt_timestamp_t rollback_timestamp;
- char time_string[WT_TIME_STRING_SIZE];
-
- rollback_timestamp = *(wt_timestamp_t *)context;
- WT_UNUSED(visible_all);
-
- *skipp = false; /* Default to reading */
-
- /*
- * Skip pages truncated at or before the RTS timestamp. (We could read the page, but that would
- * unnecessarily instantiate it). If the page has no fast-delete information, that means either
- * it was discarded because the delete is globally visible, or the internal page holding the
- * cell was an old format page so none was loaded. In the latter case we should skip the page as
- * there's no way to get correct behavior and skipping matches the historic behavior. Note that
- * eviction is running; we must lock the WT_REF before examining the fast-delete information.
- */
- if (ref->state == WT_REF_DELETED &&
- WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) {
- page_del = ref->page_del;
- if (page_del == NULL ||
- (__rollback_txn_visible_id(session, page_del->txnid) &&
- page_del->durable_timestamp <= rollback_timestamp)) {
- /*
- * We should never see a prepared truncate here; not at recovery time because prepared
- * truncates can't be written to disk, and not during a runtime RTS either because it
- * should not be possible to do that with an unresolved prepared transaction.
- */
- WT_ASSERT(session,
- page_del == NULL || page_del->prepare_state == WT_PREPARE_INIT ||
- page_del->prepare_state == WT_PREPARE_RESOLVED);
-
- if (page_del == NULL)
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "%p: deleted page walk skipped", (void *)ref);
- else {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "%p: deleted page walk skipped page_del %s", (void *)ref,
- __wt_time_point_to_string(page_del->timestamp, page_del->durable_timestamp,
- page_del->txnid, time_string));
- }
- WT_STAT_CONN_INCR(session, txn_rts_tree_walk_skip_pages);
- *skipp = true;
- }
- WT_REF_SET_STATE(ref, WT_REF_DELETED); /* Unlock: put back the state swapped out above. */
- return (0);
- }
-
- /* Otherwise, if the page state is other than on disk, we want to look at it. */
- if (ref->state != WT_REF_DISK)
- return (0);
-
- /*
- * Check whether this on-disk page has any updates to be aborted. We are not holding a hazard
- * reference on the page and so we rely on there being no other threads of control in the tree,
- * that is, eviction ignores WT_REF_DISK pages and no other thread is reading pages, this page
- * cannot change state from on-disk to something else.
- */
- if (!__rollback_page_needs_abort(session, ref, rollback_timestamp)) {
- *skipp = true;
- __wt_verbose_multi(
- session, WT_VERB_RECOVERY_RTS(session), "%p: stable page walk skipped", (void *)ref);
- WT_STAT_CONN_INCR(session, txn_rts_tree_walk_skip_pages);
- }
-
- return (0);
-}
-
-/*
- * __rollback_to_stable_btree_walk --
- * Called for each open handle - choose to either skip or wipe the commits
- *
- * Walks the tree with __rollback_to_stable_page_skip as the skip callback and aborts updates
- * on every leaf page the walk returns. Returns the walk's final status, or the error from
- * aborting updates on a page.
- */
-static int
-__rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
-{
- WT_DECL_RET;
- WT_REF *ref;
-
- /*
- * Walk the tree, marking commits aborted where appropriate. The rollback timestamp is passed
- * by address as the skip callback's context. The loop ends when the walk fails or hands back a
- * NULL ref.
- */
- ref = NULL;
- while (
- (ret = __wt_tree_walk_custom_skip(session, &ref, __rollback_to_stable_page_skip,
- &rollback_timestamp, WT_READ_NO_EVICT | WT_READ_VISIBLE_ALL | WT_READ_WONT_NEED)) == 0 &&
- ref != NULL)
- if (F_ISSET(ref, WT_REF_FLAG_LEAF))
- WT_RET(__rollback_abort_updates(session, ref, rollback_timestamp));
-
- return (ret);
-}
-
-/*
- * __rollback_to_stable_btree --
- * Called for each object handle - choose to either skip or wipe the commits
- *
- * Operates on the btree currently attached to the session. Logged trees, checkpoint handles
- * and empty trees return 0 without being walked; everything else is passed to
- * __rollback_to_stable_btree_walk.
- */
-static int
-__rollback_to_stable_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
-{
- WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
-
- btree = S2BT(session);
- conn = S2C(session);
-
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "rollback to stable connection logging enabled: %s and btree logging enabled: %s",
- FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ? "true" : "false",
- F_ISSET(btree, WT_BTREE_LOGGED) ? "true" : "false");
-
- /* Files with commit-level durability (without timestamps), don't get their commits wiped. */
- if (F_ISSET(btree, WT_BTREE_LOGGED))
- return (0);
-
- /* There is never anything to do for checkpoint handles. */
- if (WT_READING_CHECKPOINT(session))
- return (0);
-
- /* There is nothing to do on an empty tree. */
- if (btree->root.page == NULL)
- return (0);
-
- return (__rollback_to_stable_btree_walk(session, rollback_timestamp));
-}
-
-/*
- * __rollback_to_stable_check --
- * Check to the extent possible that the rollback request is reasonable.
- *
- * Scans the connection's session array for a user session with a running transaction or an
- * open cursor. Returns EBUSY (with an error message) if one is found, 0 otherwise. The scan
- * stops at the first offending session, so only one of the two conditions is reported.
- */
-static int
-__rollback_to_stable_check(WT_SESSION_IMPL *session)
-{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_SESSION_IMPL *session_in_list;
- uint32_t i, session_cnt;
- bool cursor_active, txn_active;
-
- conn = S2C(session);
- cursor_active = txn_active = false;
-
- WT_STAT_CONN_INCR(session, txn_walk_sessions);
-
- /*
- * Help the user comply with the requirement there be no concurrent user operations. It is okay
- * to have a transaction in the prepared state.
- *
- * WT_TXN structures are allocated and freed as sessions are activated and closed. Lock the
- * session open/close to ensure we don't race. This call is a rarely used RTS-only function,
- * acquiring the lock shouldn't be an issue.
- */
- __wt_spin_lock(session, &conn->api_lock);
-
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, session_in_list = conn->sessions; i < session_cnt; i++, session_in_list++) {
-
- /* Skip inactive or internal sessions. */
- if (!session_in_list->active || F_ISSET(session_in_list, WT_SESSION_INTERNAL))
- continue;
-
- /* Check if a user session has a running transaction. */
- if (F_ISSET(session_in_list->txn, WT_TXN_RUNNING)) {
- txn_active = true;
- break;
- }
-
- /* Check if a user session has an active file cursor. */
- if (session_in_list->ncursors != 0) {
- cursor_active = true;
- break;
- }
- }
- __wt_spin_unlock(session, &conn->api_lock);
-
- /*
- * A new cursor may be positioned or a transaction may start after we return from this call and
- * callers should be aware of this limitation.
- *
- * The errors are raised here, after the lock taken above has been released.
- */
- if (cursor_active)
- WT_RET_MSG(session, EBUSY, "rollback_to_stable illegal with active file cursors");
- if (txn_active) {
- ret = EBUSY;
- WT_TRET(__wt_verbose_dump_txn(session));
- WT_RET_MSG(session, ret, "rollback_to_stable illegal with active transactions");
- }
- return (0);
-}
-
-/*
- * __rollback_to_stable_btree_hs_truncate --
- * Wipe all history store updates for the btree (non-timestamped tables)
- *
- * Positions one history store cursor on the first record for btree_id and a second on the last
- * record for btree_id, then truncates the range between them. If the history store has no
- * record at or after btree_id the function returns 0 without truncating. Both cursors and the
- * scratch key are released on every path.
- */
-static int
-__rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id)
-{
- WT_CURSOR *hs_cursor_start, *hs_cursor_stop;
- WT_DECL_ITEM(hs_key);
- WT_DECL_RET;
- WT_SESSION *truncate_session;
- wt_timestamp_t hs_start_ts;
- uint64_t hs_counter;
- uint32_t hs_btree_id;
-
- hs_cursor_start = hs_cursor_stop = NULL;
- hs_btree_id = 0;
- truncate_session = (WT_SESSION *)session;
-
- WT_RET(__wt_scr_alloc(session, 0, &hs_key));
-
- /* Open a history store start cursor. */
- WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor_start));
- F_SET(hs_cursor_start, WT_CURSTD_HS_READ_COMMITTED);
-
- hs_cursor_start->set_key(hs_cursor_start, 1, btree_id);
- WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor_start), true);
- if (ret == WT_NOTFOUND) {
- ret = 0;
- goto done;
- }
-
- /*
- * Open a history store stop cursor. It is allowed to read across btrees because it is first
- * positioned using the next btree id and then stepped backwards.
- */
- WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor_stop));
- F_SET(hs_cursor_stop, WT_CURSTD_HS_READ_COMMITTED | WT_CURSTD_HS_READ_ACROSS_BTREE);
-
- hs_cursor_stop->set_key(hs_cursor_stop, 1, btree_id + 1);
- WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor_stop), true);
-
-#ifdef HAVE_DIAGNOSTIC
- /* If we get not found, we are at the largest btree id in the history store. */
- if (ret == 0) {
- hs_cursor_stop->get_key(hs_cursor_stop, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter);
- WT_ASSERT(session, hs_btree_id > btree_id);
- }
-#endif
-
- do {
- WT_ASSERT(session, ret == WT_NOTFOUND || hs_btree_id > btree_id);
-
- WT_ERR_NOTFOUND_OK(hs_cursor_stop->prev(hs_cursor_stop), true);
- /* We can find the start point then we must be able to find the stop point. */
- if (ret == WT_NOTFOUND)
- WT_ERR_PANIC(
- session, ret, "cannot locate the stop point to truncate the history store.");
- hs_cursor_stop->get_key(hs_cursor_stop, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter);
- } while (hs_btree_id != btree_id); /* Step back until a record for this btree is reached. */
-
- WT_ERR(
- truncate_session->truncate(truncate_session, NULL, hs_cursor_start, hs_cursor_stop, NULL));
-
- WT_STAT_CONN_DATA_INCR(session, cache_hs_btree_truncate);
-
- __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS,
- "Rollback to stable has truncated records for btree %u from the history store", btree_id);
-
-done:
-err:
- __wt_scr_free(session, &hs_key);
- if (hs_cursor_start != NULL)
- WT_TRET(hs_cursor_start->close(hs_cursor_start));
- if (hs_cursor_stop != NULL)
- WT_TRET(hs_cursor_stop->close(hs_cursor_stop));
-
- return (ret);
-}
-
-/*
- * __rollback_to_stable_hs_final_pass --
- * Perform rollback to stable on the history store to remove any entries newer than the stable
- * timestamp.
- *
- * The history store's checkpoint metadata is read to find its newest timestamps, the history
- * store handle is acquired, and the tree is rolled back if those timestamps are newer than the
- * rollback timestamp. For a partial backup restore, history store records of the removed
- * btrees are truncated as well. The handle and the metadata string are released on every path.
- */
-static int
-__rollback_to_stable_hs_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
-{
- WT_CONFIG ckptconf;
- WT_CONFIG_ITEM cval, durableval, key;
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- wt_timestamp_t max_durable_ts, newest_stop_durable_ts, newest_stop_ts;
- size_t i;
- char *config;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
-
- config = NULL;
- conn = S2C(session);
-
- WT_RET(__wt_metadata_search(session, WT_HS_URI, &config));
-
- /*
- * Find out the max durable timestamp of the history store from checkpoint. Most of the history
- * store updates have stop timestamp either greater or equal to the start timestamp except for
- * the updates written for the prepared updates on the data store. To abort the updates with no
- * stop timestamp, we must include the newest stop timestamp also into the calculation of
- * maximum timestamp of the history store.
- *
- * The maximum is taken across every entry of the "checkpoint" configuration value; an entry
- * that lacks one of the two fields is tolerated.
- */
- newest_stop_durable_ts = newest_stop_ts = WT_TS_NONE;
- WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval));
- __wt_config_subinit(session, &ckptconf, &cval);
- for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) {
- ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &durableval);
- if (ret == 0)
- newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)durableval.val);
- WT_ERR_NOTFOUND_OK(ret, false);
- ret = __wt_config_subgets(session, &cval, "newest_stop_ts", &durableval);
- if (ret == 0)
- newest_stop_ts = WT_MAX(newest_stop_ts, (wt_timestamp_t)durableval.val);
- WT_ERR_NOTFOUND_OK(ret, false);
- }
- max_durable_ts = WT_MAX(newest_stop_ts, newest_stop_durable_ts);
- WT_ERR(__wt_session_get_dhandle(session, WT_HS_URI, NULL, NULL, 0));
-
- /*
- * The rollback operation should be performed on the history store file when the checkpoint
- * durable start/stop timestamp is greater than the rollback timestamp. But skip if there is no
- * stable timestamp.
- *
- * Note that the corresponding code in __rollback_to_stable_btree_apply also checks whether
- * there _are_ timestamped updates by checking max_durable_ts; that check is redundant here for
- * several reasons, the most immediate being that max_durable_ts cannot be none (zero) because
- * it's greater than rollback_timestamp, which is itself greater than zero.
- *
- * The rollback's result is folded into ret with WT_TRET rather than jumping to the err label,
- * so execution continues to the truncation step below.
- */
- if (max_durable_ts > rollback_timestamp && rollback_timestamp != WT_TS_NONE) {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "tree rolled back with durable timestamp: %s",
- __wt_timestamp_to_string(max_durable_ts, ts_string[0]));
- WT_TRET(__rollback_to_stable_btree(session, rollback_timestamp));
- } else
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "tree skipped with durable timestamp: %s and stable timestamp: %s",
- __wt_timestamp_to_string(max_durable_ts, ts_string[0]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
-
- /*
- * Truncate history store entries from the partial backup remove list. The list holds all of the
- * btree ids that do not exist as part of the database anymore due to performing a selective
- * restore from backup.
- *
- * The list is walked until an id of zero is found, which ends the loop.
- */
- if (F_ISSET(conn, WT_CONN_BACKUP_PARTIAL_RESTORE) && conn->partial_backup_remove_ids != NULL)
- for (i = 0; conn->partial_backup_remove_ids[i] != 0; ++i)
- WT_ERR(
- __rollback_to_stable_btree_hs_truncate(session, conn->partial_backup_remove_ids[i]));
-err:
- if (session->dhandle != NULL)
- WT_TRET(__wt_session_release_dhandle(session));
- __wt_free(session, config);
- return (ret);
-}
-
-/*
- * __rollback_progress_msg --
- * Log a verbose message about the progress of the current rollback to stable.
- *
- * rollback_start is the time the rollback began and rollback_count is the number of files
- * inspected so far. rollback_msg_count holds the number of messages already printed and is
- * incremented each time a new one is emitted, which limits output to one message per
- * WT_PROGRESS_MSG_PERIOD seconds of elapsed time.
- */
-static void
-__rollback_progress_msg(WT_SESSION_IMPL *session, struct timespec rollback_start,
- uint64_t rollback_count, uint64_t *rollback_msg_count)
-{
- struct timespec cur_time;
- uint64_t time_diff;
-
- __wt_epoch(session, &cur_time);
-
- /* Time since the rollback started. */
- time_diff = WT_TIMEDIFF_SEC(cur_time, rollback_start);
-
- if ((time_diff / WT_PROGRESS_MSG_PERIOD) > *rollback_msg_count) {
- __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS,
- "Rollback to stable has been running for %" PRIu64 " seconds and has inspected %" PRIu64
- " files. For more detailed logging, enable WT_VERB_RTS",
- time_diff, rollback_count);
- ++(*rollback_msg_count);
- }
-}
-
-/*
- * __rollback_to_stable_check_btree_modified --
- * Check that the rollback to stable btree is modified or not.
- *
- * Looks up the data handle for uri and reports the btree's modified flag through the modified
- * argument. If the lookup fails, modified is set to false and the lookup's error is returned.
- */
-static int
-__rollback_to_stable_check_btree_modified(WT_SESSION_IMPL *session, const char *uri, bool *modified)
-{
- WT_DECL_RET;
-
- ret = __wt_conn_dhandle_find(session, uri, NULL);
- *modified = ret == 0 && S2BT(session)->modified; /* The btree is only read when the lookup succeeded. */
- return (ret);
-}
-
-/*
- * __rollback_to_stable_btree_apply --
- * Perform rollback to stable on a single file.
- */
-static int
-__rollback_to_stable_btree_apply(
- WT_SESSION_IMPL *session, const char *uri, const char *config, wt_timestamp_t rollback_timestamp)
-{
- WT_CONFIG ckptconf;
- WT_CONFIG_ITEM cval, value, key;
- WT_DECL_RET;
- wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts;
- size_t addr_size;
- uint64_t rollback_txnid, write_gen;
- uint32_t btree_id;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
- bool dhandle_allocated, durable_ts_found, has_txn_updates_gt_than_ckpt_snap, modified;
- bool prepared_updates;
-
- /* Ignore non-btree objects as well as the metadata and history store files. */
- if (!WT_BTREE_PREFIX(uri) || strcmp(uri, WT_HS_URI) == 0 || strcmp(uri, WT_METAFILE_URI) == 0)
- return (0);
-
- addr_size = 0;
- rollback_txnid = 0;
- write_gen = 0;
- dhandle_allocated = false;
-
- /* Find out the max durable timestamp of the object from checkpoint. */
- newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE;
- durable_ts_found = prepared_updates = has_txn_updates_gt_than_ckpt_snap = false;
-
- WT_RET(__wt_config_getones(session, config, "checkpoint", &cval));
- __wt_config_subinit(session, &ckptconf, &cval);
- for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) {
- ret = __wt_config_subgets(session, &cval, "newest_start_durable_ts", &value);
- if (ret == 0) {
- newest_start_durable_ts = WT_MAX(newest_start_durable_ts, (wt_timestamp_t)value.val);
- durable_ts_found = true;
- }
- WT_RET_NOTFOUND_OK(ret);
- ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &value);
- if (ret == 0) {
- newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)value.val);
- durable_ts_found = true;
- }
- WT_RET_NOTFOUND_OK(ret);
- ret = __wt_config_subgets(session, &cval, "prepare", &value);
- if (ret == 0) {
- if (value.val)
- prepared_updates = true;
- }
- WT_RET_NOTFOUND_OK(ret);
- ret = __wt_config_subgets(session, &cval, "newest_txn", &value);
- if (value.len != 0)
- rollback_txnid = (uint64_t)value.val;
- WT_RET_NOTFOUND_OK(ret);
- ret = __wt_config_subgets(session, &cval, "addr", &value);
- if (ret == 0)
- addr_size = value.len;
- WT_RET_NOTFOUND_OK(ret);
- ret = __wt_config_subgets(session, &cval, "write_gen", &value);
- if (ret == 0)
- write_gen = (uint64_t)value.val;
- WT_RET_NOTFOUND_OK(ret);
- }
- max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts);
-
- /*
- * Perform rollback to stable when the newest written transaction of the btree is greater than
- * or equal to the checkpoint snapshot. The snapshot comparison is valid only when the btree
- * write generation number is greater than the last checkpoint connection base write generation
- * to confirm that the btree is modified in the previous restart cycle.
- */
- if (WT_CHECK_RECOVERY_FLAG_TXNID(session, rollback_txnid) &&
- (write_gen >= S2C(session)->last_ckpt_base_write_gen)) {
- has_txn_updates_gt_than_ckpt_snap = true;
- /* Increment the inconsistent checkpoint stats counter. */
- WT_STAT_CONN_DATA_INCR(session, txn_rts_inconsistent_ckpt);
- }
-
- /*
- * The rollback to stable will skip the tables during recovery and shutdown in the following
- * conditions.
- * 1. Empty table.
- * 2. Table has timestamped updates without a stable timestamp.
- */
- if ((F_ISSET(S2C(session), WT_CONN_RECOVERING) ||
- F_ISSET(S2C(session), WT_CONN_CLOSING_CHECKPOINT)) &&
- (addr_size == 0 || (rollback_timestamp == WT_TS_NONE && max_durable_ts != WT_TS_NONE))) {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "skip rollback to stable on file %s because %s", uri,
- addr_size == 0 ? "its checkpoint address length is 0" :
- "it has timestamped updates and the stable timestamp is 0");
- return (0);
- }
-
- /*
- * The rollback operation should be performed on this file based on the following:
- * 1. The dhandle is present in the cache and tree is modified.
- * 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp.
- * 3. The checkpoint has prepared updates written to disk.
- * 4. There is no durable timestamp in any checkpoint.
- * 5. The checkpoint newest txn is greater than snapshot min txn id.
- */
- WT_WITHOUT_DHANDLE(session,
- WT_WITH_HANDLE_LIST_READ_LOCK(
- session, (ret = __rollback_to_stable_check_btree_modified(session, uri, &modified))));
-
- WT_ERR_NOTFOUND_OK(ret, false);
-
- if (modified || max_durable_ts > rollback_timestamp || prepared_updates || !durable_ts_found ||
- has_txn_updates_gt_than_ckpt_snap) {
- /*
- * Open a handle; we're potentially opening a lot of handles and there's no reason to cache
- * all of them for future unknown use, discard on close.
- */
- ret = __wt_session_get_dhandle(session, uri, NULL, NULL, WT_DHANDLE_DISCARD);
- if (ret != 0)
- WT_ERR_MSG(session, ret, "%s: unable to open handle%s", uri,
- ret == EBUSY ? ", error indicates handle is unavailable due to concurrent use" : "");
- dhandle_allocated = true;
-
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "tree rolled back because it is modified: %s, or its durable timestamp (%s) > stable "
- "timestamp (%s): "
- "%s, or it has prepared updates: %s, or durable "
- "timestamp is not found: %s, or txnid (%" PRIu64
- ") > recovery checkpoint snap min (%" PRIu64 "): %s",
- S2BT(session)->modified ? "true" : "false",
- __wt_timestamp_to_string(max_durable_ts, ts_string[0]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[1]),
- max_durable_ts > rollback_timestamp ? "true" : "false",
- prepared_updates ? "true" : "false", !durable_ts_found ? "true" : "false", rollback_txnid,
- S2C(session)->recovery_ckpt_snap_min,
- has_txn_updates_gt_than_ckpt_snap ? "true" : "false");
-
- WT_ERR(__rollback_to_stable_btree(session, rollback_timestamp));
- } else
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "%s: tree skipped with durable timestamp: %s and stable timestamp: %s or txnid: %" PRIu64,
- uri, __wt_timestamp_to_string(max_durable_ts, ts_string[0]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), rollback_txnid);
-
- /*
- * Truncate history store entries for the non-timestamped table.
- * Exceptions:
- * 1. Modified tree - Scenarios where the tree is never checkpointed lead to zero
- * durable timestamp even they are timestamped tables. Until we have a special
- * indication of letting to know the table type other than checking checkpointed durable
- * timestamp to WT_TS_NONE, we need this exception.
- * 2. In-memory database - In this scenario, there is no history store to truncate.
- */
- if ((!dhandle_allocated || !S2BT(session)->modified) && max_durable_ts == WT_TS_NONE &&
- !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
- WT_ERR(__wt_config_getones(session, config, "id", &cval));
- btree_id = (uint32_t)cval.val;
- WT_ERR(__rollback_to_stable_btree_hs_truncate(session, btree_id));
- }
-
-err:
- if (dhandle_allocated)
- WT_TRET(__wt_session_release_dhandle(session));
- return (ret);
-}
-
-/*
- * __wt_rollback_to_stable_one --
- * Perform rollback to stable on a single object.
- */
-int
-__wt_rollback_to_stable_one(WT_SESSION_IMPL *session, const char *uri, bool *skipp)
-{
- WT_DECL_RET;
- wt_timestamp_t rollback_timestamp;
- char *config;
-
- /*
- * This is confusing: the caller's boolean argument "skip" stops the schema-worker loop from
- * processing this object and any underlying objects it may have (for example, a table with
- * multiple underlying file objects). We rollback-to-stable all of the file objects an object
- * may contain, so set the caller's skip argument to true on all file objects, else set the
- * caller's skip argument to false so our caller continues down the tree of objects.
- */
- *skipp = WT_BTREE_PREFIX(uri);
- if (!*skipp)
- return (0);
-
- WT_RET(__wt_metadata_search(session, uri, &config));
-
- /* Read the stable timestamp once, when we first start up. */
- WT_ORDERED_READ(rollback_timestamp, S2C(session)->txn_global.stable_timestamp);
-
- F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
- ret = __rollback_to_stable_btree_apply(session, uri, config, rollback_timestamp);
- F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
-
- __wt_free(session, config);
-
- return (ret);
-}
-
-/*
- * __rollback_to_stable_btree_apply_all --
- * Perform rollback to stable to all files listed in the metadata, apart from the metadata and
- * history store files.
- */
-static int
-__rollback_to_stable_btree_apply_all(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
-{
- struct timespec rollback_timer;
- WT_CURSOR *cursor;
- WT_DECL_RET;
- uint64_t rollback_count, rollback_msg_count;
- const char *config, *uri;
-
- /* Initialize the verbose tracking timer. */
- __wt_epoch(session, &rollback_timer);
- rollback_count = 0;
- rollback_msg_count = 0;
-
- WT_RET(__wt_metadata_cursor(session, &cursor));
- while ((ret = cursor->next(cursor)) == 0) {
- /* Log a progress message. */
- __rollback_progress_msg(session, rollback_timer, rollback_count, &rollback_msg_count);
- ++rollback_count;
-
- WT_ERR(cursor->get_key(cursor, &uri));
- WT_ERR(cursor->get_value(cursor, &config));
-
- F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
- ret = __rollback_to_stable_btree_apply(session, uri, config, rollback_timestamp);
- F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
-
- /*
- * Ignore rollback to stable failures on files that don't exist or files where corruption is
- * detected.
- */
- if (ret == ENOENT || (ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION))) {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "%s: skipped performing rollback to stable because the file %s", uri,
- ret == ENOENT ? "does not exist" : "is corrupted.");
- continue;
- }
- WT_ERR(ret);
- }
- WT_ERR_NOTFOUND_OK(ret, false);
-
- if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
- WT_ERR(__rollback_to_stable_hs_final_pass(session, rollback_timestamp));
-
-err:
- WT_TRET(__wt_metadata_cursor_release(session, &cursor));
- return (ret);
-}
-
-/*
- * __rollback_to_stable --
- * Rollback all modifications with timestamps more recent than the passed in timestamp.
- */
-static int
-__rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
-{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t rollback_timestamp;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
-
- /*
- * Rollback to stable should ignore tombstones in the history store since it needs to scan the
- * entire table sequentially.
- */
- F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE);
-
- WT_ERR(__rollback_to_stable_check(session));
-
- /*
- * Update the global time window state to have consistent view from global visibility rules for
- * the rollback to stable to bring back the database into a consistent state.
- *
- * As part of the below function call, the oldest transaction id and pinned timestamps are
- * updated.
- */
- WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
-
- WT_ASSERT_ALWAYS(session,
- (txn_global->has_pinned_timestamp || !txn_global->has_oldest_timestamp),
- "Database has no pinned timestamp but an oldest timestamp. Pinned timestamp is required to "
- "find out the global visibility/obsolete of an update.");
-
- /*
- * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
- * though the stable timestamp isn't supposed to be updated while rolling back, accessing it
- * without a lock would violate protocol.
- */
- WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp);
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "performing rollback to stable with stable timestamp: %s and oldest timestamp: %s",
- __wt_timestamp_to_string(rollback_timestamp, ts_string[0]),
- __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string[1]));
-
- if (F_ISSET(conn, WT_CONN_RECOVERING))
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "recovered checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64
- ", snapshot count: %" PRIu32,
- conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max,
- conn->recovery_ckpt_snapshot_count);
-
- WT_ERR(__rollback_to_stable_btree_apply_all(session, rollback_timestamp));
-
- /* Rollback the global durable timestamp to the stable timestamp. */
- txn_global->has_durable_timestamp = txn_global->has_stable_timestamp;
- txn_global->durable_timestamp = txn_global->stable_timestamp;
-
- /*
- * If the configuration is not in-memory, forcibly log a checkpoint after rollback to stable to
- * ensure that both in-memory and on-disk versions are the same unless caller requested for no
- * checkpoint.
- */
- if (!F_ISSET(conn, WT_CONN_IN_MEMORY) && !no_ckpt)
- WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
-
-err:
- F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE);
- return (ret);
-}
-
-/*
- * __wt_rollback_to_stable --
- * Rollback the database to the stable timestamp.
- */
-int
-__wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt)
-{
- WT_DECL_RET;
-
- WT_UNUSED(cfg);
-
- /*
- * Don't use the connection's default session: we are working on data handles and (a) don't want
- * to cache all of them forever, plus (b) can't guarantee that no other method will be called
- * concurrently. Copy parent session no logging option to the internal session to make sure that
- * rollback to stable doesn't generate log records.
- */
- WT_RET(
- __wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, 0, 0, &session));
-
- WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 1);
- WT_WITH_CHECKPOINT_LOCK(
- session, WT_WITH_SCHEMA_LOCK(session, ret = __rollback_to_stable(session, no_ckpt)));
- WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 0);
-
- WT_TRET(__wt_session_close_internal(session));
-
- return (ret);
-}