Import wiredtiger: 332dddfe0e48eb1c263455d3db9219ec5f7cdc30 from branch mongodb-4.4r4.4.4-rc0

ref: a52cd5a47a..332dddfe0e for: 4.4.4 WT-6430 Move WT_CONN_SERVER flags into their own field WT-6504 Don't fallback to onpage value as base value if we see the onpage value in the history store WT-6567 Write "rollback to stable" subpage for Architecture Guide WT-6772 Add support for prepared updates in datastore for test_hs09 WT-6901 Write "cursor" subpage for Architecture Guide WT-7069 Enable column store configuration to history store WT-7089 Don't skip checkpointing objects that have obsolete pages WT-7091 Restrict usage of LSM to only operate in conjunction with compatible incremental backup mechanism WT-7117 RTS to skip modifies that are more than on-disk base update while restoring an update WT-7121 Include log-structured allocation python tests in WT WT-7126 Coverity analysis defect 116991: Explicit null dereferenced WT-7127 Coverity analysis defect 116992: Unchecked return value WT-7128 Coverity analysis defect 116993: Resource leak WT-7131 Tiered cursors should return error if configured with zero tiers
author: Luke Chen <luke.chen@mongodb.com> 2021-02-03 13:58:53 +1100
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-02-03 03:20:38 +0000
commit: 7aa1b65641938719accd595bda3e45e97dc5f475 (patch)
tree: 06442d7ad52cf475dca61aa2ca96ed908f450388
parent: ea687c2a4bcd9937d02658043b5a2529943ef950 (diff)
download: mongo-r4.4.4-rc0.tar.gz
48 files changed, 1655 insertions, 230 deletions
diff --git a/src/third_party/wiredtiger/dist/docs_data.py b/src/third_party/wiredtiger/dist/docs_data.py
index a1301c87057..9f2bb32486c 100644
--- a/src/third_party/wiredtiger/dist/docs_data.py
+++ b/src/third_party/wiredtiger/dist/docs_data.py
@@ -65,6 +65,9 @@ arch_doc_pages = [
     ArchDocPage('arch-row',
         ['WT_BTREE'],
         ['src/include/btree.h']),
+    ArchDocPage('arch-rts',
+        [''],
+        ['src/txn/']),
     ArchDocPage('arch-schema',
         ['WT_COLGROUP', 'WT_INDEX', 'WT_LSM_TREE', 'WT_TABLE'],
         ['src/include/intpack_inline.h', 'src/include/packing_inline.h',
diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void
index 5beea1cddc1..fca1ccc9810 100755
--- a/src/third_party/wiredtiger/dist/s_void
+++ b/src/third_party/wiredtiger/dist/s_void
@@ -78,6 +78,7 @@ func_ok()
 	    -e '/int __wt_stat_dsrc_desc$/d' \
 	    -e '/int __wt_stat_join_desc$/d' \
 	    -e '/int __wt_stat_session_desc/d' \
+	    -e '/int __wt_txn_read_upd_list$/d' \
 	    -e '/int __wt_txn_rollback_required$/d' \
 	    -e '/int __wt_win_directory_list_free$/d' \
 	    -e '/int bdb_compare_reverse$/d' \
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 997337c7896..3ccc3e0b57e 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -856,8 +856,10 @@ conn_dsrc_stats = [
     ##########################################
     # Transaction statistics
     ##########################################
+    TxnStat('txn_checkpoint_obsolete_applied', 'transaction checkpoints due to obsolete pages'),
     TxnStat('txn_read_race_prepare_update', 'race to read prepared update retry'),
     TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'),
+    TxnStat('txn_rts_hs_restore_updates', 'rollback to stable restored updates from history store'),
     TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'),
     TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'),
     TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'),
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index eb0dde936e8..3bbabe4e470 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
     "vendor": "wiredtiger",
     "github": "wiredtiger/wiredtiger.git",
     "branch": "mongodb-4.4",
-    "commit": "a52cd5a47a7e9af9e2c341e66f0ffdd9bc977930"
+    "commit": "332dddfe0e48eb1c263455d3db9219ec5f7cdc30"
 }
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 05394d0ae98..63d187b7442 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -57,7 +57,7 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
         cbt->iface.value.data = &cbt->v;
     } else {
 restart_read:
-        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
         if (cbt->upd_value->type == WT_UPDATE_INVALID) {
             cbt->v = 0;
             cbt->iface.value.data = &cbt->v;
@@ -157,7 +157,7 @@ new_page:
 
         __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
 restart_read:
-        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
 
         if (cbt->upd_value->type == WT_UPDATE_INVALID) {
             ++*skippedp;
@@ -232,7 +232,7 @@ restart_read:
         cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
         __wt_upd_value_clear(cbt->upd_value);
         if (cbt->ins != NULL)
-            WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+            WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
         if (cbt->upd_value->type != WT_UPDATE_INVALID) {
             if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
                 if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE &&
@@ -365,7 +365,7 @@ restart_read_insert:
         if ((ins = cbt->ins) != NULL) {
             key->data = WT_INSERT_KEY(ins);
             key->size = WT_INSERT_KEY_SIZE(ins);
-            WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd, NULL));
+            WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd));
             if (cbt->upd_value->type == WT_UPDATE_INVALID) {
                 ++*skippedp;
                 continue;
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index bb2c3a9e05c..7fe24232317 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -197,7 +197,7 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
         cbt->iface.value.data = &cbt->v;
     } else {
 restart_read:
-        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
         if (cbt->upd_value->type == WT_UPDATE_INVALID) {
             cbt->v = 0;
             cbt->iface.value.data = &cbt->v;
@@ -297,7 +297,7 @@ new_page:
 
         __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
 restart_read:
-        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
         if (cbt->upd_value->type == WT_UPDATE_INVALID) {
             ++*skippedp;
             continue;
@@ -372,7 +372,7 @@ restart_read:
         cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
         __wt_upd_value_clear(cbt->upd_value);
         if (cbt->ins != NULL)
-            WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+            WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
         if (cbt->upd_value->type != WT_UPDATE_INVALID) {
             if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
                 if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE &&
@@ -514,7 +514,7 @@ restart_read_insert:
         if ((ins = cbt->ins) != NULL) {
             key->data = WT_INSERT_KEY(ins);
             key->size = WT_INSERT_KEY_SIZE(ins);
-            WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd, NULL));
+            WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd));
             if (cbt->upd_value->type == WT_UPDATE_INVALID) {
                 ++*skippedp;
                 continue;
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index fc7e542a12e..e4f538d80df 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -230,7 +230,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, bool *vali
      * update that's been deleted is not a valid key/value pair).
      */
     if (cbt->ins != NULL) {
-        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
         if (cbt->upd_value->type != WT_UPDATE_INVALID) {
             if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE)
                 return (0);
diff --git a/src/third_party/wiredtiger/src/conn/conn_capacity.c b/src/third_party/wiredtiger/src/conn/conn_capacity.c
index 2213f696e4f..2d0085dfb7b 100644
--- a/src/third_party/wiredtiger/src/conn/conn_capacity.c
+++ b/src/third_party/wiredtiger/src/conn/conn_capacity.c
@@ -72,7 +72,7 @@ __capacity_config(WT_SESSION_IMPL *session, const char *cfg[])
 static bool
 __capacity_server_run_chk(WT_SESSION_IMPL *session)
 {
-    return (F_ISSET(S2C(session), WT_CONN_SERVER_CAPACITY));
+    return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_CAPACITY));
 }
 
 /*
@@ -129,7 +129,7 @@ __capacity_server_start(WT_CONNECTION_IMPL *conn)
 {
     WT_SESSION_IMPL *session;
 
-    F_SET(conn, WT_CONN_SERVER_CAPACITY);
+    FLD_SET(conn->server_flags, WT_CONN_SERVER_CAPACITY);
 
     /*
      * The capacity server gets its own session.
@@ -196,7 +196,7 @@ __wt_capacity_server_destroy(WT_SESSION_IMPL *session)
 
     conn = S2C(session);
 
-    F_CLR(conn, WT_CONN_SERVER_CAPACITY);
+    FLD_CLR(conn->server_flags, WT_CONN_SERVER_CAPACITY);
     if (conn->capacity_tid_set) {
         __wt_cond_signal(session, conn->capacity_cond);
         WT_TRET(__wt_thread_join(session, &conn->capacity_tid));
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
index b71ac6c28f4..27a5121ab2e 100644
--- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -63,7 +63,7 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp)
 static bool
 __ckpt_server_run_chk(WT_SESSION_IMPL *session)
 {
-    return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT));
+    return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_CHECKPOINT));
 }
 
 /*
@@ -134,7 +134,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn)
     if (conn->ckpt_session != NULL)
         return (0);
 
-    F_SET(conn, WT_CONN_SERVER_CHECKPOINT);
+    FLD_SET(conn->server_flags, WT_CONN_SERVER_CHECKPOINT);
 
     /*
      * The checkpoint server gets its own session.
@@ -201,7 +201,7 @@ __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
 
     conn = S2C(session);
 
-    F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
+    FLD_CLR(conn->server_flags, WT_CONN_SERVER_CHECKPOINT);
     if (conn->ckpt_tid_set) {
         __wt_cond_signal(session, conn->ckpt_cond);
         WT_TRET(__wt_thread_join(session, &conn->ckpt_tid));
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index acba9ebb12c..0cb718d92f0 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -522,7 +522,7 @@ __wt_log_truncate_files(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool force)
     conn = S2C(session);
     if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
         return (0);
-    if (!force && F_ISSET(conn, WT_CONN_SERVER_LOG) &&
+    if (!force && FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG) &&
       FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
         WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running");
 
@@ -566,7 +566,7 @@ __log_file_server(void *arg)
     log = conn->log;
     locked = false;
     yield_count = 0;
-    while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
+    while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
         /*
          * If there is a log file to close, make sure any outstanding write operations have
          * completed, then fsync and close it.
@@ -838,7 +838,7 @@ __log_wrlsn_server(void *arg)
     log = conn->log;
     yield = 0;
     WT_INIT_LSN(&prev);
-    while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
+    while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
         /*
          * Write out any log record buffers if anything was done since last time. Only call the
          * function to walk the slots if the system is not idle. On an idle system the alloc_lsn
@@ -908,7 +908,7 @@ __log_server(void *arg)
      * records sitting in the buffer over the time it takes to sync out an earlier file.
      */
     did_work = true;
-    while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
+    while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
         /*
          * Slots depend on future activity. Force out buffered writes in case we are idle. This
          * cannot be part of the wrlsn thread because of interaction advancing the write_lsn and a
@@ -1036,7 +1036,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
     if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
         return (0);
 
-    F_SET(conn, WT_CONN_SERVER_LOG);
+    FLD_SET(conn->server_flags, WT_CONN_SERVER_LOG);
 
     /*
      * Start the log close thread. It is not configurable. If logging is enabled, this thread runs.
@@ -1104,7 +1104,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
 
     conn = S2C(session);
 
-    F_CLR(conn, WT_CONN_SERVER_LOG);
+    FLD_CLR(conn->server_flags, WT_CONN_SERVER_LOG);
 
     if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
         /*
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index 8d45b5eb678..0c93433a59e 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -77,7 +77,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
      * The LSM services are not shut down in this path (which is called when wiredtiger_open hits an
      * error (as well as during normal shutdown). Assert they're not running.
      */
-    WT_ASSERT(session, !F_ISSET(conn, WT_CONN_SERVER_LSM));
+    WT_ASSERT(session, !FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LSM));
 
     /* Shut down the subsystems, ensuring workers see the state change. */
     F_SET(conn, WT_CONN_CLOSING);
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
index 69989939c43..06ae5b14350 100644
--- a/src/third_party/wiredtiger/src/conn/conn_stat.c
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -528,7 +528,7 @@ __statlog_on_close(WT_SESSION_IMPL *session)
     if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE))
         return (0);
 
-    if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
+    if (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_STATISTICS))
         WT_RET_MSG(session, EINVAL, "Attempt to log statistics while a server is running");
 
     WT_RET(__wt_scr_alloc(session, strlen(conn->stat_path) + 128, &tmp));
@@ -547,7 +547,7 @@ err:
 static bool
 __statlog_server_run_chk(WT_SESSION_IMPL *session)
 {
-    return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS));
+    return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_STATISTICS));
 }
 
 /*
@@ -614,7 +614,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn)
     if (conn->stat_session != NULL)
         return (0);
 
-    F_SET(conn, WT_CONN_SERVER_STATISTICS);
+    FLD_SET(conn->server_flags, WT_CONN_SERVER_STATISTICS);
 
     /* The statistics log server gets its own session. */
     WT_RET(__wt_open_internal_session(conn, "statlog-server", true, 0, &conn->stat_session));
@@ -685,7 +685,7 @@ __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close)
     conn = S2C(session);
 
     /* Stop the server thread. */
-    F_CLR(conn, WT_CONN_SERVER_STATISTICS);
+    FLD_CLR(conn->server_flags, WT_CONN_SERVER_STATISTICS);
     if (conn->stat_tid_set) {
         __wt_cond_signal(session, conn->stat_cond);
         WT_TRET(__wt_thread_join(session, &conn->stat_tid));
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 0ab2b97e21e..7931f4e8a79 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -249,7 +249,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
 static bool
 __sweep_server_run_chk(WT_SESSION_IMPL *session)
 {
-    return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP));
+    return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_SWEEP));
 }
 
 /*
@@ -375,7 +375,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
     conn = S2C(session);
 
     /* Set first, the thread might run before we finish up. */
-    F_SET(conn, WT_CONN_SERVER_SWEEP);
+    FLD_SET(conn->server_flags, WT_CONN_SERVER_SWEEP);
 
     /*
      * Handle sweep does enough I/O it may be called upon to perform slow operations for the block
@@ -406,7 +406,7 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
 
     conn = S2C(session);
 
-    F_CLR(conn, WT_CONN_SERVER_SWEEP);
+    FLD_CLR(conn->server_flags, WT_CONN_SERVER_SWEEP);
     if (conn->sweep_tid_set) {
         __wt_cond_signal(session, conn->sweep_cond);
         WT_TRET(__wt_thread_join(session, &conn->sweep_tid));
diff --git a/src/third_party/wiredtiger/src/docs/arch-cursor.dox b/src/third_party/wiredtiger/src/docs/arch-cursor.dox
index 60e47c5a8ad..7eee860af86 100644
--- a/src/third_party/wiredtiger/src/docs/arch-cursor.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-cursor.dox
@@ -3,13 +3,393 @@
 Cursors are used in WiredTiger to get and modify data.
 A caller of WiredTiger uses WT_SESSION::open_cursor to create
 a WT_CURSOR.  Methods on the WT_CURSOR can then be used to
-position, iterate, get, and set data.
+position, iterate, get, and set data.  In the typical case, a cursor
+will be used to access keys and values in a Btree.  However, cursors
+can also used to access indexed data, WiredTiger statistics, log files,
+and metadata. Additionally, cursors are used for managing backups.
 
-Depending on the <code>uri</code> used when creating a cursor, the cursor will
-be internally implemented as one of the many cursor structures that include
-WT_CURSOR_BTREE, WT_CURSOR_BACKUP, WT_CURSOR_INDEX, WT_CURSOR_LOG,
-WT_CURSOR_METADATA, WT_CURSOR_STAT. Each of these structures starts
+The various kinds of cursors are created by the WT_SESSION::open_cursor call.
+Depending on the <code>uri</code> used when opening a cursor, the cursor will
+be implemented internally as one of the many cursor structures that include
+\c WT_CURSOR_BTREE, \c WT_CURSOR_BACKUP, \c WT_CURSOR_INDEX, \c WT_CURSOR_LOG,
+\c WT_CURSOR_METADATA, \c WT_CURSOR_STAT, \c WT_CURSOR_TABLE. Each of these structures starts
 with the common \c %WT_CURSOR structure, which contain all
 of the data and method pointers that make up the public part of the API.
+Thus, any one of these "extended cursor structs" can be allocated and
+returned as a WT_CURSOR pointer.  Since the method pointers are filled with
+a specific implementations, for example \c __curtable_reset, a call to
+WT_CURSOR::reset will call the specific implementation function, and the first
+argument can be cast to be a pointer to the cursor struct used by the implementation.
+Thus, in our C code, we have something similar to a class hierarchy, having
+an abstract base class (WT_CURSOR) with virtual methods, and a number of
+implementation classes.
+
+The code for each cursor type's methods are generally organized in a single file.
+For example, a backup cursor is implemented in \c src/cursor/cur_backup.c .
+Similarly, shared or utility cursor methods are defined in \c src/cursor/cur_std.c .
+
+Several cursor types have the concept of subordinate cursors, or child
+cursors.  For example, a table cursor is composed of subordinate file cursors,
+each representing a column group.  A metadata cursor has a subordinate
+cursor that is a file cursor on the metadata file.
+
+Every open cursor appears on a list in \c WT_SESSION->cursors. This list is ordered
+in a way that cursors are closed when the session is closed. Thus, if a subordinate
+cursor needs to be closed before its parent, it must be listed before the parent.
+
+@section arch_cursor_raw Data translation
+
+Cursors that expose Btree data, like file, table and index cursors, return a set of keys and values,
+translating encoded data to types that match the schema.  For example, if the \c value_format specified for
+a table is \c "iSi" (an integer, a null-terminated string, an integer), then data `{0, "abc", 5}` might
+be stored in the Btree in a packed format as:
+```
+0x80 0x61 0x62 0x63 0x00 0x85
+```
+When retrieved, the values are decoded and stored into typed variables whose addresses are passed to WT_CURSOR::get_value.
+
+As an aside, `0x80` represents `0` using a variable length encoding.  Using 0x80 as zero allows negative integers
+to be stored (\c -1 is \c 0x7f) in a way that they will sort before zero and positive integers.  Small integer values
+can be stored in a single byte. See comments in \c src/include/intpack_inline.h for more information.
+
+When creating a cursor via WT_SESSION::open_cursor, the *raw* flag can be used.  This has the effect
+of disabling the translation provided by the schema, transferring a single block of unencoded data
+for the key or value.
+
+@section arch_cursor_file File cursors
+
+File cursors (also known as Btree cursors) are one of fundamental kind of cursors, allowing
+direct accesses to WiredTiger Btrees. The implementation structure for a file
+cursor is \c WT_CURSOR_BTREE.  The file cursor methods are generally small wrappers around calls
+into the Btree layer, where the \c WT_CURSOR_BTREE structure is used.  The Btree layer handles
+all aspects of cursor positioning, and transfers of raw key and value data.
+
+@section arch_cursor_table Table and index cursors
+
+A table cursor is a higher level concept built on file cursors.  A WiredTiger table allows data
+to be physically split into separate Btrees, this is done via the concept of column groups.  Column
+groups may be defined that contain a set of named columns (`columns` are synonymous with `fields`).
+Each column group's columns are the stored in a single btree and may be looked up by the table's key.
+See @ref schema_column_groups for API details.
+
+Tables may also have indices. These are implemented as Btrees mapping the index key(s) to the
+main key(s) of the table.  Indices may be added after the data is populated in the main table.
+This requires the index to be filled at the time it is added.  Index cursors are implemented using
+\c WT_CURSOR_INDEX.  Methods on this cursor that set and get values know to use the value that appears
+in the index as a key into the main table.
+See @ref schema_indices for API details.
+
+Internally, we define the concept of a "simple" table as one that has no named columns.  *Columns*,
+that is the set of keys and values, may be optionally named when creating a table.
+With no named columns, column groups are not possible (as they must reference names),
+and all of a table's keys and values reside in a single file.
+Also without named columns, there is no possibility that a cursor can use projections.
+A "simple" table, by its nature, can be implemented by a single Btree file and needs no special translation.
+Thus, if a cursor is opened on a "simple" table, we can return a file cursor on the single file used
+to store its data, instead of a table cursor. This optimization means that every cursor method
+goes directly to the file cursor implementation, saving CPU time throughout the lifetime of the cursor.
+
+@section arch_cursor_projections Projections and plans
+
+Projections are an indication, when a cursor is opened, that the indicated values, and possibly some keys,
+should be returned by WT_CURSOR::get_value calls.  This is only available for table cursors.
+If the table was configured with column groups, a projection has a bearing on which column group
+files must be opened in a cursor.  When a subset of values is returned, it's possible that some
+column groups will not be needed.  To implement projections and column groups, cursors use a *plan*.
+
+A plan is a string that indicates a series of actions that must be taken to retrieve the needed
+values from the subordinate cursors in the table cursor.  Remember that each column group gets
+its own cursor.  When a table is created, a default plan is created that asks to copy all
+columns from each column group cursor in order.
+When a projection is used for a cursor, a more complex plan may be created.
+A plan contains numbers and action letters.  The numbers are arguments, the
+action letters are commands.  The actions 'k' and 'v' indicate that the subordinate
+cursor indicated by the numeric argument will be used to get keys or values that follow.
+The 'n' action gets one or more next key or value columns from the cursor. The 's' skips
+one or more columns. Switching to another cursor resets the cursor to point at the first
+column in the indicated cursor.  This allows a way to find keys and values in arbitrary
+order.
+
+As a contrived example, suppose we have a table with two keys
+\c "k1,k2" and four values \c "x1,x2,x3,x4".  The column groups are created as follows:
+```
+    session->create(session, "colgroup:main:c1", "columns=(x1,x2)");
+    session->create(session, "colgroup:main:c2", "columns=(x3,x4)");
+```
+The column group \c "main:c1" has keys "k1,k2" and values "x1,x2";
+the column group \c "main:c2" has keys "k1,k2" and values "x3,x4".  A table cursor
+on the main table will have two sub-cursors, one for each column group.
+Now consider the ill-considered projection that is opened as:
+```
+    session->open_cursor(session, "table:main(x1,x4,x3,k2,k1,x2,x4)", NULL, NULL, &cursor);
+
+```
+In this case, the plan is this string (with spacing added):
+```
+0v n 1v s n 1v n 0k s n 0k n 0v s n 1v s n
+```
+To break it down,
+- \c "0v" means use sub-cursor 0 and prepare to read values.
+- \c "n" means get the next value, that is \c "x1".
+- \c "1v" means use sub-cursor 1 and prepare to read values.
+- \c "s" means skip a value, that is \c "x3".
+- \c "n" means get the next value, that is \c "x4".
+- \c ...
+
+Notice that this plan requires many switches of cursors and several \c "s" (skip) operations.
+Each skip involves enough decoding of the data item to determine its length so its data can
+be skipped over.
+
+With the default (complete) projection, getting values is fast.  Using the default plan, the needed
+columns are pulled out of each subordinate cursor one by one, and get copied to the caller's arguments.
+With projections, the simple algorithm following the plan works well if the columns in the projection
+are grouped by column group and requested in order.  Without that discipline, as in this example,
+the performance will not be optimal.
+
+The implementation of plan creation and execution resides in the @ref arch-schema.
+
+@section arch_cursor_dump Dump cursors
+
+Dump cursors are used in two rather different ways.  Regular dump cursors retrieve the raw keys and values
+and translate bytes either as raw characters or as hex values.  This flavor of dump cursor is used
+by the `wt dump` or `wt dump -x` utility.  JSON dump cursors do more sophisticated translation, returning a string that is a JSON
+formatted record, with the name for each key and its data and the name for each value and its corresponding data.
+Data is translated to either integral, floating point, or string depending on the format of the column in the schema.
+This flavor of dump cursor is used by the `wt dump -j` command as part of creating a JSON dump of a WiredTiger table or file.
+
+There is a single \c WT_CURSOR_DUMP struct that is used to implement both flavors.
+The dump code checks for the \c WT_CURSTD_DUMP_JSON flag and as needed, calls into functions
+in `src/cursor/cur_json.c` .  The code in that file also implements several external functions that are used
+by the `wt` utility when loading JSON-dumped files.  In particular, the \c __wt_json_token function
+returns individual JSON tokens from an input string.
+
+The JSON code used by the dump cursor uses some storage that hangs off of `cursor->json_private` which
+is typed as \c WT_CURSOR_JSON.  When a JSON flavored cursor is created, the list of key column names
+and value column names is populated in \c WT_CURSOR_JSON.  These names, obtained from the
+configuration string that created the table or file, are useful to have in advance,
+as they are used once per row to help fill out the JSON output.  The functions that get
+rows iterate these names and unpack the corresponding column data,
+converting them into the appropriate JSON format for the data.
+
+@section arch_cursor_backup Backup cursors
+
+A backup cursor is used to manage backups. It is implemented using a \c WT_CURSOR_BACKUP structure.
+A backup cursor can be configured to do a full backup or an incremental backup.
+First, we'll look at full backups.
+
+A backup cursor for a full backup returns the set of files that need to be copied to achieve the
+backup.  The backup cursor, when opened, ensures that it is the only backup cursor running
+in the system and returns an error if not.  This is managed using the
+\c WT_CONNECTION_IMPL->hot_backup_start variable, which can only be accessed when the
+connection schema lock is held.  A non-zero value means a hot backup is in progress.
+Closing the backup cursor sets it back to zero.
+
+Having an open backup influences actions
+elsewhere in WiredTiger, since part of the backup protocol involves the application copying
+whole data files.  Thus, having an open backup may cause the block manager and log file server
+thread to avoid truncating data and log files.  A truncation of a file being copied by the
+application would be unexpected. Also, open checkpoints are not deleted during the
+course of a backup.
+
+When the backup cursor is initialized, the complete set of files needed to back up is generated
+and stored in the cursor.  This makes the backup's \c next function easy as it just returns the next
+file in the list.
+
+Incremental backups work much the same way, except that the file list is reduced to files that
+have changed since a previous backup referenced in the configuration when the cursor is opened.
+The other twist is that for each file returned, the caller does a duplicate operation on the backup
+cursor, and the duplicate code actually returns a specialized incremental backup cursor.  This kind
+of cursor has its own \c next method that causes it to return information about individual pieces
+of this file that need to be copied.  The code to implement incremental cursors
+is in \c src/cursor/cur_backup_incr.c .
+
+@section arch_cursor_join Join cursors
+
+Join cursors implement a join mechanism for WiredTiger.  The idea is that joins can be configured
+by opening a special join cursor on a table, and attaching the table's index cursors to it, to
+return rows that match a filter, like:
+```
+(row.price > 100 and row.price <= 200) and row.in_stock < 10
+```
+Building up the conditions essentially creates a tree that is used for evaluation.  This is stored
+in \c WT_CURSOR_JOIN, and is returned as the cursor object.  Entries in the tree representing
+an index's participation in one clause are stored in \c WT_CURSOR_JOIN_ENTRY objects.  \c WT_CURSOR_JOIN and
+\c WT_CURSOR_JOIN_ENTRY objects can be composed in a hierarchical manner, representing the shape of the tree representing
+the query.  These two structures are somewhat static, being created when the join is created.
+
+The join cursor contains a pointer to a \c WT_CURSOR_JOIN_ITER, which in some sense encodes the "position"
+of the cursor.  To get the next row that satisfies a join requires that multiple cursors be iterated.
+Generally, the "left-most" index cursor is iterated first.  Using the example above, the \c price
+index would be iterated, skipping over any entries that did not satisfy `(row.price > 100 and row.price <= 200)`.
+Then other cursors are checked to see that any other conditions are satisfied.
+The join may be configured to use Bloom filters, and when that occurs,
+\c WT_CURSOR_JOIN_ENTRIES contain the bloom filter for individual index checks.  This allows checks
+to occur quickly, at the expense of an initialization that occurs when the cursor begins iteration.
+
+During a join cursor iteration, multiple \c WT_CURSOR_JOIN_ITER objects may be created.
+There is one \c WT_CURSOR_JOIN_ITER object corresponding to each level of nesting (queries can have
+arbitrary nesting of \c AND/OR).  Each \c WT_CURSOR_JOIN_ITER allows the question to be asked: does
+the current position of the main table cursor satisfy the join conditions for this part of the join tree?
+
+Part of the reason for this dynamic structure is that disjunctions may require some dynamic action.
+Consider
+```
+(row.price > 100 and row.in_stock < 10) .... or (row.aisle == 12 and row.on_sale == 1)
+```
+The last part of the query (being part of an OR clause) will be first executed after a number of items
+are returned that satisfy the first part of the query.  If the join cursor is closed before then,
+it would be a waste to have opened subordinate cursors on the \c aisle and \c on_sale indices, and
+potentially computed bloom filters, etc.
+
+When positioned on an entry that represents a nested join, a new child \c WT_CURSOR_JOIN_ITER is
+created that will be bound to the nested \c WT_CURSOR_JOIN. That iterator is then used to generate candidate
+primary keys. When its iteration is completed, that iterator is destroyed and the parent iterator
+advances to the next entry. Thus, depending on how deeply joins are nested, a similarly deep
+stack of iterators is created.
+
+@section arch_cursor_duplicate Duplicating cursors
+
+Cursors may be duplicated, this occurs by passing a cursor to be duplicated as part of the
+WT_SESSION::open_cursor call.  Cursor duplication does not occur in the cursor type code.
+Rather, a new cursor of the requested type is created, and the cursor's position is duplicated
+via a call to \c __wt_cursor_dup_position.  This function gets the key from the original cursor
+in *raw* form (not converting it using \c key_format), sets the key in the new cursor, and
+does a search to set the position properly.
+
+@section arch_cursor_dhandle File cursors, Btrees and data handles
+
+File cursors, Btrees and data handles exist in a WiredTiger system as different ways to
+reference the data in a Btree.  It is useful to understand the differences between these structures
+and how they are used.
+
+At the bottom is the data handle (also known as *dhandle*). This is an abstraction of an operating
+system file handle, with a set of flags and some reference counts.  A Btree is a much larger
+abstraction, with a memory cache of key value pairs along with functions to read and write data
+as needed to and from the data file.  A Btree is paired with a data handle to allow the transfer of data.
+Both the data handle and the Btree are owned by the connection.  That is, they are shared among all sessions.
+
+File cursors, on the other hand, are owned by a session.  When a session opens a cursor on a file for the
+first time in that session, a file cursor is created.  This occurs even if the file may be opened by cursors
+in other sessions already.  The session owns the cursor, and the cursor may only be used by that session.
+Open cursors do increment reference counts in the data handle, so that the data handle "knows" it is
+being used, so that the file may not be dropped, renamed, verified or salvaged.
+So when WT_CURSOR::close is called for a file cursor, the cursor's memory may be released (or
+retained if cached), and reference counts decremented.  Other sessions may retain open cursors on that file,
+they are independent.
+
+@section arch_cursor_caching Cursor caching
+
+Cursors, upon closing, may be cached in the session.  An open of the same URI will return a cached
+cursor if one is found matching the URI.  Cursor caching is currently only done on file cursors.
+Because of the optimization for simple tables described above, cursors on simple tables are also cached.
+
+To help implement caching, two methods, \c cache and \c reopen have been added to the cursor API.
+These are not public.  Their function is to perform cursor-type specific operations to change a cursor
+from an open state to a cached state (\c cache) and change from a cached state to the open state (\c reopen).
+
+When a cursor is opened the first time, it is marked as *cacheable* or not.  Cursors that specify
+certain options, like bulk loading, random, or readonly, are not cacheable. When a cursor is closed, the
+cursor is checked if it is cacheable.  If so and if cursor caching is enabled in the session, then it will be
+cached. Cached cursors live in a hash table that is owned by the session.  A hash function on the URI is used
+to determine which hash bucket to use.  We compute the hash of the URI once, its value is stored in
+in the cursor for future use.  Thus, caching a cursor (what happens within the type-specific \c cache function
+(e.g. \c __curfile_cache) is relatively quick:
+
+- Free storage that we don't want held (for example, storage used by the cursor's key and value).
+- Get a *weak* reference to the data handle (increment \c dhandle->session_ref).
+- Release the *strong* reference to the data handle (decrement \c dhandle->session_inuse).
+- Determine the hash bucket needed (using the hash value in the cursor).
+- Move the cursor from the session's open list to the list in the hash bucket.
+- Increment statistics and decrement the connection's open cursor count.
+- Set the cache cursor flag.
+- Unlock the dhandle.
+
+The change of reference from a *strong* to a *weak* is significant.  When a dhandle's \c session_inuse
+(*strong* reference) drops to zero, it means that no cursor is open on the dhandle, and the only
+references are from cached cursors. In this state, the dhandle may be marked dead by the dhandle sweep.
+When the dhandle is dead, the dhandle's memory will still persist, but each session will eventually
+notice, during its cursor cache sweep, and "fully close" the cursor, removing it from the cache list,
+releasing its weak reference before freeing the cursor.  Each session holding cached cursors must have
+some periodic activity that causes it to run its sweep, an occasional call to WT_SESSION::reset will suffice.
+After a dhandle has been dead for enough time, it is expected that all of its weak references will drop to zero,
+and the dhandle itself can be freed by the dhandle sweep.
+
+If there is a failure during the \c cache function, then we would want to fully close the cursor.
+Rather than having special case code to handle this rare condition, we instead call \c reopen to
+temporarily bring the cursor back to an open state, and turn cursor caching off temporarily in
+the session while we close the cursor, releasing all its resources and references.
+
+During a cursor open, if the cursor configuration options allow caching, we hash the uri, and look
+at the corresponding hash bucket in the session cursor cache.  If we find a matching cursor, we
+call \c reopen on the cursor.  This is what happens within the call to the type-specific \c reopen function
+(e.g. \c __curfile_reopen):
+
+- Lock the dhandle.
+- If the dhandle is no longer open, release it and mark the reopen to fail (but continue).
+- Get a "strong" reference to the data handle (increment \c dhandle->session_inuse).
+- Release the "weak" reference to the data handle (decrement \c dhandle->session_ref).
+- Increment statistics and the connection's open cursor count.
+- Move the handle from the hash bucket to the session's open list (the hash value was previously saved in the cursor)
+- Clear the cache cursor flag.
+- Update convenient pointers within the cursor to parts of the WT_BTREE that may have changed
+  which the cursor was cached.
+
+If the reopen fails (probably due to the dhandle no longer being open or being marked dead),
+we have ensured that enough of the cursor is opened so that it can be legally closed.
+We have studiously avoided having a cursor that is in some state that is half-open and half-closed,
+as it is hard to know how to dispose of it.
+
+@section arch_cursor_sweep Session cursor sweep
+
+Consider a large system that has many sessions using the same set of tables. When all sessions
+have closed a particular table, there will be no need to keep the underlying data handle open.
+In fact, if a WT_SESSION::drop call is called, we want to ensure that the data handle has been
+closed and the corresponding file is removed.  Some systems, like Windows, require that all open
+file handles be closed before a file can be successfully removed from the file system.  So there
+is motivation to periodically mark data handles that have no active references, and have sessions
+free cached cursors that have weak references to such marked data handles.  The former job
+occurs in the connection sweep code.  The latter job occurs in the session cursor cache sweep.
+
+The session cursor cache sweep currently happens in the WT_CURSOR::close call,
+and also on calls to WT_SESSION::reset.  On one hand, we don't want the overhead
+of a sweep to occur too frequently. It is quite possible that both close and reset can be called a lot
+and there may be many of thousands of cached cursors in the session.  For that reason, we'd like
+to do the sweep in small increments, and not on each call.  On the other hand, in a larger system,
+a session may be part of a pool servicing higher level requests.  When a session completes its work,
+it may be left idle by the caller of WiredTiger, and such a session may
+then be idle for long periods of time.  When cursor caching is enabled and sessions are not active,
+we want even occasional calls to WT_SESSION::reset to have a strong effect. We want occasional
+sweeps to keep up with freeing up references to data handles, so that otherwise unused data handles
+may in turn be freed eventually.
+
+Our solution to this is three-fold.  First, every time we want to call the sweep, a countdown
+counter is used, so we only consider a sweep every \c WT_SESSION_CURSOR_SWEEP_COUNTDOWN times
+(currently `WT_SESSION_CURSOR_SWEEP_COUNTDOWN == 40`).  Secondly, we won't sweep if it's already
+been done this second in time.  Finally, we sweep by walking a small set of buckets, initially 5 out of
+typically 512 configured buckets. However, depending on how productive our sweep is, that is,
+how many references to closed data handles are freed, we may continue our walk.  This should
+usually strike a good balance between not having a lot of overhead for sweeps, and keeping up with
+the need to free up shared resources.
+
+@section arch_cursor_debug_copy Debug copy
+
+When cursors are positioned, their data may point to data in the btree or data allocated in the cursor.
+A caller may use a pointer to that data until the next cursor call.  After that, the pointer should not be
+considered valid.  By default, WiredTiger does not enforce this.  When opening a cursor, a \c "debug=copy" configuration
+flag can be used.  This forces any data that is returned by WT_CURSOR::get_key or WT_CURSOR::get_value
+to be in malloc'd memory, and explicitly freed on the next API call.  Systems that
+are instrumented to track memory references can detect the references to freed memory, thus latent
+bugs can be detected.
+
+The implementation is straightforward.  The key and value in each cursor is represented as a \c WT_ITEM.
+A \c WT_ITEM includes a pointer and size, and it can point to arbitrary memory. However, the \c WT_ITEM
+also includes a memory buffer that may or may not be allocated.  When the \c WT_ITEM pointer points
+to the item's own memory buffer, then it is already in malloc'd memory.  When \c "debug=copy" is
+configured, it is a simple matter to check if a key and value being returned are already in the item's malloc'd memory.
+If not, memory is allocated, the copy is made and the item's pointer is updated.  On the beginning
+of the next API call using that cursor, the item's malloc'd memory is overwritten and freed.
+Thus, in the presence of a memory tracker, uses
+of "stray" pointers will be detected.  Even without a memory tracker, uses of "stray" pointers into
+the freed storage will likely yield the overwritten bytes, and not the previously seen key or value.
 
 */
diff --git a/src/third_party/wiredtiger/src/docs/arch-index.dox b/src/third_party/wiredtiger/src/docs/arch-index.dox
index 21af5d1f62f..52ebc0120d4 100644
--- a/src/third_party/wiredtiger/src/docs/arch-index.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-index.dox
@@ -184,6 +184,10 @@ WiredTiger has a Python API that is useful for scripting and experimentation.
 
 Row Stores are Btrees that have a variable size key and data.
 
+@subpage arch-rts
+
+Rollback to stable to remove the unstable updates from the database.
+
 @subpage arch-schema
 
 A schema defines the format of the application data in WiredTiger.
diff --git a/src/third_party/wiredtiger/src/docs/arch-rts.dox b/src/third_party/wiredtiger/src/docs/arch-rts.dox
new file mode 100644
index 00000000000..451732f5776
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/arch-rts.dox
@@ -0,0 +1,113 @@
+/*! @arch_page arch-rts Rollback to stable
+
+Rollback to stable is an operation that retains only the modifications that
+are stable according to the stable timestamp and recovered checkpoint snapshot.
+The checkpoint transaction snapshot details are saved at the end of every
+checkpoint are recovered and used as a recovered checkpoint snapshot.
+
+@section rts-overview Overview of rollback to stable
+
+Rollback to stable scans each and every table present in the database except
+metadata @ref arch-metadata to remove the modifications from the table
+that are more than stable timestamp and recovered checkpoint snapshot.
+
+In the process of removing newer modifications from the table, all the in-memory
+updates are aborted and the on-disk version updates are replaced with an update
+from history store otherwise the data store version is removed.
+
+Rollback to stable is performed in three phases
+1. WT startup
+2. WT shutdown
+3. Application initiated
+
+To improve the performance of rollback to stable operation, rollback to stable
+will perform only on particular tables that need rollback. Rollback to stable
+doesn't operate on logged tables as the updates on these are stable when the
+transaction gets committed.
+
+@section rts-stable-update Stable update of rollback to stable
+
+According to rollback to stable, the stable version of update is an update that
+has durable timestamp of less than or equal to the stable timestamp and it's
+transaction id must be committed according to the checkpoint snapshot.
+
+@section rts-preconditions Pre-conditions required for rollback to stable
+
+To perform rollback to stable, there shouldn't be any transaction activity happening
+in the WiredTiger.
+
+@section rts-table-check Checks performed on a table by rollback to stable
+
+Rollback to stable consider a table to be processed for rollback based on the following
+conditions.
+1. Table is modified
+2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp.
+3. There is no durable timestamp in any checkpoint.
+4. Has prepared updates
+5. Has updates from transactions greater than checkpoint snapshot (only in restart phase)
+
+There are some special conditions where the table is skipped for rollback to stable.
+1. Empty file
+2. Table has timestamp updates but there is no stable timestamp set.
+3. Files that don't exist
+4. Files that are corrupted.
+
+If the table has no timestamp updates, then the history store table is
+scanned to remove any historical versions related to this table as these
+older versions no longer required.
+
+Once all the tables are process for rollback to stable, at the end the history store
+is processed to remove any unstable updates more than the stable timestamp.
+
+@section rts-how How rollback to stable fixes the unstable updates
+
+Once a table is identified to perform rollback to stable, it reads the pages into
+the cache if they don't exist and process it to remove the unstable updates.
+
+There are two types of rollbacks that rollback to stable performs:
+1. Rolling back unstable fast truncate
+2. Rolling back unstable updates
+
+All internal pages are traversed to rollback unstable fast truncate operations and
+leaf pages are traversed to remove the unstable updates.
+
+Once the leaf page is identified to rollback the updates, it is performed in the
+following order.
+1. Check smallest insert lists on the page
+2. Traverse through all the on-disk keys
+    a. Check update list
+    b. Check insert list
+    c. Check the on-disk version if no stable update found in the update list.
+3. Traverse through the reconciled pages to abort any history store updates.
+
+@section rts-abort-update How rollback to stable aborts in-memory updates
+
+Traverse through all the updates in the update list and abort them until a stable
+update is found.
+
+@section rts-abort-on-disk-update How rollback to stable aborts on-disk update
+
+If the start time pair is not stable try to find a valid update from the history
+store that is stable to replace the on-disk version otherwise remove the on-disk
+key. In case if the stop time pair if exists and if its not stable, restore the
+on-disk update again into the update list.
+
+To remove any existing update, rollback to stable adds globally visible tombstone to
+the key update list and this key will get removed later during the reconciliation.
+
+Note: As of now, rollback to stable don't work on removing on-disk columnar updates.
+
+@section rts-hs-search Rollback to stable history store search
+
+Rollback to stable searches the history store to find a stable update to replace an
+unstable update in the data store. It searches the history store with the given data
+store key with a maximum timestamp and traverse back till a stable update found. If
+no valid update is found the data store key is removed.
+
+@section rts-page-skip Skipping reading unnecessary pages into memory
+
+Rollback to stable doesn't load the pages that don't have any unstable updates to be
+removed to improve the performance of rollback to stable by verifying the time aggregated
+values with the stable timestamp or recovered checkpoint snapshot during the tree walk.
+
+*/
diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok
index be45196ff97..5a37d260c1e 100644
--- a/src/third_party/wiredtiger/src/docs/spell.ok
+++ b/src/third_party/wiredtiger/src/docs/spell.ok
@@ -17,6 +17,7 @@ Christoph
 Collet's
 Coverity
 Coverity's
+CURSTD
 DB's
 DBTs
 DHANDLE
@@ -51,6 +52,8 @@ Google's
 HyperDex
 HyperLevelDB
 IEC
+IMPL
+ITER
 JDK
 JIRA
 JavaScript
@@ -114,6 +117,7 @@ Yann
 Za
 Zstd
 aR
+abc
 abstime
 ack'ed
 ajn
@@ -149,6 +153,7 @@ bufs
 builtin
 builtins
 bzip
+cacheable
 cachesize
 calc
 callbk
@@ -180,6 +185,7 @@ crashless
 crc
 curfile
 cursortype
+curtable
 customerABC
 cv
 dN
@@ -197,6 +203,7 @@ dbformat
 dbm
 dbt
 decl
+decrement
 decrementing
 decrypt
 decrypted
@@ -291,6 +298,7 @@ html
 htmlinclude
 huffman
 hugepage
+iSi
 icount
 ie
 iflag
@@ -299,6 +307,7 @@ indices
 init
 insn
 intl
+intpack
 inuse
 io
 ip
@@ -469,6 +478,7 @@ rmw
 ro
 rotn
 rpc
+rts
 runnable
 runtime
 rwlock
@@ -497,6 +507,7 @@ src
 ssd
 startsync
 startuml
+startup
 statlog
 stderr
 stdout
diff --git a/src/third_party/wiredtiger/src/history/hs_cursor.c b/src/third_party/wiredtiger/src/history/hs_cursor.c
index 333ec0e25bb..6baaf94847c 100644
--- a/src/third_party/wiredtiger/src/history/hs_cursor.c
+++ b/src/third_party/wiredtiger/src/history/hs_cursor.c
@@ -180,7 +180,7 @@ __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t bt
 static int
 __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
   const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare,
-  WT_ITEM *on_disk_buf, WT_TIME_WINDOW *on_disk_tw)
+  WT_ITEM *base_value_buf)
 {
     WT_CURSOR *hs_cursor;
     WT_CURSOR_BTREE *hs_cbt;
@@ -333,14 +333,14 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
             WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true);
             if (ret == WT_NOTFOUND) {
                 /*
-                 * Fallback to the onpage value as the base value.
+                 * Fallback to the provided value as the base value.
                  *
                  * Work around of clang analyzer complaining the value is never read as it is reset
                  * again by the following WT_ERR macro.
                  */
                 WT_NOT_READ(ret, 0);
                 orig_hs_value_buf = hs_value;
-                hs_value = on_disk_buf;
+                hs_value = base_value_buf;
                 upd_type = WT_UPDATE_STANDARD;
                 break;
             }
@@ -356,9 +356,9 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
               hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts_tmp, &hs_counter_tmp));
 
             if (hs_btree_id != btree_id) {
-                /* Fallback to the onpage value as the base value. */
+                /* Fallback to the provided value as the base value. */
                 orig_hs_value_buf = hs_value;
-                hs_value = on_disk_buf;
+                hs_value = base_value_buf;
                 upd_type = WT_UPDATE_STANDARD;
                 break;
             }
@@ -366,24 +366,22 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
             WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
 
             if (cmp != 0) {
-                /* Fallback to the onpage value as the base value. */
+                /* Fallback to the provided value as the base value. */
                 orig_hs_value_buf = hs_value;
-                hs_value = on_disk_buf;
+                hs_value = base_value_buf;
                 upd_type = WT_UPDATE_STANDARD;
                 break;
             }
 
             /*
-             * If we find a history store record that either corresponds to the on-disk value or is
-             * newer than it then we should use the on-disk value as the base value and apply our
-             * modifies on top of it.
+             * If the stop time pair on the tombstone in the history store is already globally
+             * visible fall back to the base value. This is possible in scenarios where the latest
+             * updates are aborted by RTS according to stable timestamp.
              */
-            if (on_disk_tw->start_ts < hs_start_ts_tmp ||
-              (on_disk_tw->start_ts == hs_start_ts_tmp &&
-                on_disk_tw->start_txn <= hs_cbt->upd_value->tw.start_txn)) {
-                /* Fallback to the onpage value as the base value. */
+            if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
+                /* Fallback to the provided value as the base value. */
                 orig_hs_value_buf = hs_value;
-                hs_value = on_disk_buf;
+                hs_value = base_value_buf;
                 upd_type = WT_UPDATE_STANDARD;
                 break;
             }
@@ -451,7 +449,7 @@ err:
  */
 int
 __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, uint64_t recno,
-  WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf, WT_TIME_WINDOW *on_disk_tw)
+  WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf)
 {
     WT_BTREE *btree;
     WT_DECL_RET;
@@ -460,8 +458,8 @@ __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma
 
     WT_RET(__wt_hs_cursor_open(session));
     WT_WITH_BTREE(session, CUR2BT(session->hs_cursor),
-      (ret = __hs_find_upd_int(session, btree->id, key, value_format, recno, upd_value,
-         allow_prepare, on_disk_buf, on_disk_tw)));
+      (ret = __hs_find_upd_int(
+         session, btree->id, key, value_format, recno, upd_value, allow_prepare, base_value_buf)));
     WT_TRET(__wt_hs_cursor_close(session));
     return (ret);
 }
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index f11210061b1..79addcfd048 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -252,18 +252,19 @@ struct __wt_btree {
  * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't automatically generate these
  * flag values for that reason, there's no way to start at an offset.
  */
-#define WT_BTREE_ALTER 0x000100u         /* Handle is for alter */
-#define WT_BTREE_BULK 0x000200u          /* Bulk-load handle */
-#define WT_BTREE_CLOSED 0x000400u        /* Handle closed */
-#define WT_BTREE_IGNORE_CACHE 0x000800u  /* Cache-resident object */
-#define WT_BTREE_IN_MEMORY 0x001000u     /* Cache-resident object */
-#define WT_BTREE_NO_CHECKPOINT 0x002000u /* Disable checkpoints */
-#define WT_BTREE_NO_LOGGING 0x004000u    /* Disable logging */
-#define WT_BTREE_READONLY 0x008000u      /* Handle is readonly */
-#define WT_BTREE_SALVAGE 0x010000u       /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x020000u     /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x040000u       /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x080000u        /* Handle is for verify */
+#define WT_BTREE_ALTER 0x000100u          /* Handle is for alter */
+#define WT_BTREE_BULK 0x000200u           /* Bulk-load handle */
+#define WT_BTREE_CLOSED 0x000400u         /* Handle closed */
+#define WT_BTREE_IGNORE_CACHE 0x000800u   /* Cache-resident object */
+#define WT_BTREE_IN_MEMORY 0x001000u      /* Cache-resident object */
+#define WT_BTREE_NO_CHECKPOINT 0x002000u  /* Disable checkpoints */
+#define WT_BTREE_NO_LOGGING 0x004000u     /* Disable logging */
+#define WT_BTREE_OBSOLETE_PAGES 0x008000u /* Handle has obsolete pages */
+#define WT_BTREE_READONLY 0x010000u       /* Handle is readonly */
+#define WT_BTREE_SALVAGE 0x020000u        /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x040000u      /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x080000u        /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x100000u         /* Handle is for verify */
     uint32_t flags;
 };
 
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index f62570a8041..5af42c1a0e7 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -548,35 +548,42 @@ struct __wt_connection_impl {
      */
     WT_FILE_SYSTEM *file_system;
 
+/*
+ * Server subsystem flags.
+ */
+/* AUTOMATIC FLAG VALUE GENERATION START */
+#define WT_CONN_SERVER_CAPACITY 0x01u
+#define WT_CONN_SERVER_CHECKPOINT 0x02u
+#define WT_CONN_SERVER_LOG 0x04u
+#define WT_CONN_SERVER_LSM 0x08u
+#define WT_CONN_SERVER_STATISTICS 0x10u
+#define WT_CONN_SERVER_SWEEP 0x20u
+    /* AUTOMATIC FLAG VALUE GENERATION STOP */
+    uint32_t server_flags;
+
 /* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_CONN_CACHE_CURSORS 0x0000001u
-#define WT_CONN_CACHE_POOL 0x0000002u
-#define WT_CONN_CKPT_SYNC 0x0000004u
-#define WT_CONN_CLOSING 0x0000008u
-#define WT_CONN_CLOSING_NO_MORE_OPENS 0x0000010u
-#define WT_CONN_CLOSING_TIMESTAMP 0x0000020u
-#define WT_CONN_COMPATIBILITY 0x0000040u
-#define WT_CONN_DATA_CORRUPTION 0x0000080u
-#define WT_CONN_EVICTION_RUN 0x0000100u
-#define WT_CONN_FILE_CLOSE_SYNC 0x0000200u
-#define WT_CONN_HS_OPEN 0x0000400u
-#define WT_CONN_INCR_BACKUP 0x0000800u
-#define WT_CONN_IN_MEMORY 0x0001000u
-#define WT_CONN_LEAK_MEMORY 0x0002000u
-#define WT_CONN_LSM_MERGE 0x0004000u
-#define WT_CONN_OPTRACK 0x0008000u
-#define WT_CONN_PANIC 0x0010000u
-#define WT_CONN_READONLY 0x0020000u
-#define WT_CONN_RECONFIGURING 0x0040000u
-#define WT_CONN_RECOVERING 0x0080000u
-#define WT_CONN_SALVAGE 0x0100000u
-#define WT_CONN_SERVER_CAPACITY 0x0200000u
-#define WT_CONN_SERVER_CHECKPOINT 0x0400000u
-#define WT_CONN_SERVER_LOG 0x0800000u
-#define WT_CONN_SERVER_LSM 0x1000000u
-#define WT_CONN_SERVER_STATISTICS 0x2000000u
-#define WT_CONN_SERVER_SWEEP 0x4000000u
-#define WT_CONN_WAS_BACKUP 0x8000000u
+#define WT_CONN_CACHE_CURSORS 0x000001u
+#define WT_CONN_CACHE_POOL 0x000002u
+#define WT_CONN_CKPT_SYNC 0x000004u
+#define WT_CONN_CLOSING 0x000008u
+#define WT_CONN_CLOSING_NO_MORE_OPENS 0x000010u
+#define WT_CONN_CLOSING_TIMESTAMP 0x000020u
+#define WT_CONN_COMPATIBILITY 0x000040u
+#define WT_CONN_DATA_CORRUPTION 0x000080u
+#define WT_CONN_EVICTION_RUN 0x000100u
+#define WT_CONN_FILE_CLOSE_SYNC 0x000200u
+#define WT_CONN_HS_OPEN 0x000400u
+#define WT_CONN_INCR_BACKUP 0x000800u
+#define WT_CONN_IN_MEMORY 0x001000u
+#define WT_CONN_LEAK_MEMORY 0x002000u
+#define WT_CONN_LSM_MERGE 0x004000u
+#define WT_CONN_OPTRACK 0x008000u
+#define WT_CONN_PANIC 0x010000u
+#define WT_CONN_READONLY 0x020000u
+#define WT_CONN_RECONFIGURING 0x040000u
+#define WT_CONN_RECOVERING 0x080000u
+#define WT_CONN_SALVAGE 0x100000u
+#define WT_CONN_WAS_BACKUP 0x200000u
     /* AUTOMATIC FLAG VALUE GENERATION STOP */
     uint32_t flags;
 };
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 68e636cdebf..405ae73401b 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -763,8 +763,8 @@ extern int __wt_hs_delete_key_from_ts(
   WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format,
-  uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf,
-  WT_TIME_WINDOW *on_disk_tw) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+  uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf)
+  WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern int __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
@@ -2070,7 +2070,10 @@ static inline int __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
   uint64_t recno, WT_UPDATE *upd, WT_CELL_UNPACK_KV *vpack)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 static inline int __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
-  WT_UPDATE *upd, WT_UPDATE **prepare_updp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+  WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline int __wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+  WT_UPDATE *upd, WT_UPDATE **prepare_updp, WT_UPDATE **restored_updp)
+  WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 static inline int __wt_txn_search_check(WT_SESSION_IMPL *session)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 static inline int __wt_txn_update_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index e1c6cea488a..3635dd41d1b 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -783,8 +783,10 @@ struct __wt_connection_stats {
     int64_t txn_rts_keys_removed;
     int64_t txn_rts_keys_restored;
     int64_t txn_rts_hs_restore_tombstones;
+    int64_t txn_rts_hs_restore_updates;
     int64_t txn_rts_sweep_hs_keys;
     int64_t txn_rts_hs_removed;
+    int64_t txn_checkpoint_obsolete_applied;
     int64_t txn_update_conflict;
 };
 
@@ -996,8 +998,10 @@ struct __wt_dsrc_stats {
     int64_t txn_rts_keys_removed;
     int64_t txn_rts_keys_restored;
     int64_t txn_rts_hs_restore_tombstones;
+    int64_t txn_rts_hs_restore_updates;
     int64_t txn_rts_sweep_hs_keys;
     int64_t txn_rts_hs_removed;
+    int64_t txn_checkpoint_obsolete_applied;
     int64_t txn_update_conflict;
 };
 
diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h
index 9aa86728d59..f4a4c552ddb 100644
--- a/src/third_party/wiredtiger/src/include/txn_inline.h
+++ b/src/third_party/wiredtiger/src/include/txn_inline.h
@@ -836,18 +836,21 @@ __wt_upd_alloc_tombstone(WT_SESSION_IMPL *session, WT_UPDATE **updp, size_t *siz
 }
 
 /*
- * __wt_txn_read_upd_list --
- *     Get the first visible update in a list (or NULL if none are visible).
+ * __wt_txn_read_upd_list_internal --
+ *     Internal helper function to get the first visible update in a list (or NULL if none are
+ *     visible).
  */
 static inline int
-__wt_txn_read_upd_list(
-  WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, WT_UPDATE **prepare_updp)
+__wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd,
+  WT_UPDATE **prepare_updp, WT_UPDATE **restored_updp)
 {
     WT_VISIBLE_TYPE upd_visible;
     uint8_t prepare_state, type;
 
     if (prepare_updp != NULL)
         *prepare_updp = NULL;
+    if (restored_updp != NULL)
+        *restored_updp = NULL;
     __wt_upd_value_clear(cbt->upd_value);
 
     for (; upd != NULL; upd = upd->next) {
@@ -888,6 +891,16 @@ __wt_txn_read_upd_list(
           F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS))
             *prepare_updp = upd;
 
+        /*
+         * Save the restored update to use it as base value update in case if we need to reach
+         * history store instead of on-disk value.
+         */
+        if (restored_updp != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_HS) &&
+          type == WT_UPDATE_STANDARD) {
+            WT_ASSERT(session, *restored_updp == NULL);
+            *restored_updp = upd;
+        }
+
         if (upd_visible == WT_VISIBLE_PREPARE) {
             /* Ignore the prepared update, if transaction configuration says so. */
             if (F_ISSET(session->txn, WT_TXN_IGNORE_PREPARE))
@@ -916,6 +929,16 @@ __wt_txn_read_upd_list(
 }
 
 /*
+ * __wt_txn_read_upd_list --
+ *     Get the first visible update in a list (or NULL if none are visible).
+ */
+static inline int
+__wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+    return __wt_txn_read_upd_list_internal(session, cbt, upd, NULL, NULL);
+}
+
+/*
  * __wt_txn_read --
  *     Get the first visible update in a chain. This function will first check the update list
  *     supplied as a function argument. If there is no visible update, it will check the onpage
@@ -927,14 +950,14 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint
   WT_UPDATE *upd, WT_CELL_UNPACK_KV *vpack)
 {
     WT_TIME_WINDOW tw;
-    WT_UPDATE *prepare_upd;
+    WT_UPDATE *prepare_upd, *restored_upd;
     bool have_stop_tw, retry;
 
-    prepare_upd = NULL;
+    prepare_upd = restored_upd = NULL;
     retry = true;
 
 retry:
-    WT_RET(__wt_txn_read_upd_list(session, cbt, upd, &prepare_upd));
+    WT_RET(__wt_txn_read_upd_list_internal(session, cbt, upd, &prepare_upd, &restored_upd));
     if (WT_UPDATE_DATA_VALUE(cbt->upd_value) ||
       (cbt->upd_value->type == WT_UPDATE_MODIFY && cbt->upd_value->skip_buf))
         return (0);
@@ -947,66 +970,77 @@ retry:
     }
 
     /*
-     * When we inspected the update list we may have seen a tombstone leaving us with a valid stop
-     * time window, we don't want to overwrite this stop time window.
+     * Skip retrieving the on-disk value when there exists a restored update from history store in
+     * the update list. Having a restored update as part of the update list indicates that the
+     * existing on-disk value is unstable.
      */
-    have_stop_tw = WT_TIME_WINDOW_HAS_STOP(&cbt->upd_value->tw);
-
-    /* Check the ondisk value. */
-    if (vpack == NULL) {
-        WT_TIME_WINDOW_INIT(&tw);
-        WT_RET(__wt_value_return_buf(cbt, cbt->ref, &cbt->upd_value->buf, &tw));
+    if (restored_upd != NULL) {
+        WT_ASSERT(session, !WT_IS_HS(session->dhandle));
+        cbt->upd_value->buf.data = restored_upd->data;
+        cbt->upd_value->buf.size = restored_upd->size;
     } else {
-        WT_TIME_WINDOW_COPY(&tw, &vpack->tw);
-        cbt->upd_value->buf.data = vpack->data;
-        cbt->upd_value->buf.size = vpack->size;
-    }
-
-    /*
-     * If the stop time point is set, that means that there is a tombstone at that time. If it is
-     * not prepared and it is visible to our txn it means we've just spotted a tombstone and should
-     * return "not found", except scanning the history store during rollback to stable and when we
-     * are told to ignore non-globally visible tombstones.
-     */
-    if (!have_stop_tw && __wt_txn_tw_stop_visible(session, &tw) &&
-      !F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE)) {
-        cbt->upd_value->buf.data = NULL;
-        cbt->upd_value->buf.size = 0;
-        cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
-        cbt->upd_value->tw.stop_ts = tw.stop_ts;
-        cbt->upd_value->tw.stop_txn = tw.stop_txn;
-        cbt->upd_value->tw.prepare = tw.prepare;
-        cbt->upd_value->type = WT_UPDATE_TOMBSTONE;
-        return (0);
-    }
-
-    /* Store the stop time pair of the history store record that is returning. */
-    if (!have_stop_tw && WT_TIME_WINDOW_HAS_STOP(&tw) && WT_IS_HS(session->dhandle)) {
-        cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
-        cbt->upd_value->tw.stop_ts = tw.stop_ts;
-        cbt->upd_value->tw.stop_txn = tw.stop_txn;
-        cbt->upd_value->tw.prepare = tw.prepare;
-    }
+        /*
+         * When we inspected the update list we may have seen a tombstone leaving us with a valid
+         * stop time window, we don't want to overwrite this stop time window.
+         */
+        have_stop_tw = WT_TIME_WINDOW_HAS_STOP(&cbt->upd_value->tw);
+
+        /* Check the ondisk value. */
+        if (vpack == NULL) {
+            WT_TIME_WINDOW_INIT(&tw);
+            WT_RET(__wt_value_return_buf(cbt, cbt->ref, &cbt->upd_value->buf, &tw));
+        } else {
+            WT_TIME_WINDOW_COPY(&tw, &vpack->tw);
+            cbt->upd_value->buf.data = vpack->data;
+            cbt->upd_value->buf.size = vpack->size;
+        }
 
-    /* If the start time point is visible then we need to return the ondisk value. */
-    if (WT_IS_HS(session->dhandle) || __wt_txn_tw_start_visible(session, &tw)) {
-        if (cbt->upd_value->skip_buf) {
+        /*
+         * If the stop time point is set, that means that there is a tombstone at that time. If it
+         * is not prepared and it is visible to our txn it means we've just spotted a tombstone and
+         * should return "not found", except scanning the history store during rollback to stable
+         * and when we are told to ignore non-globally visible tombstones.
+         */
+        if (!have_stop_tw && __wt_txn_tw_stop_visible(session, &tw) &&
+          !F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE)) {
             cbt->upd_value->buf.data = NULL;
             cbt->upd_value->buf.size = 0;
+            cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
+            cbt->upd_value->tw.stop_ts = tw.stop_ts;
+            cbt->upd_value->tw.stop_txn = tw.stop_txn;
+            cbt->upd_value->tw.prepare = tw.prepare;
+            cbt->upd_value->type = WT_UPDATE_TOMBSTONE;
+            return (0);
+        }
+
+        /* Store the stop time pair of the history store record that is returning. */
+        if (!have_stop_tw && WT_TIME_WINDOW_HAS_STOP(&tw) && WT_IS_HS(session->dhandle)) {
+            cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
+            cbt->upd_value->tw.stop_ts = tw.stop_ts;
+            cbt->upd_value->tw.stop_txn = tw.stop_txn;
+            cbt->upd_value->tw.prepare = tw.prepare;
+        }
+
+        /* If the start time point is visible then we need to return the ondisk value. */
+        if (WT_IS_HS(session->dhandle) || __wt_txn_tw_start_visible(session, &tw)) {
+            if (cbt->upd_value->skip_buf) {
+                cbt->upd_value->buf.data = NULL;
+                cbt->upd_value->buf.size = 0;
+            }
+            cbt->upd_value->tw.durable_start_ts = tw.durable_start_ts;
+            cbt->upd_value->tw.start_ts = tw.start_ts;
+            cbt->upd_value->tw.start_txn = tw.start_txn;
+            cbt->upd_value->tw.prepare = tw.prepare;
+            cbt->upd_value->type = WT_UPDATE_STANDARD;
+            return (0);
         }
-        cbt->upd_value->tw.durable_start_ts = tw.durable_start_ts;
-        cbt->upd_value->tw.start_ts = tw.start_ts;
-        cbt->upd_value->tw.start_txn = tw.start_txn;
-        cbt->upd_value->tw.prepare = tw.prepare;
-        cbt->upd_value->type = WT_UPDATE_STANDARD;
-        return (0);
     }
 
     /* If there's no visible update in the update chain or ondisk, check the history store file. */
     if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(session->dhandle, WT_DHANDLE_HS)) {
         __wt_timing_stress(session, WT_TIMING_STRESS_HS_SEARCH);
         WT_RET(__wt_hs_find_upd(session, key, cbt->iface.value_format, recno, cbt->upd_value, false,
-          &cbt->upd_value->buf, &tw));
+          &cbt->upd_value->buf));
     }
 
     /*
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index d59138e1bb0..8b760c6df42 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -5875,12 +5875,16 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
 #define	WT_STAT_CONN_TXN_RTS_KEYS_RESTORED		1468
 /*! transaction: rollback to stable restored tombstones from history store */
 #define	WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES	1469
+/*! transaction: rollback to stable restored updates from history store */
+#define	WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES		1470
 /*! transaction: rollback to stable sweeping history store keys */
-#define	WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS		1470
+#define	WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS		1471
 /*! transaction: rollback to stable updates removed from history store */
-#define	WT_STAT_CONN_TXN_RTS_HS_REMOVED			1471
+#define	WT_STAT_CONN_TXN_RTS_HS_REMOVED			1472
+/*! transaction: transaction checkpoints due to obsolete pages */
+#define	WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED	1473
 /*! transaction: update conflicts */
-#define	WT_STAT_CONN_TXN_UPDATE_CONFLICT		1472
+#define	WT_STAT_CONN_TXN_UPDATE_CONFLICT		1474
 
 /*!
  * @}
@@ -6484,12 +6488,16 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
 #define	WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED		2201
 /*! transaction: rollback to stable restored tombstones from history store */
 #define	WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES	2202
+/*! transaction: rollback to stable restored updates from history store */
+#define	WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES		2203
 /*! transaction: rollback to stable sweeping history store keys */
-#define	WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS		2203
+#define	WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS		2204
 /*! transaction: rollback to stable updates removed from history store */
-#define	WT_STAT_DSRC_TXN_RTS_HS_REMOVED			2204
+#define	WT_STAT_DSRC_TXN_RTS_HS_REMOVED			2205
+/*! transaction: transaction checkpoints due to obsolete pages */
+#define	WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED	2206
 /*! transaction: update conflicts */
-#define	WT_STAT_DSRC_TXN_UPDATE_CONFLICT		2205
+#define	WT_STAT_DSRC_TXN_UPDATE_CONFLICT		2207
 
 /*!
  * @}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
index 707ef28c086..1ca3f32f262 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -213,7 +213,7 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session)
         manager->lsm_worker_cookies[i].session = worker_session;
     }
 
-    F_SET(conn, WT_CONN_SERVER_LSM);
+    FLD_SET(conn->server_flags, WT_CONN_SERVER_LSM);
 
     /* Start the LSM manager thread. */
     WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid, __lsm_worker_manager,
@@ -269,7 +269,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
     removed = 0;
 
     /* Clear the LSM server flag. */
-    F_CLR(conn, WT_CONN_SERVER_LSM);
+    FLD_CLR(conn->server_flags, WT_CONN_SERVER_LSM);
 
     WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || manager->lsm_workers == 0);
     if (manager->lsm_workers > 0) {
@@ -351,7 +351,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
     conn = S2C(session);
     dhandle_locked = false;
 
-    while (F_ISSET(conn, WT_CONN_SERVER_LSM)) {
+    while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LSM)) {
         __wt_sleep(0, 10000);
         if (TAILQ_EMPTY(&conn->lsmqh))
             continue;
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 7ebdf736c2f..9581144489a 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -1760,7 +1760,7 @@ __transaction_sync_run_chk(WT_SESSION_IMPL *session)
 
     conn = S2C(session);
 
-    return (FLD_ISSET(conn->flags, WT_CONN_SERVER_LOG));
+    return (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG));
 }
 
 /*
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 1ee255be706..100fa7b3ed8 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -210,8 +210,10 @@ static const char *const __stats_dsrc_desc[] = {
   "transaction: rollback to stable keys removed",
   "transaction: rollback to stable keys restored",
   "transaction: rollback to stable restored tombstones from history store",
+  "transaction: rollback to stable restored updates from history store",
   "transaction: rollback to stable sweeping history store keys",
   "transaction: rollback to stable updates removed from history store",
+  "transaction: transaction checkpoints due to obsolete pages",
   "transaction: update conflicts",
 };
 
@@ -456,8 +458,10 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
     stats->txn_rts_keys_removed = 0;
     stats->txn_rts_keys_restored = 0;
     stats->txn_rts_hs_restore_tombstones = 0;
+    stats->txn_rts_hs_restore_updates = 0;
     stats->txn_rts_sweep_hs_keys = 0;
     stats->txn_rts_hs_removed = 0;
+    stats->txn_checkpoint_obsolete_applied = 0;
     stats->txn_update_conflict = 0;
 }
 
@@ -689,8 +693,10 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to)
     to->txn_rts_keys_removed += from->txn_rts_keys_removed;
     to->txn_rts_keys_restored += from->txn_rts_keys_restored;
     to->txn_rts_hs_restore_tombstones += from->txn_rts_hs_restore_tombstones;
+    to->txn_rts_hs_restore_updates += from->txn_rts_hs_restore_updates;
     to->txn_rts_sweep_hs_keys += from->txn_rts_sweep_hs_keys;
     to->txn_rts_hs_removed += from->txn_rts_hs_removed;
+    to->txn_checkpoint_obsolete_applied += from->txn_checkpoint_obsolete_applied;
     to->txn_update_conflict += from->txn_update_conflict;
 }
 
@@ -928,8 +934,10 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to)
     to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
     to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
     to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones);
+    to->txn_rts_hs_restore_updates += WT_STAT_READ(from, txn_rts_hs_restore_updates);
     to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys);
     to->txn_rts_hs_removed += WT_STAT_READ(from, txn_rts_hs_removed);
+    to->txn_checkpoint_obsolete_applied += WT_STAT_READ(from, txn_checkpoint_obsolete_applied);
     to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict);
 }
 
@@ -1416,8 +1424,10 @@ static const char *const __stats_connection_desc[] = {
   "transaction: rollback to stable keys removed",
   "transaction: rollback to stable keys restored",
   "transaction: rollback to stable restored tombstones from history store",
+  "transaction: rollback to stable restored updates from history store",
   "transaction: rollback to stable sweeping history store keys",
   "transaction: rollback to stable updates removed from history store",
+  "transaction: transaction checkpoints due to obsolete pages",
   "transaction: update conflicts",
 };
 
@@ -1929,8 +1939,10 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
     stats->txn_rts_keys_removed = 0;
     stats->txn_rts_keys_restored = 0;
     stats->txn_rts_hs_restore_tombstones = 0;
+    stats->txn_rts_hs_restore_updates = 0;
     stats->txn_rts_sweep_hs_keys = 0;
     stats->txn_rts_hs_removed = 0;
+    stats->txn_checkpoint_obsolete_applied = 0;
     stats->txn_update_conflict = 0;
 }
 
@@ -2453,8 +2465,10 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
     to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
     to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
     to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones);
+    to->txn_rts_hs_restore_updates += WT_STAT_READ(from, txn_rts_hs_restore_updates);
     to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys);
     to->txn_rts_hs_removed += WT_STAT_READ(from, txn_rts_hs_removed);
+    to->txn_checkpoint_obsolete_applied += WT_STAT_READ(from, txn_checkpoint_obsolete_applied);
     to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict);
 }
 
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
index 26c750fb496..1694e57dbc3 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
@@ -34,8 +34,7 @@ __curtiered_open_cursors(WT_CURSOR_TIERED *curtiered)
     dhandle = NULL;
     tiered = curtiered->tiered;
 
-    if (tiered->ntiers == 0)
-        return (0);
+    WT_ASSERT(session, tiered->ntiers > 0);
 
     /*
      * If the key is pointing to memory that is pinned by a chunk cursor, take a copy before closing
@@ -1017,21 +1016,14 @@ err:
  *     documents avoids biasing towards small chunks. Then return the cursor on the chunk we have
  *     picked.
  */
-static int
+static void
 __curtiered_random_chunk(WT_SESSION_IMPL *session, WT_CURSOR_TIERED *curtiered, WT_CURSOR **cursor)
 {
-    u_int i, ntiers;
-
-    /*
-     * If the tree is empty we cannot do a random lookup, so return a WT_NOTFOUND.
-     */
-    if ((ntiers = curtiered->tiered->ntiers) == 0)
-        return (WT_NOTFOUND);
+    u_int i;
 
     /* TODO: make randomness respect tree size. */
-    i = __wt_random(&session->rnd) % ntiers;
+    i = __wt_random(&session->rnd) % curtiered->tiered->ntiers;
     *cursor = curtiered->cursors[i];
-    return (0);
 }
 
 /*
@@ -1055,7 +1047,7 @@ __curtiered_next_random(WT_CURSOR *cursor)
     WT_ERR(__curtiered_enter(curtiered, false));
 
     for (;;) {
-        WT_ERR(__curtiered_random_chunk(session, curtiered, &c));
+        __curtiered_random_chunk(session, curtiered, &c);
         /*
          * This call to next_random on the chunk can potentially end in WT_NOTFOUND if the chunk we
          * picked is empty. We want to retry in that case.
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_schema.c b/src/third_party/wiredtiger/src/tiered/tiered_schema.c
index dc153b31e43..6e7dd84c0e3 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_schema.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_schema.c
@@ -15,12 +15,16 @@
 int
 __wt_tiered_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config)
 {
+    WT_CONFIG cparser;
+    WT_CONFIG_ITEM ckey, cval, tierconf;
     WT_DECL_RET;
+    int ntiers;
     char *meta_value;
     const char *cfg[] = {WT_CONFIG_BASE(session, tiered_meta), config, NULL};
     const char *metadata;
 
     metadata = NULL;
+    ntiers = 0;
 
     /* If it can be opened, it already exists. */
     if ((ret = __wt_metadata_search(session, uri, &meta_value)) != WT_NOTFOUND) {
@@ -30,12 +34,24 @@ __wt_tiered_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, co
     }
     WT_RET_NOTFOUND_OK(ret);
 
+    /* A tiered cursor must specify at least one underlying table */
+    WT_RET(__wt_config_gets(session, cfg, "tiered.tiers", &tierconf));
+    __wt_config_subinit(session, &cparser, &tierconf);
+
+    while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+        ++ntiers;
+    WT_RET_NOTFOUND_OK(ret);
+
+    if (ntiers == 0)
+        WT_RET_MSG(session, EINVAL, "tiered table must specify at least one tier");
+
     if (!F_ISSET(S2C(session), WT_CONN_READONLY)) {
         WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata));
         WT_ERR(__wt_metadata_insert(session, uri, metadata));
     }
 
 err:
+    __wt_free(session, meta_value);
     __wt_free(session, metadata);
     return (ret);
 }
@@ -188,14 +204,14 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[])
     /* Point to some items in the copy to save re-parsing. */
     WT_RET(__wt_config_gets(session, tiered_cfg, "tiered.tiers", &tierconf));
 
-    /*
-     * Count the number of tiers.
-     */
+    /* Count the number of tiers. */
     __wt_config_subinit(session, &cparser, &tierconf);
     while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
         ++tiered->ntiers;
     WT_RET_NOTFOUND_OK(ret);
 
+    WT_ASSERT(session, tiered->ntiers > 0);
+
     WT_RET(__wt_scr_alloc(session, 0, &buf));
     WT_ERR(__wt_calloc_def(session, tiered->ntiers, &tiered->tiers));
 
@@ -204,7 +220,7 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[])
         WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
         WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)ckey.len, ckey.str));
         WT_ERR(__wt_session_get_dhandle(session, (const char *)buf->data, NULL, cfg, 0));
-        __wt_atomic_addi32(&session->dhandle->session_inuse, 1);
+        (void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1);
         /* Load in reverse order (based on LSM logic). */
         tiered->tiers[(tiered->ntiers - 1) - i] = session->dhandle;
         WT_ERR(__wt_session_release_dhandle(session));
@@ -247,7 +263,7 @@ __wt_tiered_close(WT_SESSION_IMPL *session, WT_TIERED *tiered)
     __wt_free(session, tiered->value_format);
     if (tiered->tiers != NULL) {
         for (i = 0; i < tiered->ntiers; i++)
-            __wt_atomic_subi32(&tiered->tiers[i]->session_inuse, 1);
+            (void)__wt_atomic_subi32(&tiered->tiers[i]->session_inuse, 1);
         __wt_free(session, tiered->tiers);
     }
 
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index d9a15ed067c..887abaa503d 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -1415,14 +1415,17 @@ __checkpoint_lock_dirty_tree(
             if (now > btree->clean_ckpt_timer)
                 skip_ckpt = false;
         }
-        if (skip_ckpt) {
+
+        /* Skip the clean btree until the btree has obsolete pages. */
+        if (skip_ckpt && !F_ISSET(btree, WT_BTREE_OBSOLETE_PAGES)) {
             F_SET(btree, WT_BTREE_SKIP_CKPT);
             goto skip;
         }
     }
 
-    /* If we have to process this btree for any reason, reset the timer. */
+    /* If we have to process this btree for any reason, reset the timer and obsolete pages flag. */
     WT_BTREE_CLEAN_CKPT(session, btree, 0);
+    F_CLR(btree, WT_BTREE_OBSOLETE_PAGES);
 
     /* Get the list of checkpoints for this file. */
     WT_ERR(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase));
@@ -1486,6 +1489,35 @@ skip:
 }
 
 /*
+ * __checkpoint_apply_obsolete --
+ *     Returns true if the checkpoint is obsolete.
+ */
+static bool
+__checkpoint_apply_obsolete(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_CKPT *ckpt)
+{
+    wt_timestamp_t stop_ts;
+
+    stop_ts = WT_TS_MAX;
+    if (ckpt->size != 0) {
+        /*
+         * If the checkpoint has a valid stop timestamp, mark the btree as having obsolete pages.
+         * This flag is used to avoid skipping the btree until the obsolete check is performed on
+         * the checkpoints.
+         */
+        if (ckpt->ta.newest_stop_ts != WT_TS_MAX) {
+            F_SET(btree, WT_BTREE_OBSOLETE_PAGES);
+            stop_ts = ckpt->ta.newest_stop_durable_ts;
+        }
+        if (__wt_txn_visible_all(session, ckpt->ta.newest_stop_txn, stop_ts)) {
+            WT_STAT_CONN_DATA_INCR(session, txn_checkpoint_obsolete_applied);
+            return (true);
+        }
+    }
+
+    return (false);
+}
+
+/*
  * __checkpoint_mark_skip --
  *     Figure out whether the checkpoint can be skipped for a tree.
  */
@@ -1523,9 +1555,17 @@ __checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force)
     F_CLR(btree, WT_BTREE_SKIP_CKPT);
     if (!btree->modified && !force) {
         deleted = 0;
-        WT_CKPT_FOREACH (ckptbase, ckpt)
+        WT_CKPT_FOREACH (ckptbase, ckpt) {
+            /*
+             * Don't skip the objects that have obsolete pages to let them to be removed as part of
+             * checkpoint cleanup.
+             */
+            if (__checkpoint_apply_obsolete(session, btree, ckpt))
+                return (0);
+
             if (F_ISSET(ckpt, WT_CKPT_DELETE))
                 ++deleted;
+        }
 
         /*
          * Complicated test: if the tree is clean and last two checkpoints have the same name
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index b30cf03be69..9ad6b7abd6d 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -251,12 +251,22 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
         WT_ERR(hs_cursor->get_value(
           hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value));
         type = (uint8_t)type_full;
-        if (type == WT_UPDATE_MODIFY)
-            WT_ERR(__wt_modify_apply_item(
-              session, S2BT(session)->value_format, &full_value, hs_value->data));
-        else {
-            WT_ASSERT(session, type == WT_UPDATE_STANDARD);
-            WT_ERR(__wt_buf_set(session, &full_value, hs_value->data, hs_value->size));
+
+        /*
+         * Do not include history store updates greater than on-disk data store version to construct
+         * a full update to restore. Comparing with timestamps here has no problem unlike in search
+         * flow where the timestamps may be reset during reconciliation. RTS detects an on-disk
+         * update is unstable based on the written proper timestamp, so comparing against it with
+         * history store shouldn't have any problem.
+         */
+        if (hs_start_ts <= unpack->tw.start_ts) {
+            if (type == WT_UPDATE_MODIFY)
+                WT_ERR(__wt_modify_apply_item(
+                  session, S2BT(session)->value_format, &full_value, hs_value->data));
+            else {
+                WT_ASSERT(session, type == WT_UPDATE_STANDARD);
+                WT_ERR(__wt_buf_set(session, &full_value, hs_value->data, hs_value->size));
+            }
         }
 
         /*
@@ -280,9 +290,10 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
          */
         if (!replace && hs_stop_durable_ts <= rollback_timestamp) {
             __wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
-              "history store update valid with stop timestamp: %s and stable timestamp: %s",
+              "history store update valid with stop timestamp: %s, stable timestamp: %s and type: "
+              "%" PRIu8,
               __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]),
-              __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
+              __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), type);
             break;
         }
 
@@ -290,22 +301,23 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
         if (hs_durable_ts <= rollback_timestamp) {
             __wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
               "history store update valid with start timestamp: %s, durable timestamp: %s, stop "
-              "timestamp: %s and stable timestamp: %s",
+              "timestamp: %s, stable timestamp: %s and type: %" PRIu8,
               __wt_timestamp_to_string(hs_start_ts, ts_string[0]),
               __wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
               __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
-              __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
+              __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type);
+            WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts);
             valid_update_found = true;
             break;
         }
 
         __wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
           "history store update aborted with start timestamp: %s, durable timestamp: %s, stop "
-          "timestamp: %s and stable timestamp: %s",
+          "timestamp: %s, stable timestamp: %s and type: %" PRIu8,
           __wt_timestamp_to_string(hs_start_ts, ts_string[0]),
           __wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
           __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
-          __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
+          __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type);
 
         /*
          * Start time point of the current record may be used as stop time point of the previous
@@ -329,6 +341,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
          * list. Otherwise remove the key by adding a tombstone.
          */
         if (valid_update_found) {
+            WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts);
             WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL));
 
             upd->txnid = cbt->upd_value->tw.start_txn;
@@ -336,7 +349,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
             upd->start_ts = cbt->upd_value->tw.start_ts;
             __wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
               "update restored from history store (txnid: %" PRIu64
-              ", start_ts: %s, durable_ts: %s",
+              ", start_ts: %s and durable_ts: %s",
               upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
               __wt_timestamp_to_string(upd->durable_ts, ts_string[1]));
 
@@ -345,6 +358,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
              * the rollback to stable operation.
              */
             F_SET(upd, WT_UPDATE_RESTORED_FROM_HS);
+            WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_updates);
 
             /*
              * We have a tombstone on the original update chain and it is behind the stable
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
index 13a3577745f..1c9cae21bbf 100644
--- a/src/third_party/wiredtiger/src/utilities/util_list.c
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -88,6 +88,12 @@ list_init_block(WT_SESSION *session, const char *key, WT_BLOCK *block)
     wt_api = session->connection->get_extension_api(session->connection);
     if ((ret = wt_api->metadata_search(wt_api, session, key, &config)) != 0)
         WT_ERR(util_err(session, ret, "%s: WT_EXTENSION_API.metadata_search", key));
+    /*
+     * The config variable should be set and not NULL, but Coverity is convinced otherwise. This is
+     * an infrequent code path. Just add this extra conditional to make it happy.
+     */
+    if (config == NULL)
+        goto err;
     if ((ret = wt_api->config_parser_open(wt_api, session, config, strlen(config), &parser)) != 0)
         WT_ERR(util_err(session, ret, "WT_EXTENSION_API.config_parser_open"));
     if ((ret = parser->get(parser, "allocation_size", &cval)) == 0)
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 13d91b793e3..f244df90452 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -31,6 +31,7 @@
 
 static void config_backup_incr(void);
 static void config_backup_incr_granularity(void);
+static void config_backup_incr_log_compatibility_check(void);
 static void config_backward_compatible(void);
 static void config_cache(void);
 static void config_checkpoint(void);
@@ -267,12 +268,8 @@ config_backup_incr(void)
      * archival doesn't seem as useful as testing backup, let the backup configuration override.
      */
     if (config_is_perm("backup.incremental")) {
-        if (g.c_backup_incr_flag == INCREMENTAL_LOG) {
-            if (g.c_logging_archive && config_is_perm("logging.archive"))
-                testutil_die(EINVAL, "backup.incremental=log is incompatible with logging.archive");
-            if (g.c_logging_archive)
-                config_single("logging.archive=0", false);
-        }
+        if (g.c_backup_incr_flag == INCREMENTAL_LOG)
+            config_backup_incr_log_compatibility_check();
         if (g.c_backup_incr_flag == INCREMENTAL_BLOCK)
             config_backup_incr_granularity();
         return;
@@ -761,6 +758,23 @@ config_in_memory_reset(void)
 }
 
 /*
+ * config_backup_incr_compatibility_check --
+ *     Backup incremental log compatibility check.
+ */
+static void
+config_backup_incr_log_compatibility_check(void)
+{
+    /*
+     * Incremental backup using log files is incompatible with logging archival. Disable logging
+     * archival if log incremental backup is set.
+     */
+    if (g.c_logging_archive && config_is_perm("logging.archive"))
+        testutil_die(EINVAL, "backup.incremental=log is incompatible with logging.archive");
+    if (g.c_logging_archive)
+        config_single("logging.archive=0", false);
+}
+
+/*
  * config_lsm_reset --
  *     LSM configuration review.
  */
@@ -801,6 +815,7 @@ config_lsm_reset(void)
             case 2:
                 /* 50% */
                 config_single("backup.incremental=log", false);
+                config_backup_incr_log_compatibility_check();
                 break;
             }
     }
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint08.py b/src/third_party/wiredtiger/test/suite/test_checkpoint08.py
new file mode 100755
index 00000000000..047b36dcd50
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint08.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_checkpoint08.py
+# Test that the btree checkpoint is not skipped if there are obsolete pages.
+
+import wiredtiger, wttest
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+
+def timestamp_str(t):
+    return '%x' % t
+
+class test_checkpoint08(wttest.WiredTigerTestCase):
+    conn_config = 'cache_size=50MB,log=(enabled),statistics=(all)'
+    session_config = 'isolation=snapshot'
+
+    def get_stat(self, uri):
+        stat_uri = 'statistics:' + uri
+        stat_cursor = self.session.open_cursor(stat_uri)
+        val = stat_cursor[stat.dsrc.btree_clean_checkpoint_timer][2]
+        stat_cursor.close()
+        return val
+
+    def test_checkpoint08(self):
+        self.uri1 = 'table:ckpt08.1'
+        self.file1 = 'file:ckpt08.1.wt'
+        self.uri2 = 'table:ckpt08.2'
+        self.file2 = 'file:ckpt08.2.wt'
+        self.hsfile = 'file:WiredTigerHS.wt'
+        self.session.create(self.uri1, 'key_format=i,value_format=i')
+        self.session.create(self.uri2, 'key_format=i,value_format=i')
+
+        # Pin oldest and stable to timestamp 1.
+        self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1) +
+            ',stable_timestamp=' + timestamp_str(1))
+
+        # Setup: Insert some data and checkpoint it. Then modify only
+        # the data in the first table and checkpoint. Verify the clean skip
+        # timer is not set for the modified table and is set for the clean one.
+        c1 = self.session.open_cursor(self.uri1, None)
+        c2 = self.session.open_cursor(self.uri2, None)
+
+        self.session.begin_transaction()
+        c1[1] = 1
+        c2[1] = 1
+        self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
+
+        self.session.begin_transaction()
+        c1[1] = 10
+        c2[1] = 10
+        self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
+
+        self.conn.set_timestamp('stable_timestamp=' + timestamp_str(3))
+        self.session.checkpoint(None)
+
+        # Modify the both tables and reverify.
+        self.session.begin_transaction()
+        c1[3] = 3
+        c2[3] = 3
+        self.session.commit_transaction('commit_timestamp=' + timestamp_str(4))
+
+        self.conn.set_timestamp('stable_timestamp=' + timestamp_str(4))
+        self.session.checkpoint(None)
+
+        val = self.get_stat(self.uri1)
+        self.assertEqual(val, 0)
+        val = self.get_stat(self.uri2)
+        self.assertEqual(val, 0)
+        hsval = self.get_stat(self.hsfile)
+        self.assertNotEqual(hsval, 0)
+
+        # Modify the both tables and reverify when oldest timestamp moved.
+        self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(4))
+        self.session.begin_transaction()
+        c1[4] = 4
+        c2[4] = 4
+        self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
+
+        self.conn.set_timestamp('stable_timestamp=' + timestamp_str(5))
+        self.session.checkpoint(None)
+
+        val = self.get_stat(self.uri1)
+        self.assertEqual(val, 0)
+        val = self.get_stat(self.uri2)
+        self.assertEqual(val, 0)
+        hsval = self.get_stat(self.hsfile)
+        self.assertEqual(hsval, 0)
+
+        stat_cursor = self.session.open_cursor('statistics:file:WiredTigerHS.wt', None, None)
+        obsolete_applied = stat_cursor[stat.dsrc.txn_checkpoint_obsolete_applied][2]
+        self.assertEqual(obsolete_applied, 1)
+        stat_cursor.close()
+
+        c1.close()
+        c2.close()
+        self.session.close()
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_hs05.py b/src/third_party/wiredtiger/test/suite/test_hs05.py
index f2d93a40547..745d22d5480 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs05.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs05.py
@@ -30,6 +30,7 @@ from helper import copy_wiredtiger_home
 import wiredtiger, wttest
 from wiredtiger import stat
 from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
 
 def timestamp_str(t):
     return '%x' % t
@@ -44,6 +45,12 @@ class test_hs05(wttest.WiredTigerTestCase):
     conn_config += 'eviction_updates_target=100,eviction_updates_trigger=100'
     session_config = 'isolation=snapshot'
     stable = 1
+    key_format_values = [
+        ('column', dict(key_format='r')),
+        ('integer', dict(key_format='i')),
+        ('string', dict(key_format='S'))
+    ]
+    scenarios = make_scenarios(key_format_values)
 
     def get_stat(self, stat):
         stat_cursor = self.session.open_cursor('statistics:')
@@ -71,7 +78,7 @@ class test_hs05(wttest.WiredTigerTestCase):
         # Create a small table.
         uri = "table:test_hs05"
         nrows = 100
-        ds = SimpleDataSet(self, uri, nrows, key_format="S", value_format='u')
+        ds = SimpleDataSet(self, uri, nrows, key_format=self.key_format, value_format='u')
         ds.populate()
         bigvalue = b"aaaaa" * 100
 
diff --git a/src/third_party/wiredtiger/test/suite/test_hs06.py b/src/third_party/wiredtiger/test/suite/test_hs06.py
index 967f1519807..cc38097da7f 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs06.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs06.py
@@ -47,8 +47,7 @@ class test_hs06(wttest.WiredTigerTestCase):
     conn_config = 'cache_size=50MB,statistics=(fast)'
     session_config = 'isolation=snapshot'
     key_format_values = [
-        # The commented columnar tests needs to be enabled once columnar page instantiated is fixed in (WT-6061).
-        # ('column', dict(key_format='r')),
+        ('column', dict(key_format='r')),
         ('integer', dict(key_format='i')),
         ('string', dict(key_format='S'))
     ]
@@ -210,6 +209,11 @@ class test_hs06(wttest.WiredTigerTestCase):
         self.session.rollback_transaction()
 
     def test_hs_prepare_reads(self):
+        # Prepare reads currently not supported with columnar store.
+        # Remove this once prepare reads is supported in WT-6061.
+        if self.key_format == 'r':
+            return
+
         # Create a small table.
         uri = "table:test_hs06"
         create_params = 'key_format={},value_format=S'.format(self.key_format)
diff --git a/src/third_party/wiredtiger/test/suite/test_hs07.py b/src/third_party/wiredtiger/test/suite/test_hs07.py
index 14ad15c3281..37b451f3b79 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs07.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs07.py
@@ -30,6 +30,7 @@ import time
 from helper import copy_wiredtiger_home
 import wiredtiger, wttest
 from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
 
 def timestamp_str(t):
     return '%x' % t
@@ -42,6 +43,12 @@ class test_hs07(wttest.WiredTigerTestCase):
                    'eviction_updates_target=80,log=(enabled)')
     session_config = 'isolation=snapshot'
 
+    key_format_values = (
+        ('column', dict(key_format='r')),
+        ('int', dict(key_format='i'))
+    )
+    scenarios = make_scenarios(key_format_values)
+
     def large_updates(self, uri, value, ds, nrows, commit_ts):
         # Update a large number of records, we'll hang if the history store table isn't working.
         session = self.session
@@ -70,11 +77,11 @@ class test_hs07(wttest.WiredTigerTestCase):
         # behavior.
         uri = "table:las07_main"
         ds = SimpleDataSet(
-            self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)')
+            self, uri, 0, key_format=self.key_format, value_format="S", config='log=(enabled=false)')
         ds.populate()
 
         uri2 = "table:las07_extra"
-        ds2 = SimpleDataSet(self, uri2, 0, key_format="i", value_format="S")
+        ds2 = SimpleDataSet(self, uri2, 0, key_format=self.key_format, value_format="S")
         ds2.populate()
 
         # Pin oldest and stable to timestamp 1.
diff --git a/src/third_party/wiredtiger/test/suite/test_hs08.py b/src/third_party/wiredtiger/test/suite/test_hs08.py
index 9899842d529..9388121cdad 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs08.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs08.py
@@ -38,6 +38,11 @@ def timestamp_str(t):
 class test_hs08(wttest.WiredTigerTestCase):
     conn_config = 'cache_size=100MB,statistics=(all)'
     session_config = 'isolation=snapshot'
+    key_format_values = [
+        ('column', dict(key_format='r')),
+        ('integer', dict(key_format='i')),
+    ]
+    scenarios = make_scenarios(key_format_values)
 
     def get_stat(self, stat):
         stat_cursor = self.session.open_cursor('statistics:')
@@ -47,7 +52,7 @@ class test_hs08(wttest.WiredTigerTestCase):
 
     def test_modify_insert_to_hs(self):
         uri = "table:test_hs08"
-        create_params = 'value_format=S,key_format=i'
+        create_params = 'value_format=S,key_format={}'.format(self.key_format)
         value1 = 'a' * 1000
         self.session.create(uri, create_params)
 
diff --git a/src/third_party/wiredtiger/test/suite/test_hs09.py b/src/third_party/wiredtiger/test/suite/test_hs09.py
index ac34e3f7b17..4bed2791808 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs09.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs09.py
@@ -38,12 +38,11 @@ def timestamp_str(t):
 # second newest committed version to history store.
 class test_hs09(wttest.WiredTigerTestCase):
     # Force a small cache.
-    conn_config = 'cache_size=50MB,statistics=(fast)'
+    conn_config = 'cache_size=20MB,statistics=(fast)'
     session_config = 'isolation=snapshot'
     uri = "table:test_hs09"
     key_format_values = [
-        # The commented columnar tests needs to be enabled once columnar page instantiated is fixed in (WT-6061).
-        #('column', dict(key_format='r')),
+        ('column', dict(key_format='r')),
         ('integer', dict(key_format='i')),
         ('string', dict(key_format='S')),
     ]
@@ -54,20 +53,31 @@ class test_hs09(wttest.WiredTigerTestCase):
             return str(i)
         return i
 
-    def check_ckpt_hs(self, expected_data_value, expected_hs_value, expected_hs_start_ts, expected_hs_stop_ts):
+    def check_ckpt_hs(self, expected_data_value, expected_hs_value, expected_hs_start_ts,
+                      expected_hs_stop_ts, expect_prepared_in_datastore = False):
         session = self.conn.open_session(self.session_config)
         session.checkpoint()
-        # Check the data file value
+        # Check the data file value.
         cursor = session.open_cursor(self.uri, None, 'checkpoint=WiredTigerCheckpoint')
+
+        # If we are expecting prepapred updates in the datastore, start an explicit transaction with
+        # ignore prepare flag to avoid getting a WT_PREPARE_CONFLICT error.
+        if expect_prepared_in_datastore:
+            session.begin_transaction("ignore_prepare=true")
+
         for _, value in cursor:
             self.assertEqual(value, expected_data_value)
+
+        if expect_prepared_in_datastore:
+            session.rollback_transaction()
+
         cursor.close()
-        # Check the history store file value
+        # Check the history store file value.
         cursor = session.open_cursor("file:WiredTigerHS.wt", None, 'checkpoint=WiredTigerCheckpoint')
         for _, _, hs_start_ts, _, hs_stop_ts, _, type, value in cursor:
-            # No WT_UPDATE_TOMBSTONE in the history store
+            # No WT_UPDATE_TOMBSTONE in the history store.
             self.assertNotEqual(type, 5)
-            # No WT_UPDATE_BIRTHMARK in the history store
+            # No WT_UPDATE_BIRTHMARK in the history store.
             self.assertNotEqual(type, 1)
             # WT_UPDATE_STANDARD
             if (type == 4):
@@ -100,7 +110,7 @@ class test_hs09(wttest.WiredTigerTestCase):
             cursor[self.create_key(i)] = value2
         self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
 
-        # Uncommitted changes
+        # Uncommitted changes.
         self.session.begin_transaction()
         for i in range(1, 11):
             cursor[self.create_key(i)] = value3
@@ -108,6 +118,11 @@ class test_hs09(wttest.WiredTigerTestCase):
         self.check_ckpt_hs(value2, value1, 2, 3)
 
     def test_prepared_updates_not_written_to_hs(self):
+        # Prepare reads currently not supported with columnar store.
+        # Remove this once prepare reads is supported in WT-6061.
+        if self.key_format == 'r':
+            return
+
         # Create a small table.
         create_params = 'key_format={},value_format=S'.format(self.key_format)
         self.session.create(self.uri, create_params)
@@ -130,13 +145,15 @@ class test_hs09(wttest.WiredTigerTestCase):
             cursor[self.create_key(i)] = value2
         self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
 
-        # Prepare some updates
+        # Prepare some updates.
         self.session.begin_transaction()
         for i in range(1, 11):
             cursor[self.create_key(i)] = value3
         self.session.prepare_transaction('prepare_timestamp=' + timestamp_str(4))
 
-        self.check_ckpt_hs(value2, value1, 2, 3)
+        # We can expect prepared values to show up in data store if the eviction runs between now
+        # and the time when we open a cursor on the user table.
+        self.check_ckpt_hs(value2, value1, 2, 3, True)
         self.session.commit_transaction('commit_timestamp=' + timestamp_str(5) +
             ',durable_timestamp=' + timestamp_str(5))
 
diff --git a/src/third_party/wiredtiger/test/suite/test_hs10.py b/src/third_party/wiredtiger/test/suite/test_hs10.py
index f41f18bb999..ef92195daf9 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs10.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs10.py
@@ -38,6 +38,11 @@ def timestamp_str(t):
 class test_hs10(wttest.WiredTigerTestCase):
     conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)'
     session_config = 'isolation=snapshot'
+    key_format_values = (
+        ('column', dict(key_format='r')),
+        ('int', dict(key_format='i'))
+    )
+    scenarios = make_scenarios(key_format_values)
 
     def get_stat(self, stat):
         stat_cursor = self.session.open_cursor('statistics:')
@@ -48,7 +53,7 @@ class test_hs10(wttest.WiredTigerTestCase):
     def test_modify_insert_to_hs(self):
         uri = "table:test_hs10"
         uri2 = "table:test_hs10_otherdata"
-        create_params = 'value_format=S,key_format=i'
+        create_params = 'value_format=S,key_format={}'.format(self.key_format)
         value1 = 'a' * 1000
         value2 = 'b' * 1000
         self.session.create(uri, create_params)
diff --git a/src/third_party/wiredtiger/test/suite/test_hs12.py b/src/third_party/wiredtiger/test/suite/test_hs12.py
index ed332dc3349..a4e9199323c 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs12.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs12.py
@@ -27,6 +27,7 @@
 # OTHER DEALINGS IN THE SOFTWARE.
 
 import wiredtiger, wttest, time
+from wtscenario import make_scenarios
 
 def timestamp_str(t):
     return '%x' % t
@@ -36,10 +37,17 @@ def timestamp_str(t):
 class test_hs12(wttest.WiredTigerTestCase):
     conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)'
     session_config = 'isolation=snapshot'
+    key_format_values = [
+        # The commented columnar tests needs to be enabled once columnar
+        # Modify type update is fixed in (WT-5550).
+        # ('column', dict(key_format='r')),
+        ('integer', dict(key_format='i')),
+    ]
+    scenarios = make_scenarios(key_format_values)
 
     def test_modify_append_to_string(self):
         uri = "table:test_reverse_modify01_notimestamp"
-        create_params = 'value_format=S,key_format=i'
+        create_params = 'value_format=S,key_format={}'.format(self.key_format)
         value1 = 'abcedfghijklmnopqrstuvwxyz' * 5
         value2 = 'b' * 100
         valuebig = 'e' * 1000
diff --git a/src/third_party/wiredtiger/test/suite/test_hs13.py b/src/third_party/wiredtiger/test/suite/test_hs13.py
index 7ada5d7a0b6..27c6c6d4ade 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs13.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs13.py
@@ -27,6 +27,8 @@
 # OTHER DEALINGS IN THE SOFTWARE.
 
 import wiredtiger, wttest
+from wtscenario import make_scenarios
+
 def timestamp_str(t):
     return '%x' % t
 
@@ -35,10 +37,16 @@ def timestamp_str(t):
 class test_hs13(wttest.WiredTigerTestCase):
     conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)'
     session_config = 'isolation=snapshot'
+    key_format_values = [
+        # The commented columnar tests needs to be enabled once columnar Modify type update is fixed in (WT-5550).
+        # ('column', dict(key_format='r')),
+        ('integer', dict(key_format='i'))
+    ]
+    scenarios = make_scenarios(key_format_values)
 
     def test_reverse_modifies_constructed_after_eviction(self):
         uri = "table:test_hs13"
-        create_params = 'value_format=S,key_format=i'
+        create_params = 'value_format=S,key_format={}'.format(self.key_format)
         value1 = 'a' * 10000
         value2 = 'b' * 10000
         value3 = 'e' * 10000
diff --git a/src/third_party/wiredtiger/test/suite/test_hs14.py b/src/third_party/wiredtiger/test/suite/test_hs14.py
index ebd5f471f2b..4e8f5148da1 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs14.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs14.py
@@ -27,6 +27,7 @@
 # OTHER DEALINGS IN THE SOFTWARE.
 
 import time, wiredtiger, wttest
+from wtscenario import make_scenarios
 
 def timestamp_str(t):
     return '%x' % t
@@ -37,10 +38,20 @@ def timestamp_str(t):
 class test_hs14(wttest.WiredTigerTestCase):
     conn_config = 'cache_size=50MB'
     session_config = 'isolation=snapshot'
+    key_format_values = [
+        ('column', dict(key_format='r')),
+        ('string', dict(key_format='S'))
+    ]
+    scenarios = make_scenarios(key_format_values)
+
+    def create_key(self, i):
+        if self.key_format == 'S':
+            return str(i)
+        return i
 
     def test_hs14(self):
         uri = 'table:test_hs14'
-        self.session.create(uri, 'key_format=S,value_format=S')
+        self.session.create(uri, 'key_format={},value_format=S'.format(self.key_format))
         self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1))
         cursor = self.session.open_cursor(uri)
 
@@ -52,22 +63,22 @@ class test_hs14(wttest.WiredTigerTestCase):
 
         for i in range(1, 10000):
             self.session.begin_transaction()
-            cursor[str(i)] = value1
+            cursor[self.create_key(i)] = value1
             self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
             self.session.begin_transaction()
-            cursor[str(i)] = value2
+            cursor[self.create_key(i)] = value2
             self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
             self.session.begin_transaction()
-            cursor[str(i)] = value3
+            cursor[self.create_key(i)] = value3
             self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
             self.session.begin_transaction()
-            cursor[str(i)] = value4
+            cursor[self.create_key(i)] = value4
             self.session.commit_transaction('commit_timestamp=' + timestamp_str(4))
 
         start = time.time()
         self.session.begin_transaction('read_timestamp=' + timestamp_str(3))
         for i in range(1, 10000):
-            self.assertEqual(cursor[str(i)], value3)
+            self.assertEqual(cursor[self.create_key(i)], value3)
         self.session.rollback_transaction()
         end = time.time()
 
@@ -76,17 +87,17 @@ class test_hs14(wttest.WiredTigerTestCase):
 
         for i in range(1, 10000):
             self.session.begin_transaction()
-            cursor.set_key(str(i))
+            cursor.set_key(self.create_key(i))
             cursor.remove()
             self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
             self.session.begin_transaction()
-            cursor[str(i)] = value5
+            cursor[self.create_key(i)] = value5
             self.session.commit_transaction('commit_timestamp=' + timestamp_str(10))
 
         start = time.time()
         self.session.begin_transaction('read_timestamp=' + timestamp_str(9))
         for i in range(1, 10000):
-            cursor.set_key(str(i))
+            cursor.set_key(self.create_key(i))
             self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
         self.session.rollback_transaction()
         end = time.time()
diff --git a/src/third_party/wiredtiger/test/suite/test_hs20.py b/src/third_party/wiredtiger/test/suite/test_hs20.py
new file mode 100644
index 00000000000..2a9f7d02c93
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_hs20.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import time, wiredtiger, wttest
+
+# test_hs20.py
+# Ensure we never reconstruct a reverse modify update in the history store based on the onpage overflow value
+def timestamp_str(t):
+    return '%x' % t
+
+class test_hs20(wttest.WiredTigerTestCase):
+    conn_config = 'cache_size=50MB,eviction=(threads_max=1)'
+    session_config = 'isolation=snapshot'
+
+    def test_hs20(self):
+        uri = 'table:test_hs20'
+        # Set a very small maximum leaf value to trigger writing overflow values
+        self.session.create(uri, 'key_format=S,value_format=S,leaf_value_max=10B')
+        cursor = self.session.open_cursor(uri)
+        self.conn.set_timestamp(
+            'oldest_timestamp=' + timestamp_str(1) + ',stable_timestamp=' + timestamp_str(1))
+
+        value1 = 'a' * 500
+        value2 = 'b' * 50
+
+        # Insert a value that is larger than the maximum leaf value.
+        for i in range(0, 10):
+            self.session.begin_transaction()
+            cursor[str(i)] = value1
+            self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
+
+        # Do 2 modifies.
+        for i in range(0, 10):
+            self.session.begin_transaction()
+            cursor.set_key(str(i))
+            mods = [wiredtiger.Modify('B', 500, 1)]
+            self.assertEqual(cursor.modify(mods), 0)
+            self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
+
+        for i in range(0, 10):
+            self.session.begin_transaction()
+            cursor.set_key(str(i))
+            mods = [wiredtiger.Modify('C', 501, 1)]
+            self.assertEqual(cursor.modify(mods), 0)
+            self.session.commit_transaction('commit_timestamp=' + timestamp_str(4))
+
+        # Insert more data to trigger eviction.
+        for i in range(10, 100000):
+            self.session.begin_transaction()
+            cursor[str(i)] = value2
+            self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
+
+        # Update the overflow values.
+        for i in range(0, 10):
+            self.session.begin_transaction()
+            cursor[str(i)] = value2
+            self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
+
+        # Do a checkpoint to move the overflow values to the history store but keep the current in memory disk image.
+        self.session.checkpoint()
+
+        # Search the first modifies.
+        for i in range(0, 10):
+            self.session.begin_transaction('read_timestamp=' + timestamp_str(3))
+            self.assertEqual(cursor[str(i)], value1 + "B")
+            self.session.rollback_transaction()
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py
new file mode 100755
index 00000000000..ea88a33a066
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, threading, time
+from helper import copy_wiredtiger_home
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat, wiredtiger_strerror, WiredTigerError, WT_ROLLBACK
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+from wtthread import checkpoint_thread, op_thread
+from time import sleep
+
+def timestamp_str(t):
+    return '%x' % t
+
+def mod_val(value, char, location, nbytes=1):
+    return value[0:location] + char + value[location+nbytes:]
+
+def retry_rollback(self, name, txn_session, code):
+    retry_limit = 100
+    retries = 0
+    completed = False
+    saved_exception = None
+    while not completed and retries < retry_limit:
+        if retries != 0:
+            self.pr("Retrying operation for " + name)
+            if txn_session:
+                txn_session.rollback_transaction()
+            sleep(0.1)
+            if txn_session:
+                txn_session.begin_transaction('isolation=snapshot')
+                self.pr("Began new transaction for " + name)
+        try:
+            code()
+            completed = True
+        except WiredTigerError as e:
+            rollback_str = wiredtiger_strerror(WT_ROLLBACK)
+            if rollback_str not in str(e):
+                raise(e)
+            retries += 1
+            saved_exception = e
+    if not completed and saved_exception:
+        raise(saved_exception)
+
+# test_rollback_to_stable14.py
+# Test the rollback to stable operation uses proper base update while restoring modifies from history store.
+class test_rollback_to_stable14(test_rollback_to_stable_base):
+    session_config = 'isolation=snapshot'
+
+    prepare_values = [
+        ('no_prepare', dict(prepare=False)),
+        ('prepare', dict(prepare=True))
+    ]
+
+    scenarios = make_scenarios(prepare_values)
+
+    def conn_config(self):
+        config = 'cache_size=8MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true),timing_stress_for_test=[history_store_checkpoint_delay]'
+        return config
+
+    def simulate_crash_restart(self, olddir, newdir):
+        ''' Simulate a crash from olddir and restart in newdir. '''
+        # with the connection still open, copy files to new directory
+        shutil.rmtree(newdir, ignore_errors=True)
+        os.mkdir(newdir)
+        for fname in os.listdir(olddir):
+            fullname = os.path.join(olddir, fname)
+            # Skip lock file on Windows since it is locked
+            if os.path.isfile(fullname) and \
+                "WiredTiger.lock" not in fullname and \
+                "Tmplog" not in fullname and \
+                "Preplog" not in fullname:
+                shutil.copy(fullname, newdir)
+        #
+        # close the original connection and open to new directory
+        # NOTE:  This really cannot test the difference between the
+        # write-no-sync (off) version of log_flush and the sync
+        # version since we're not crashing the system itself.
+        #
+        self.close_conn()
+        self.conn = self.setUpConnectionOpen(newdir)
+        self.session = self.setUpSessionOpen(self.conn)
+
+    def test_rollback_to_stable(self):
+        nrows = 1500
+
+        # Create a table without logging.
+        self.pr("create/populate table")
+        uri = "table:rollback_to_stable14"
+        ds = SimpleDataSet(
+            self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)')
+        ds.populate()
+
+        # Pin oldest and stable to timestamp 10.
+        self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) +
+            ',stable_timestamp=' + timestamp_str(10))
+
+        value_a = "aaaaa" * 100
+
+        value_modQ = mod_val(value_a, 'Q', 0)
+        value_modR = mod_val(value_modQ, 'R', 1)
+        value_modS = mod_val(value_modR, 'S', 2)
+        value_modT = mod_val(value_modS, 'T', 3)
+
+        # Perform a combination of modifies and updates.
+        self.pr("large updates and modifies")
+        self.large_updates(uri, value_a, ds, nrows, 20)
+        self.large_modifies(uri, 'Q', ds, 0, 1, nrows, 30)
+        self.large_modifies(uri, 'R', ds, 1, 1, nrows, 40)
+        self.large_modifies(uri, 'S', ds, 2, 1, nrows, 50)
+        self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60)
+
+        # Verify data is visible and correct.
+        self.check(value_a, uri, nrows, 20)
+        self.check(value_modQ, uri, nrows, 30)
+        self.check(value_modR, uri, nrows, 40)
+        self.check(value_modS, uri, nrows, 50)
+        self.check(value_modT, uri, nrows, 60)
+
+        # Pin stable to timestamp 60 if prepare otherwise 50.
+        if self.prepare:
+            self.conn.set_timestamp('stable_timestamp=' + timestamp_str(60))
+        else:
+            self.conn.set_timestamp('stable_timestamp=' + timestamp_str(50))
+
+        # Create a checkpoint thread
+        done = threading.Event()
+        ckpt = checkpoint_thread(self.conn, done)
+        try:
+            self.pr("start checkpoint")
+            ckpt.start()
+
+            # Perform several modifies in parallel with checkpoint.
+            # Rollbacks may occur when checkpoint is running, so retry as needed.
+            self.pr("modifies")
+            retry_rollback(self, 'modify ds1, W', None,
+                           lambda: self.large_modifies(uri, 'W', ds, 4, 1, nrows, 70))
+            retry_rollback(self, 'modify ds1, X', None,
+                           lambda: self.large_modifies(uri, 'X', ds, 5, 1, nrows, 80))
+            retry_rollback(self, 'modify ds1, Y', None,
+                           lambda: self.large_modifies(uri, 'Y', ds, 6, 1, nrows, 90))
+            retry_rollback(self, 'modify ds1, Z', None,
+                           lambda: self.large_modifies(uri, 'Z', ds, 7, 1, nrows, 100))
+        finally:
+            done.set()
+            ckpt.join()
+
+        # Simulate a server crash and restart.
+        self.pr("restart")
+        self.simulate_crash_restart(".", "RESTART")
+        self.pr("restart complete")
+
+        stat_cursor = self.session.open_cursor('statistics:', None, None)
+        calls = stat_cursor[stat.conn.txn_rts][2]
+        hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+        hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2]
+        hs_restore_updates = stat_cursor[stat.conn.txn_rts_hs_restore_updates][2]
+        keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+        keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+        pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+        upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+        stat_cursor.close()
+
+        self.assertEqual(calls, 0)
+        self.assertEqual(keys_removed, 0)
+        self.assertEqual(hs_restore_updates, nrows)
+        self.assertEqual(keys_restored, 0)
+        self.assertEqual(upd_aborted, 0)
+        self.assertGreater(pages_visited, 0)
+        self.assertGreaterEqual(hs_removed, nrows)
+        self.assertGreaterEqual(hs_sweep, 0)
+
+        # Check that the correct data is seen at and after the stable timestamp.
+        self.check(value_a, uri, nrows, 20)
+        self.check(value_modQ, uri, nrows, 30)
+        self.check(value_modR, uri, nrows, 40)
+        self.check(value_modS, uri, nrows, 50)
+
+        # The test may output the following message in eviction under cache pressure. Ignore that.
+        self.ignoreStdoutPatternIfExists("oldest pinned transaction ID rolled back for eviction")
+
+    def test_rollback_to_stable_same_ts(self):
+        nrows = 1500
+
+        # Create a table without logging.
+        self.pr("create/populate table")
+        uri = "table:rollback_to_stable14"
+        ds = SimpleDataSet(
+            self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)')
+        ds.populate()
+
+        # Pin oldest and stable to timestamp 10.
+        self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) +
+            ',stable_timestamp=' + timestamp_str(10))
+
+        value_a = "aaaaa" * 100
+
+        value_modQ = mod_val(value_a, 'Q', 0)
+        value_modR = mod_val(value_modQ, 'R', 1)
+        value_modS = mod_val(value_modR, 'S', 2)
+        value_modT = mod_val(value_modS, 'T', 3)
+
+        # Perform a combination of modifies and updates.
+        self.pr("large updates and modifies")
+        self.large_updates(uri, value_a, ds, nrows, 20)
+        self.large_modifies(uri, 'Q', ds, 0, 1, nrows, 30)
+        # prepare cannot use same timestamp always, so use a different timestamps that are aborted.
+        if self.prepare:
+            self.large_modifies(uri, 'R', ds, 1, 1, nrows, 51)
+            self.large_modifies(uri, 'S', ds, 2, 1, nrows, 55)
+            self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60)
+        else:
+            self.large_modifies(uri, 'R', ds, 1, 1, nrows, 60)
+            self.large_modifies(uri, 'S', ds, 2, 1, nrows, 60)
+            self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60)
+
+        # Verify data is visible and correct.
+        self.check(value_a, uri, nrows, 20)
+        self.check(value_modQ, uri, nrows, 30)
+        self.check(value_modT, uri, nrows, 60)
+
+        self.conn.set_timestamp('stable_timestamp=' + timestamp_str(50))
+
+        # Create a checkpoint thread
+        done = threading.Event()
+        ckpt = checkpoint_thread(self.conn, done)
+        try:
+            self.pr("start checkpoint")
+            ckpt.start()
+
+            # Perform several modifies in parallel with checkpoint.
+            # Rollbacks may occur when checkpoint is running, so retry as needed.
+            self.pr("modifies")
+            retry_rollback(self, 'modify ds1, W', None,
+                           lambda: self.large_modifies(uri, 'W', ds, 4, 1, nrows, 70))
+            retry_rollback(self, 'modify ds1, X', None,
+                           lambda: self.large_modifies(uri, 'X', ds, 5, 1, nrows, 80))
+            retry_rollback(self, 'modify ds1, Y', None,
+                           lambda: self.large_modifies(uri, 'Y', ds, 6, 1, nrows, 90))
+            retry_rollback(self, 'modify ds1, Z', None,
+                           lambda: self.large_modifies(uri, 'Z', ds, 7, 1, nrows, 100))
+        finally:
+            done.set()
+            ckpt.join()
+
+        # Simulate a server crash and restart.
+        self.pr("restart")
+        self.simulate_crash_restart(".", "RESTART")
+        self.pr("restart complete")
+
+        stat_cursor = self.session.open_cursor('statistics:', None, None)
+        calls = stat_cursor[stat.conn.txn_rts][2]
+        hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+        hs_restore_updates = stat_cursor[stat.conn.txn_rts_hs_restore_updates][2]
+        hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2]
+        keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+        keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+        pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+        upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+        stat_cursor.close()
+
+        self.assertEqual(calls, 0)
+        self.assertEqual(keys_removed, 0)
+        self.assertEqual(hs_restore_updates, nrows)
+        self.assertEqual(keys_restored, 0)
+        self.assertEqual(upd_aborted, 0)
+        self.assertGreater(pages_visited, 0)
+        self.assertGreaterEqual(hs_removed, nrows * 3)
+        self.assertGreaterEqual(hs_sweep, 0)
+
+        # Check that the correct data is seen at and after the stable timestamp.
+        self.check(value_a, uri, nrows, 20)
+        self.check(value_modQ, uri, nrows, 30)
+
+        # The test may output the following message in eviction under cache pressure. Ignore that.
+        self.ignoreStdoutPatternIfExists("oldest pinned transaction ID rolled back for eviction")
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered01.py b/src/third_party/wiredtiger/test/suite/test_tiered01.py
index 2a41c3ff7ef..9a7066fd708 100644
--- a/src/third_party/wiredtiger/test/suite/test_tiered01.py
+++ b/src/third_party/wiredtiger/test/suite/test_tiered01.py
@@ -71,5 +71,12 @@ class test_tiered01(wttest.WiredTigerTestCase):
 
        #  self.session.drop(self.uri)
 
+    # It is an error to configure a tiered table with no tiers
+    def test_no_tiers(self):
+        msg = '/tiered table must specify at least one tier/'
+        self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+            lambda: self.session.create(self.uri, 'type=tiered,key_format=S,tiered=(tiers=())'),
+            msg)
+
 if __name__ == '__main__':
     wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered02.py b/src/third_party/wiredtiger/test/suite/test_tiered02.py
new file mode 100644
index 00000000000..17eb3073c39
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_tiered02.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2021 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wtscenario, wttest
+from wtdataset import SimpleDataSet
+
+# test_tiered02.py
+#    Test block-log-structured tree configuration options.
+class test_tiered02(wttest.WiredTigerTestCase):
+    K = 1024
+    M = 1024 * K
+    G = 1024 * M
+    uri = "file:test_tiered02"
+
+    # Occasionally add a lot of records, so that merges (and bloom) happen.
+    record_count_scenarios = wtscenario.quick_scenarios(
+        'nrecs', [10, 10000], [0.9, 0.1])
+
+    scenarios = wtscenario.make_scenarios(record_count_scenarios, prune=100, prunelong=500)
+
+    # Test drop of an object.
+    def test_tiered(self):
+        args = 'key_format=S,block_allocation=log-structured'
+        self.verbose(3,
+            'Test log-structured allocation with config: ' + args + ' count: ' + str(self.nrecs))
+        #ds = SimpleDataSet(self, self.uri, self.nrecs, config=args)
+        ds = SimpleDataSet(self, self.uri, 10, config=args)
+        ds.populate()
+        self.session.checkpoint()
+        ds = SimpleDataSet(self, self.uri, 10000, config=args)
+        ds.populate()
+
+        self.reopen_conn()
+        ds = SimpleDataSet(self, self.uri, 1000, config=args)
+        ds.populate()
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered03.py b/src/third_party/wiredtiger/test/suite/test_tiered03.py
new file mode 100644
index 00000000000..624387c21a3
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_tiered03.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2021 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os, re
+import wiredtiger, wtscenario, wttest
+from wtdataset import SimpleDataSet
+
+# test_tiered03.py
+#    Test block-log-structured tree configuration options.
+class test_tiered03(wttest.WiredTigerTestCase):
+    K = 1024
+    M = 1024 * K
+    G = 1024 * M
+    uri = 'file:test_tiered03'
+
+    # Occasionally add a lot of records, so that merges (and bloom) happen.
+    record_count_scenarios = wtscenario.quick_scenarios(
+        'nrecs', [10, 10000], [0.9, 0.1])
+
+    scenarios = wtscenario.make_scenarios(record_count_scenarios, prune=100, prunelong=500)
+
+    # Test sharing data between a primary and a secondary
+    def test_sharing(self):
+        args = 'block_allocation=log-structured'
+        self.verbose(3,
+            'Test log-structured allocation with config: ' + args + ' count: ' + str(self.nrecs))
+        ds = SimpleDataSet(self, self.uri, 10, config=args)
+        ds.populate()
+        ds.check()
+        self.session.checkpoint()
+        ds.check()
+
+        # Create a secondary database
+        dir2 = os.path.join(self.home, 'SECONDARY')
+        os.mkdir(dir2)
+        conn2 = self.setUpConnectionOpen(dir2)
+        session2 = conn2.open_session()
+
+        # Reference the tree from the secondary:
+        metac = self.session.open_cursor('metadata:')
+        metac2 = session2.open_cursor('metadata:', None, 'readonly=0')
+        uri2 = self.uri[:5] + '../' + self.uri[5:]
+        metac2[uri2] = metac[self.uri] + ",readonly=1"
+
+        cursor2 = session2.open_cursor(uri2)
+        ds.check_cursor(cursor2)
+        cursor2.close()
+
+        newds = SimpleDataSet(self, self.uri, 10000, config=args)
+        newds.populate()
+        newds.check()
+        self.session.checkpoint()
+        newds.check()
+
+        # Check we can still read from the last checkpoint
+        cursor2 = session2.open_cursor(uri2)
+        ds.check_cursor(cursor2)
+        cursor2.close()
+
+        # Bump to new checkpoint
+        origmeta = metac[self.uri]
+        checkpoint = re.search(r',checkpoint=\(.+?\)\)', origmeta).group(0)[1:]
+        self.pr('Orig checkpoint: ' + checkpoint)
+        session2.alter(uri2, checkpoint)
+        self.pr('New metadata on secondaery: ' + metac2[uri2])
+
+        # Check that we can see the new data
+        cursor2 = session2.open_cursor(uri2)
+        newds.check_cursor(cursor2)
+
+if __name__ == '__main__':
+    wttest.run()
author	Luke Chen <luke.chen@mongodb.com>	2021-02-03 13:58:53 +1100
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-02-03 03:20:38 +0000
commit	7aa1b65641938719accd595bda3e45e97dc5f475 (patch)
tree	06442d7ad52cf475dca61aa2ca96ed908f450388
parent	ea687c2a4bcd9937d02658043b5a2529943ef950 (diff)
download	mongo-r4.4.4-rc0.tar.gz