Import wiredtiger: c6ea0d18b5bcd7a6e7d91eece81ada238904c80e from branch mongodb-5.0

ref: b385c98487..c6ea0d18b5 for: 5.0.3 WT-6908 Write "cache" subpage for Architecture Guide WT-6911 Write "block manager" subpage for Architecture Guide WT-7005 Write "session" subpage for Architecture Guide WT-7006 Write Connection subpage for Architecture Guide WT-7905 Fix incorrect builtin behaviour for builds in CMake WT-7909 Create a new method to check for running user transactions before starting rollback-to-stable operation WT-7917 Add evergreen validation to s_all WT-7931 Evicting modifies using the evict cursor in test_multiple_older_readers_with_multiple_mixed_mode() to ensure that eviction happens. WT-7941 Add an Evergreen task to test abort/recovery using test/format WT-7964 Fix rollback to stable incorrectly not rolling back updates at snap_max WT-7965 Update connection base write generation number at the end of recovery checkpoint WT-7970 Set the stable timestamp before starting the checkpointer and clock threads WT-7974 More column-store fixes and tests WT-7984 Fix a bug that could cause a checkpoint to omit a page of data WT-7995 Fix the global visibility that it cannot go beyond checkpoint visibility WT-7998 Minor fixes on Cache subpage of Architecture Guide
author: Luke Chen <luke.chen@mongodb.com> 2021-08-26 13:54:52 +1000
committer: Luke Chen <luke.chen@mongodb.com> 2021-08-26 13:54:52 +1000
commit: ba6c7287e5ad13a65ef35e2468694219160775ce (patch)
tree: 4d8ca6a8837fa82fe21e4eab430eb887becbd38c
parent: e15a429d9c1c2c6d5fe5d186b866ae7d8e7c6e60 (diff)
download: mongo-ba6c7287e5ad13a65ef35e2468694219160775ce.tar.gz
41 files changed, 1448 insertions, 364 deletions
diff --git a/src/third_party/wiredtiger/dist/docs_data.py b/src/third_party/wiredtiger/dist/docs_data.py
index 831cdc61f30..2d0ba1e6068 100644
--- a/src/third_party/wiredtiger/dist/docs_data.py
+++ b/src/third_party/wiredtiger/dist/docs_data.py
@@ -20,13 +20,18 @@ arch_doc_pages = [
         ['src/include/block.h', 'src/include/block_inline.h',
          'src/block/']),
     ArchDocPage('arch-cache',
-        ['WT_CACHE', 'WT_CACHE_POOL'],
-        ['src/include/cache.h', 'src/include/cache_inline.h']),
+        ['WT_CACHE', 'WT_CACHE_POOL', 'WT_COL', 'WT_COL_RLE', 'WT_INSERT', 'WT_PAGE',
+         'WT_PAGE_MODIFY', 'WT_REF', 'WT_ROW', 'WT_UPDATE'],
+        ['src/include/btmem.h', 'src/include/cache.h', 'src/include/cache_inline.h',
+         'src/conn/conn_cache.c', 'src/conn/conn_cache_pool.c']),
     ArchDocPage('arch-checkpoint',
         ['WT_CONNECTION'],
         ['src/block/block_ckpt.c', 'src/block/block_ckpt_scan.c',
          'src/conn/conn_ckpt.c', 'src/meta/meta_ckpt.c',
          'src/txn/txn_ckpt.c']),
+    ArchDocPage('arch-connection',
+        ['WT_CONNECTION'],
+        ['src/include/connection.h']),
     ArchDocPage('arch-cursor',
         ['WT_CURSOR', 'WT_CURSOR_BACKUP', 'WT_CURSOR_BTREE', 'WT_CURSOR_BULK',
          'WT_CURSOR_DATA_SOURCE', 'WT_CURSOR_DUMP', 'WT_CURSOR_INDEX',
@@ -78,6 +83,9 @@ arch_doc_pages = [
         ['src/include/intpack_inline.h', 'src/include/packing_inline.h',
          'src/include/schema.h',
          'src/lsm/', 'src/packing/', 'src/schema/']),
+    ArchDocPage('arch-session',
+        ['WT_SESSION'],
+        ['src/include/session.h']),
     ArchDocPage('arch-snapshot',
         ['WT_TXN'],
         ['src/include/txn.h']),
diff --git a/src/third_party/wiredtiger/dist/s_all b/src/third_party/wiredtiger/dist/s_all
index 50d1909eea0..4c39bc8e321 100755
--- a/src/third_party/wiredtiger/dist/s_all
+++ b/src/third_party/wiredtiger/dist/s_all
@@ -47,6 +47,12 @@ errchk()
 		return
 	fi
 
+	# Return if evergreen validate runs successfully.
+	if echo "$1" | grep -q evergreen && [ "$(cat "$2")" = "../test/evergreen.yml is valid" ] ; then
+		rm -f "$2"
+		return
+	fi
+
 	echo "####################### MESSAGE ############################"
 	echo "s_all run of: \"$1\" resulted in:"
 	sed -e 's/^/    /' $2
@@ -81,6 +87,9 @@ run "./s_clang-format"
 run "python prototypes.py"
 run "sh ./s_typedef -b"
 run "python test_tag.py"
+if command -v evergreen > /dev/null; then
+	run "evergreen validate ../test/evergreen.yml"
+fi
 
 COMMANDS="
 2>&1 ./s_define > ${t_pfx}s_define
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 52e281c34f8..d19319d772a 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
     "vendor": "wiredtiger",
     "github": "wiredtiger/wiredtiger.git",
     "branch": "mongodb-5.0",
-    "commit": "b385c984870fec2d693dece8e79876fd5e8bf867"
+    "commit": "c6ea0d18b5bcd7a6e7d91eece81ada238904c80e"
 }
diff --git a/src/third_party/wiredtiger/src/docs/arch-block.dox b/src/third_party/wiredtiger/src/docs/arch-block.dox
index 7c8fdf5d72b..1738d2fdb9b 100644
--- a/src/third_party/wiredtiger/src/docs/arch-block.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-block.dox
@@ -1,9 +1,248 @@
 /*! @arch_page arch-block Block Manager
 
-The Block Manager manages the reading and writing of disk blocks
-in WiredTiger.  It does compression and encryption when these
-are configured.
+The WiredTiger block manager subsystem manages the reading and writing of data
+from the disk. It is designed to facilitate high performance, economic use of
+disk space and customizability.
+
+@section block What is a block?
+
+A block is a chunk of data that is stored on the disk and operated on as a
+single unit. Each WiredTiger data file (any file in the home directory with the
+\c .wt suffix) is made up of these blocks. Each block consists of a page header,
+a block header and contains a single page of the btree from which it was
+generated. WiredTiger is a no-overwrite storage engine, and when blocks are
+re-written, they are written to new locations in the file. The size of a block
+is a multiple of the allocation size which is set during creation of the
+associated WiredTiger data file (see WT_SESSION::create).
+
+Once a block is written an address cookie is returned. This address cookie is
+stored as the \c addr on the associated page ref. The \c WT_REF structure can
+be found in \c btmem.h. The address cookie is opaque to other parts of the
+system and cannot be interpreted meaningfully.
+
+The address cookie is made up of 4 components:
+ - offset: The offset in the file. In order to avoid storing large offsets this
+   value is divided by the allocation size.
+ - file_id: Optional and only relevant to the tiered storage type, the file_id
+   is maintained in the address.
+ - size: The size of the block, also divided by the allocation size.
+ - checksum: The checksum of the block for validation purposes.
+
+The block header contains the following fields:
+ - size: The size of the block on disk, used when salvaging data from a corrupt
+   file.
+ - checksum: The checksum of the block, again used for salvaging.
+ - flags: Flags set on the block itself.
+ - padding
+
+The page header is not described in this document but can be found in
+\c btmem.h.
+
+@section block_implementation Block manager implementation details
+
+@subsection write_once Writing
+
+The block manager decides where in the file a block will be written. It has two
+forms of writing modes, "first fit" and "best fit". The default behavior is best
+fit. While operating in best fit mode the block manager will search a skip list
+of extents sorted by size, returning either an exact match or the next largest.
+This is done to avoid fragmenting the file when possible. In first fit mode the
+block manager will place the newly created block in the first available extent.
+First fit mode is used for all root pages.
+
+Additionally the block manager is a no-overwrite system. As such once a block is
+written it cannot be modified. This is for crash recovery reasons, because if
+the system were to crash during an overwrite the block state would be unknown.
+This doesn't mean that the associated page cannot be modified; once the
+associated page is modified a subsequent reconciliation will result in a new
+block being created.
+
+@subsection desc_block Descriptor blocks
+
+A file is divided up into blocks. The first block in a file is special as it
+contains metadata about the file and is referred to as the "descriptor block".
+It contains the WiredTiger major and minor version, a checksum of the block
+contents as well as a "magic" number to check against.
+
+The descriptor block serves as a safety check to ensure that the file being
+loaded into the block manager is actually a WiredTiger data file, that it
+belongs to a compatible version of WiredTiger and that the entire file has not
+been corrupted. WiredTiger also uses checksums to defend against file corruption
+which is described in the @ref checksum section.
+
+@subsection block_lists Extent lists
+
+Internally, the block manager uses a data structure called an extent list or a
+\c WT_EXTLIST to track file usage. An extent list consists of a series of
+extents (or \c WT_EXT elements). Each extent uses a file offset and size to
+track a portion of the file.
+
+There are three extent lists that are maintained per checkpoint:
+
+- \c alloc: The file ranges allocated for a given checkpoint.
+- \c avail: The file ranges that are unused and available for allocation.
+- \c discard: The file ranges freed in the current checkpoint.
+
+The alloc and discard extent lists are maintained as a skiplist sorted by file
+offset. The avail extent list also maintains an extra skiplist sorted by the
+extent size to aid with allocating new blocks.
+
+@section configuration Configuration options
+
+There are a number of configuration options that affect the block manager's
+behavior. This does not aim to be an exhaustive list, however, these are the
+configuration options that are more commonly of interest to developers.
+
+All of the configuration options below are passed into the \c WT_SESSION::create
+API at the time of file creation.
+
+@subsection alloc_size Allocation size
+
+The allocation_size configuration controls the file unit allocation_size.
+Any blocks allocated by the block manager must be a multiple of this value.
+
+For example, if we specify an allocation_size of \c 4KB, blocks of size
+\c 8KB and \c 12KB would be permitted but NOT \c 10KB. The allocation_size
+is set to \c 4KB by default which is a good choice unless the OS or storage
+device has special requirements.
+
+@subsection checksum Checksum
+
+The \c checksum "on" configuration can be provided during creation of the file.
+This configuration instructs the block manager to checksum the full length of
+the buffer provided to be written into the block. By default it is enabled.
+When disabled the block manager still does perform a checksum operation but only
+the first 64 bytes of the buffer are included.
+
+The checksum is used when reading blocks to validate their contents, it
+is compared with the checksum extracted from the address cookie and it is
+compared with a checksum generated from the buffer that was held in the block
+being read. In both cases the checksum has to match.
+
+There are other options that can be provided for this configuration option,
+they are not discussed here.
+
+@section block_usage How WiredTiger uses the block manager
+
+@subsection creation File creation and the block manager
+
+When a new file is created in WiredTiger via WT_SESSION::create, the file is
+created on disk and the associated \c allocation_size is written out to the
+metadata file. However the block manager itself only exists on the btree
+structure and is allocated when opening a closed btree.
+
+@subsection read Reading files and pages
+
+When an existing btree is opened for the first time, the location of the root
+block is contained in the metadata file \c WiredTiger.wt. The block manager will
+read the block at the location specified and return the page image as a buffer
+to the layer above. This will then be instantiated as a page in memory.
+
+From there subsequent page addresses can be read from the root page and the
+process repeated as required. If a cursor traverses to a page which hasn't been
+read into memory the same process will take place.
+
+@subsection Writing
+
+Two cases exist for writing out data using the block manager: checkpoint and
+eviction. When a page image is written out, the block manager's \c bm->write
+API is called. See \c bt_io.c for more detail.
+
+@subsubsection Checkpoint
+
+For details on checkpoint at the WiredTiger level see: @ref arch-checkpoint.
+
+At the block manager level, a checkpoint corresponds with a set of blocks that
+are stored in the file associated with the given URI. Typically each file will
+contain a minimum of two checkpoints. Upon opening an existing file the most
+recent checkpoint is read.
+
+During a checkpoint new blocks are only written out for dirty pages. A block can
+be included in multiple checkpoints. Assuming a page \c X is dirty and gets
+checkpointed in checkpoint \c A, it will be created as a new block on disk. Now
+the same page \c X isn't modified and another checkpoint is taken. The page is
+clean and as such will not require a new block to be written for it. The address
+of the original block is still valid.
+
+Checkpoints are created in depth-first order: leaf blocks are created first, then
+the parent blocks. This is a requirement as the parent blocks contain the
+addresses of the leaf blocks.
+
+The block manager doesn't guarantee that calling \c bm->write will result in
+the data being flushed to disk. In the checkpoint scenario WiredTiger will also
+call \c bm->sync once all blocks have been written which will call the file
+system dependent flush function.
+
+* Checkpoint deletion and merging *
+
+As a checkpoint progresses it takes a snapshot of the three extent lists kept
+by the block manager, these extent lists are written out to disk as part of
+the checkpoint in blocks. Between checkpoints these extent lists are being
+updated via normal operation of WiredTiger.
+
+Suppose we have a checkpoint \c A, which has an alloc list which contains 3
+blocks \c I, \c J, \c K as such its extent lists are as follows:
+
+Alloc: \c I, \c J, \c K
+Avail: \c L, \c M
+Discard: Empty
+
+A second checkpoint \c B completes and has removed a page which corresponds with
+block \c J, it also has allocated an additional block \c L.
+
+Checkpoint \c B's extent lists are as follows:
+Alloc: \c L
+Avail: \c M
+Discard: \c J
+
+Finally we complete a 3rd checkpoint \c C which allocates an additional block
+\c M. Upon completion of this checkpoint we are able to remove checkpoint \c A,
+to do that, the block manager will merge checkpoint \c A's extent lists into
+checkpoint \c B's.
+
+What's important here is that if a block appears in both the alloc list and
+the discard list it can be freed which means it goes on the avail list.
+
+Which gives us the following lists for checkpoint \c C:
+Alloc: \c M
+Avail: Empty
+Discard: Empty
+
+And the following lists for checkpoint \c B:
+Alloc: \c I, \c K, \c L
+Avail: \c J
+Discard: Empty
+
+These extent lists are written out with the checkpoint \c C. Anything on the
+avail list is considered free space and can be reused as of the completion
+of checkpoint \c C.
+
+We don't want to list each block individually in the extent lists, so instead of
+listing each block separately in the list, we use extents, which can describe a
+range in the file, that is, any number of contiguous blocks.
+
+@subsubsection Eviction
+
+For more detail on how WiredTiger eviction works see: @ref arch-eviction.
+
+Eviction also utilizes the block manager. When a page is evicted and contains
+data that needs to be maintained, logically a block needs to be written.
+Eviction calls \c bm->write however it does not instruct the block manager to
+sync the data.
+
+@subsection Compaction
+
+As new blocks are written, the block manager will place them where they fit best.
+Because of this it's common that removal of data will not result in the file
+shrinking. The file can only be shrunk when there are available blocks at the
+end of the file.
+
+To manage this, WiredTiger provides a compaction API call WT_SESSION::compact.
+The block manager operates in first fit mode during compaction to maximize block
+movement towards the beginning of the file. WiredTiger walks the btree and asks
+the block manager if relocating that page will reduce the file size. If so, the
+page is marked dirty, forcing the block to be rewritten. WiredTiger then
+performs two checkpoints, as at least two checkpoints are required to delete the
+checkpoint originally containing the block.
 
-The state of the block manager is represented by the \c WT_BM structure.
-Individual blocks being tracked are in \c WT_BLOCK structures.
 */
diff --git a/src/third_party/wiredtiger/src/docs/arch-cache.dox b/src/third_party/wiredtiger/src/docs/arch-cache.dox
index 94888260cfe..716c4966d45 100644
--- a/src/third_party/wiredtiger/src/docs/arch-cache.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-cache.dox
@@ -1,13 +1,118 @@
 /*! @arch_page arch-cache Cache
 
-Cache in WiredTiger is represented by the various shared data structures
-that make up in-memory Btrees and subordinate data structures.
+The WiredTiger cache is memory used to hold copies of recently accessed or modified data.
+WiredTiger reads Btree pages into the cache on demand.  When the cache runs low on space, Eviction
+removes unneeded pages.  Updates modify data in the cache and
+are flushed to storage asynchronously, either by @ref arch-checkpoint "Checkpoint" or
+@ref arch-eviction "Eviction".
 
-Memory used to read in and write out the on-disk representations of Btrees
-is not cached, it only exists temporarily during the I/O operation and
-while the data is transferred to or from the on-disk format.
+The page layout in the WiredTiger cache is optimized for fast, concurrent access by multiple
+application threads.  In contrast, WiredTiger organizes pages in storage to minimize storage space.
+As a result, WiredTiger has to convert between the in-memory and on-storage representations of a
+page whenever it reads or writes the page.
 
-Internally, the current cache state is represented by the WT_CACHE structure,
-which contains various counters that drive statistics and information
-used for eviction.
+@section arch_cache_basics Basic operation
+
+Cached Btree pages point to each other, mirroring the structure of the on-disk Btree.
+When WiredTiger opens a file, it loads the root page of the Btree into memory along with the first
+level of internal pages.  To look up an entry in a Btree, WiredTiger starts from the root page
+and searches the Btree until it finds the entry.  If WiredTiger encounters a page that is not in
+memory, it loads that page from storage and continues the search.
+
+To load a page into the cache, WiredTiger passes the page's address cookie to the
+@ref arch-block "Block Manager"
+and gets back a buffer containing the corresponding block from the underlying file.
+If necessary, WiredTiger decrypts and decompresses the block.  Then it allocates indexing
+structures to facilitate quick binary search of the keys in the page.  The first time WiredTiger
+needs to modify or insert an entry on a page, it allocates additional structures to track these
+changes.
+
+WiredTiger tracks the total amount of data in the cache.  It also tracks the space used by
+_clean_ (unmodified) pages and by _dirty_ (modified) pages.  When the cache becomes too
+full or contains too much dirty data, WiredTiger invokes @ref arch-eviction "Eviction" to
+remove data from the
+cache.  To remove a clean page from the cache, WiredTiger simply frees the page's memory.
+To remove a dirty page, WiredTiger must first _reconcile_ the page (converting it from
+in-memory format to on-disk format) and then write it to storage.
+
+@section arch_cache_structure Cache structure
+
+Internally, WiredTiger's cache state is represented by the \c WT_CACHE structure, which contains
+counters and parameter settings for tracking cache usage and controlling eviction policy.
+The \c WT_CACHE also includes state WiredTiger uses to track the progress of eviction.  There
+is a single \c WT_CACHE for each connection, accessed via the \c WT_CONNECTION_IMPL structure.
+
+Each page in the cache is accessed via a \c WT_REF structure.  When WiredTiger opens a Btree,
+it places a \c WT_REF for the cached root page in the corresponding \c WT_BTREE structure.
+A \c WT_REF can represent either
+a page in the cache or one that has not been loaded yet.
+The page itself is represented by a \c WT_PAGE structure.  This includes a pointer to a buffer
+that contains the on-disk page image (decrypted and uncompressed).  It also holds the supplemental
+structures that WiredTiger uses to access and update the page while it is cached.
+
+When WiredTiger loads a page into the cache, it allocates an internal table with one entry
+for each entry on the page.  The type and content of these entries depends on the page type. An
+internal Btree page will have an array of \c WT_REF structures.  A row-store leaf page will have
+an array of \c WT_ROW structures representing the KV pairs stored on the page.  A variable-length
+column-store leaf page will have an array of \c WT_COL structures along with a parallel array
+of \c WT_COL_RLE structures indicating run lengths for items that are repeated more than once
+on the page.  Both of these leaf page formats support binary search to quickly find an entry.
+In a fixed-length column-store leaf page, values will be packed into a simple byte array, allowing
+WiredTiger to access entries using bit operations based on the value length.
+
+The first time an entry on a leaf page is inserted or modified, WiredTiger adds a
+\c WT_PAGE_MODIFY structure to the corresponding \c WT_PAGE in the cache. For a row-store leaf
+page the \c WT_PAGE_MODIFY tracks changes using an array of \c WT_UPDATE pointers with one element
+for each
+KV pair on the leaf page.  When WiredTiger updates an entry, it inserts a \c WT_UPDATE in
+this array.  If there are multiple updates to the same item, WiredTiger chains them together
+in a linked list.  When a record is deleted,
+WiredTiger adds an update with a special tombstone value.  WiredTiger stores newly inserted
+elements in a similar array of skip lists represented by \c WT_INSERT structures.  There is a
+separate skiplist for the gap between each pair of keys on the page, as well as skiplists for
+the gaps between the beginning and end of the page and the first and last keys, respectively.
+
+For a column-store leaf page the \c WT_PAGE_MODIFY structure tracks changes using a pair of
+skip lists, one for appended items and one for updated items.
+
+Almost all operations on these data structures are lock-free, allowing a high level of
+concurrency in the cache.
+
+@section arch_cache_size Cache size and content
+
+The amount of memory used by the WiredTiger cache is controlled by the \c cache_size configuration
+parameter, which defaults to 100 MB.  (Note that MongoDB sets the cache size, by default, to be
+half the size of RAM.)  WiredTiger does not explicitly manage this memory, relying instead on
+the C memory allocator to acquire and free memory as needed.  Since the cache is
+allocated from the heap, evicting data from the cache simply returns the memory to the allocator;
+it does not reduce the application's memory footprint.
+
+The WiredTiger cache is only used for Btree data, including associated in-memory structures such
+as indexes, insert lists, and update chains.  Other WiredTiger data structures, such as
+dhandles, cursors, and sessions, are not considered part of the cache and do not count against
+the cache size.  Similarly, memory used to read in and write out the on-disk representations of
+Btree pages is not cached; it is only allocated temporarily during the I/O operation and
+while the data is converted to or from the on-disk format.
+
+@section arch_cache_shared Shared caches
+
+WiredTiger supports sharing a single cache among multiple databases within a process.  Normally
+if a process opens connections to multiple different databases, each connection would use a
+separate fixed-size cache.  With a shared cache, WiredTiger dynamically partitions a fixed
+amount of cache space between participating connections.
+
+When shared caching is enabled, WiredTiger creates a cache pool server thread to manage the
+shared cache.  It also allocates a global \c WT_CACHE_POOL structure, which stores settings
+and statistics for the shared cache.  These settings include a minimum and
+maximum cache size for connections participating in the shared cache.
+
+The cache pool server thread wakes up periodically and adjusts the sizes of the individual
+per-connection caches.  Adjustments are based on a pressure metric for each cache computed
+using a weighted average of the amount of data read into the cache (i.e., cache misses)
+and how often application threads have evicted data from the cache or waited while
+performing eviction.  If a cache has higher pressure than average and is not yet at the maximum
+size, WiredTiger grows that cache.  Conversely, if a cache has low pressure, WiredTiger shrinks
+it, subject to the minimum cache size.  To change the size of a cache, the cache pool server
+simply changes the cache size parameters in the corresponding \c WT_CACHE structure.  WiredTiger's
+eviction code will adjust the amount of data in the cache accordingly.
 */
diff --git a/src/third_party/wiredtiger/src/docs/arch-connection.dox b/src/third_party/wiredtiger/src/docs/arch-connection.dox
new file mode 100644
index 00000000000..cdc3eeecb9a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/arch-connection.dox
@@ -0,0 +1,60 @@
+/*! @arch_page arch-connection Connection
+
+@section arch_conn_def Definition
+
+A connection is a handle to a WiredTiger database instance. The connection has exclusive access to
+the database through a file lock, hence only one connection can be opened at a time. Internally, a
+connection is represented by \c WT_CONNECTION.
+
+@section arch_conn_lifecycle Life cycle
+
+@subsection arch_conn_init Initialization
+
+A connection is initialized when WT_CONNECTION::wiredtiger_open is called by the user application.
+WT_CONNECTION::wiredtiger_open accepts a list of configuration items (see @ref database_config) that
+can be used to enable different WiredTiger features and tune their behavior. Those features are for
+example related to @ref arch-eviction, @ref arch-logging, @ref arch-checkpoint, @ref arch-cache,
+statistics, etc. All the different available configuration settings are described in the
+documentation for WT_CONNECTION::wiredtiger_open.
+
+WT_CONNECTION::wiredtiger_open also performs different sanity checks depending on the configuration
+item "create". When "create" is specified and a database does not already exist, a new database is
+created along with specific WiredTiger files such as the turtle file and other metadata files. If a
+database already exists, whether "create" is specified or not, WiredTiger will try to open it and
+check for the existence of the different required WiredTiger files. If "create" is not specified,
+WiredTiger expects a previously created database where it is executed. If the existing database is
+corrupted and cannot be opened, either \c WT_RUN_RECOVERY or \c WT_TRY_SALVAGE error (see @ref
+error_handling) is returned to the user application and the connection is not created. In this case,
+a recovery operation will be required to bring the database to a consistent state (see @ref
+command_line for more details) before a connection can be successfully established with the
+database.
+
+Once the database has been successfully opened, internal worker threads are started to provide
+global services used at runtime. Those services consist of different threads to handle statistics,
+logging, eviction, checkpoint and cache management. The sweeping server that manages the active and
+inactive dhandles is started too, see @ref arch-dhandle for more information.
+
+Finally, before the connection is completely initialized, the database is set to a consistent state
+by running rollback to stable, see @ref arch-rts for more details.
+
+@subsection arch_conn_runtime Runtime
+
+At runtime, database-wide operations can be executed using the connection interface. For instance,
+it is possible to reconfigure WiredTiger features and behavior using WT_CONNECTION::reconfigure
+instead of closing the connection and calling WT_CONNECTION::open again. However, almost all CRUD
+operations on the database are executed in the context of a session (see @ref arch-session) which
+can be created using WT_CONNECTION::open_session. See the WT_CONNECTION:: documentation to discover
+other available APIs related to WiredTiger connections.
+
+A connection also keeps track of global information, see \c WT_CONNECTION_IMPL defined in \c
+connection.h. Finally, a \c WT_CONNECTION handle may be shared between threads, see @ref threads for
+more information.
+
+@subsection arch_conn_closure Closure
+
+When a connection is no longer required, it can be closed using WT_CONNECTION::close. As a result,
+any resource held by the connection (e.g. sessions) is freed unless configured differently and the
+database is restored to a consistent state if necessary. It is worth noting that this final step
+might take some time as it may involve running the rollback to stable operation.
+
+*/
diff --git a/src/third_party/wiredtiger/src/docs/arch-index.dox b/src/third_party/wiredtiger/src/docs/arch-index.dox
index 3d916e675aa..906e6959be8 100644
--- a/src/third_party/wiredtiger/src/docs/arch-index.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-index.dox
@@ -139,6 +139,10 @@ make up in-memory Btrees and subordinate data structures.
 
 A checkpoint is created by WiredTiger to serve as a point from which it can recover.
 
+@subpage arch-connection
+
+A connection is a handle to a WiredTiger database instance.
+
 @subpage arch-cursor
 
 Cursors are used to get and modify data.
@@ -196,6 +200,10 @@ Rollback to stable to remove the unstable updates from the database.
 
 A schema defines the format of the application data in WiredTiger.
 
+@subpage arch-session
+
+A session defines the context for most operations performed in WiredTiger.
+
 @subpage arch-snapshot
 
 Snapshots are implemented by storing transaction ids committed before
diff --git a/src/third_party/wiredtiger/src/docs/arch-session.dox b/src/third_party/wiredtiger/src/docs/arch-session.dox
new file mode 100644
index 00000000000..e17eca10b1b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/arch-session.dox
@@ -0,0 +1,53 @@
+/*! @arch_page arch-session Session
+
+@section arch_session_def Definition
+After a @ref arch-connection has been established between the application and WiredTiger, the
+application can start sending requests to WiredTiger using a session. A session is internally
+represented by WT_SESSION and plays an important role since almost all operations are performed
+under the context of a session.
+
+A session can only be created through an existing connection with the API
+WT_CONNECTION::open_session and it is possible to create multiple sessions through the same
+connection. In fact, one connection can have multiple sessions but one session can only be
+associated with one connection. The maximum number of sessions is set through the configuration item
+\c session_max as part of the configuration string in ::wiredtiger_open.
+
+Sessions created by the calling application are called "user sessions". WiredTiger also performs
+some internal operations such as @ref arch-eviction through self-created sessions. These sessions
+are called "internal sessions". The usage rules and guidelines for both internal sessions and user
+sessions are the same and the only difference between them is their origin of creation.
+
+@section arch_session_ops Operations
+The different operations that can be performed on a WiredTiger session are related to cursors,
+tables and transactions. You can read the complete description of each possible operation in the
+documentation related to WT_SESSION.
+
+@section arch_session_txn Transactions
+It is possible to group several operations within a session, in other words, multiple operations can
+be treated as a single atomic operation. This can be done using @ref arch-transaction. Furthermore,
+a session can hold only one running transaction at any given time and this transaction only belongs
+to that session.
+
+@section arch_session_cur Cursors
+A session can perform multiple data operations on one or several collections using multiple cursors
+(see @ref arch-cursor for more details). All the cursors associated with a session share that
+session's transaction context. It is also possible to cache those cursors if required through the
+configuration string given to WT_CONNECTION::open_session or ::wiredtiger_open. The configuration
+item for this purpose is \c cache_cursors.
+
+@section arch_session_dhandles Data Handles
+During its lifetime, a session can accumulate a list of data handles (see @ref arch-dhandle).
+Indeed, when a session accesses a table for the first time, the data handle of that table is
+acquired and cached. Once a session no longer needs to operate on a table, it marks the associated
+data handle as idle. This helps the sweep server release data handles that are inactive, see @ref
+arch-dhandle-lifecycle for more details.
+
+@section arch_session_closure Closure
+A session can be closed using WT_SESSION::close. Closing the connection will also close all opened
+sessions. When a session is closed, it releases all the resources associated with it including
+rolling back any active transaction and closing the cursors that are still open.
+
+@section arch_session_thread Multithreading
+A session must only be used by a single thread at a time, see @ref threads for more details.
+
+*/
diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok
index 1d302409f2e..182a9af00f3 100644
--- a/src/third_party/wiredtiger/src/docs/spell.ok
+++ b/src/third_party/wiredtiger/src/docs/spell.ok
@@ -48,6 +48,7 @@ ECMA
 EINVAL
 ENCRYPTOR
 ENOTSUP
+EXTLIST
 EmpId
 Encryptors
 Facebook
@@ -180,7 +181,9 @@ bokeh
 bool
 boolean
 booleans
+bm
 br
+bt
 btmem
 btree
 btrees
@@ -236,6 +239,7 @@ curhs
 cursortype
 curtable
 customerABC
+customizability
 cv
 cyclomatic
 dN
@@ -257,6 +261,7 @@ decrement
 decrementing
 decrypt
 decrypted
+decrypts
 del
 desc
 destructor
@@ -574,6 +579,7 @@ sess
 sid
 skinparam
 skiplist
+skiplists
 sortable
 spinlock
 spinlocks
@@ -596,6 +602,7 @@ subdatabases
 subdirectory
 subpage
 substring
+subsubsection
 sudo
 superset
 svg
diff --git a/src/third_party/wiredtiger/src/include/cell_inline.h b/src/third_party/wiredtiger/src/include/cell_inline.h
index 2d91b9a8ee6..97b856e8a62 100644
--- a/src/third_party/wiredtiger/src/include/cell_inline.h
+++ b/src/third_party/wiredtiger/src/include/cell_inline.h
@@ -1045,24 +1045,6 @@ __cell_unpack_window_cleanup(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk
 }
 
 /*
- * __cell_pack_kv_window_cleanup --
- *     Clean up cells loaded from a previous run while writing to disk.
- */
-static inline void
-__cell_pack_kv_window_cleanup(
-  WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK_KV *unpack_kv)
-{
-    /*
-     * If the page came from a previous run, reset the transaction ids to "none" and timestamps to 0
-     * as appropriate when the cell information is used for packing the new cell.
-     */
-    if (F_ISSET(S2C(session), WT_CONN_RECOVERING) &&
-      dsk->write_gen > S2BT(session)->base_write_gen &&
-      dsk->write_gen < S2BT(session)->run_write_gen)
-        __cell_kv_window_cleanup(session, unpack_kv);
-}
-
-/*
  * __wt_cell_unpack_addr --
  *     Unpack an address WT_CELL into a structure.
  */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 1892edd09a3..8061ce88008 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -39,6 +39,8 @@ extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern bool __wt_txn_active(WT_SESSION_IMPL *session, uint64_t txnid)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern bool __wt_txn_user_active(WT_SESSION_IMPL *session)
+  WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern char *__wt_time_aggregate_to_string(WT_TIME_AGGREGATE *ta, char *ta_string)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern char *__wt_time_point_to_string(wt_timestamp_t ts, wt_timestamp_t durable_ts,
@@ -1088,6 +1090,8 @@ extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern int __wt_metadata_btree_id_to_uri(WT_SESSION_IMPL *session, uint32_t btree_id, char **uri)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_metadata_correct_base_write_gen(WT_SESSION_IMPL *session)
+  WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern int __wt_metadata_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern int __wt_metadata_cursor_close(WT_SESSION_IMPL *session)
@@ -1966,6 +1970,8 @@ static inline bool __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id, wt_ti
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 static inline bool __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id,
   wt_timestamp_t timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline bool __wt_txn_visible_id_snapshot(uint64_t id, uint64_t snap_min, uint64_t snap_max,
+  uint64_t *snapshot, uint32_t snapshot_count) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 static inline double __wt_eviction_dirty_target(WT_CACHE *cache)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 static inline int __wt_btree_block_free(WT_SESSION_IMPL *session, const uint8_t *addr,
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 3258cafb29f..6b84061ff82 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -244,8 +244,8 @@ struct __wt_txn {
 
     /*
      * Snapshot data:
+     *	ids >= snap_max are invisible,
      *	ids < snap_min are visible,
-     *	ids > snap_max are invisible,
      *	everything else is visible unless it is in the snapshot.
      */
     uint64_t snap_min, snap_max;
diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h
index 6d3b4e8fa55..b57f0d8203f 100644
--- a/src/third_party/wiredtiger/src/include/txn_inline.h
+++ b/src/third_party/wiredtiger/src/include/txn_inline.h
@@ -449,13 +449,10 @@ err:
 static inline uint64_t
 __wt_txn_oldest_id(WT_SESSION_IMPL *session)
 {
-    WT_BTREE *btree;
     WT_TXN_GLOBAL *txn_global;
     uint64_t checkpoint_pinned, oldest_id;
-    bool include_checkpoint_txn;
 
     txn_global = &S2C(session)->txn_global;
-    btree = S2BT_SAFE(session);
 
     /*
      * The metadata is tracked specially because of optimizations for checkpoints.
@@ -467,10 +464,6 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
      * Take a local copy of these IDs in case they are updated while we are checking visibility.
      */
     oldest_id = txn_global->oldest_id;
-    include_checkpoint_txn =
-      btree == NULL || (btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
-    if (!include_checkpoint_txn)
-        return (oldest_id);
 
     /*
      * The read of the transaction ID pinned by a checkpoint needs to be carefully ordered: if a
@@ -501,14 +494,11 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
 static inline void
 __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
 {
-    WT_BTREE *btree;
     WT_TXN_GLOBAL *txn_global;
     wt_timestamp_t checkpoint_ts, pinned_ts;
-    bool include_checkpoint_txn;
 
     *pinned_tsp = WT_TS_NONE;
 
-    btree = S2BT_SAFE(session);
     txn_global = &S2C(session)->txn_global;
 
     /*
@@ -520,19 +510,6 @@ __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
     *pinned_tsp = pinned_ts = txn_global->pinned_timestamp;
 
     /*
-     * Checkpoint transactions often fall behind ordinary application threads. Take special effort
-     * to not keep changes pinned in cache if they are only required for the checkpoint and it has
-     * already seen them.
-     *
-     * If there is no active checkpoint or this handle is up to date with the active checkpoint then
-     * it's safe to ignore the checkpoint ID in the visibility check.
-     */
-    include_checkpoint_txn =
-      btree == NULL || (btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
-    if (!include_checkpoint_txn)
-        return;
-
-    /*
      * The read of checkpoint timestamp needs to be carefully ordered: it needs to be after we have
      * read the pinned timestamp and the checkpoint generation, otherwise, we may read earlier
      * checkpoint timestamp before the checkpoint generation that is read resulting more data being
@@ -680,6 +657,37 @@ __wt_txn_tw_stop_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
 }
 
 /*
+ * __wt_txn_visible_id_snapshot --
+ *     Return whether the given transaction id is visible with respect to the given snapshot.
+ */
+static inline bool
+__wt_txn_visible_id_snapshot(
+  uint64_t id, uint64_t snap_min, uint64_t snap_max, uint64_t *snapshot, uint32_t snapshot_count)
+{
+    bool found;
+
+    /*
+     * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a
+     * concurrent transaction, that is, if it was committed before the snapshot was taken.
+     *
+     * The order here is important: anything newer than or equal to the maximum ID we saw when
+     * taking the snapshot should be invisible, even if the snapshot is empty.
+     *
+     * Snapshot data:
+     *	ids >= snap_max are invisible,
+     *	ids < snap_min are visible,
+     *	everything else is visible unless it is found in the snapshot.
+     */
+    if (WT_TXNID_LE(snap_max, id))
+        return (false);
+    if (snapshot_count == 0 || WT_TXNID_LT(id, snap_min))
+        return (true);
+
+    WT_BINARY_SEARCH(id, snapshot, snapshot_count, found);
+    return (!found);
+}
+
+/*
  * __txn_visible_id --
  *     Can the current transaction see the given ID?
  */
@@ -687,7 +695,6 @@ static inline bool
 __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
 {
     WT_TXN *txn;
-    bool found;
 
     txn = session->txn;
 
@@ -710,20 +717,8 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
     /* Otherwise, we should be called with a snapshot. */
     WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) || session->dhandle->checkpoint != NULL);
 
-    /*
-     * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a
-     * concurrent transaction, that is, if was committed before the snapshot was taken.
-     *
-     * The order here is important: anything newer than the maximum ID we saw when taking the
-     * snapshot should be invisible, even if the snapshot is empty.
-     */
-    if (WT_TXNID_LE(txn->snap_max, id))
-        return (false);
-    if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min))
-        return (true);
-
-    WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found);
-    return (!found);
+    return (__wt_txn_visible_id_snapshot(
+      id, txn->snap_min, txn->snap_max, txn->snapshot, txn->snapshot_count));
 }
 
 /*
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index b4c7c933c62..d58d0149658 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -974,6 +974,40 @@ err:
 }
 
 /*
+ * __wt_metadata_correct_base_write_gen --
+ *     Update the connection's base write generation from all files in metadata at the end of the
+ *     recovery checkpoint.
+ */
+int
+__wt_metadata_correct_base_write_gen(WT_SESSION_IMPL *session)
+{
+    WT_CURSOR *cursor;
+    WT_DECL_RET;
+    char *config, *uri;
+
+    uri = NULL;
+    WT_RET(__wt_metadata_cursor(session, &cursor));
+    while ((ret = cursor->next(cursor)) == 0) {
+        WT_ERR(cursor->get_key(cursor, &uri));
+
+        if (!WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "tiered:"))
+            continue;
+
+        WT_ERR(cursor->get_value(cursor, &config));
+
+        /* Update base write gen to the write gen. */
+        WT_ERR(__wt_metadata_update_base_write_gen(session, config));
+    }
+    WT_ERR_NOTFOUND_OK(ret, false);
+
+err:
+    if (ret != 0 && uri != NULL)
+        __wt_err(session, ret, "unable to correct write gen for %s", uri);
+    WT_TRET(__wt_metadata_cursor_release(session, &cursor));
+    return (ret);
+}
+
+/*
  * __wt_meta_ckptlist_to_meta --
  *     Convert a checkpoint list into its metadata representation.
  */
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index 4ea57b8acc7..8000026c58b 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -744,7 +744,6 @@ record_loop:
                     twp = &clear_tw;
                     goto compare;
                 }
-                __cell_pack_kv_window_cleanup(session, page->dsk, vpack);
                 twp = &vpack->tw;
 
                 /*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
index a72bc170245..99d887da573 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_row.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -808,10 +808,9 @@ __wt_rec_row_leaf(
         upd = upd_select.upd;
 
         /* Take the timestamp from the update or the cell. */
-        if (upd == NULL) {
-            __cell_pack_kv_window_cleanup(session, page->dsk, vpack);
+        if (upd == NULL)
             twp = &vpack->tw;
-        } else
+        else
             twp = &upd_select.tw;
 
         /*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 9ad305792b9..cf9d1be3175 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -159,6 +159,9 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u
     WT_DECL_RET;
     WT_PAGE *page;
     WT_RECONCILE *r;
+#ifdef HAVE_DIAGNOSTIC
+    void *addr;
+#endif
 
     btree = S2BT(session);
     page = ref->page;
@@ -215,11 +218,17 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u
       F_ISSET(r, WT_REC_CALL_URGENT) && !r->update_used && r->cache_write_restore)
         ret = __wt_set_return(session, EBUSY);
 
+#ifdef HAVE_DIAGNOSTIC
+    addr = ref->addr;
+#endif
     /* Wrap up the page reconciliation. */
     if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0)
         __rec_write_page_status(session, r);
-    else
+    else {
+        /* Make sure that reconciliation doesn't free the page that has been written to disk. */
+        WT_ASSERT(session, addr == NULL || ref->addr != NULL);
         WT_TRET(__rec_write_wrapup_err(session, r, page));
+    }
 
     /* Release the reconciliation lock. */
     *page_lockedp = false;
@@ -1516,7 +1525,7 @@ err:
  *     Initialize the page write generation number.
  */
 static void
-__rec_set_page_write_gen(WT_PAGE_HEADER *dsk, WT_BTREE *btree)
+__rec_set_page_write_gen(WT_BTREE *btree, WT_PAGE_HEADER *dsk)
 {
     /*
      * We increment the block's write generation so it's easy to identify newer versions of blocks
@@ -1553,7 +1562,7 @@ __rec_split_write_header(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK
 
     dsk->recno = btree->type == BTREE_ROW ? WT_RECNO_OOB : multi->key.recno;
 
-    __rec_set_page_write_gen(dsk, btree);
+    __rec_set_page_write_gen(btree, dsk);
     dsk->mem_size = multi->size;
     dsk->u.entries = chunk->entries;
     dsk->type = page->type;
@@ -2088,6 +2097,22 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
     WT_TIME_AGGREGATE_INIT(&ta);
 
     /*
+     * If using the history store table eviction path and we found updates that weren't globally
+     * visible when reconciling this page, copy them into the database's history store. This can
+     * fail, so try before clearing the page's previous reconciliation state.
+     */
+    if (F_ISSET(r, WT_REC_HS))
+        WT_RET(__rec_hs_wrapup(session, r));
+
+    /*
+     * Wrap up overflow tracking. If we are about to create a checkpoint, the system must be
+     * entirely consistent at that point (the underlying block manager is presumably going to do
+     * some action to resolve the list of allocated/free/whatever blocks that are associated with
+     * the checkpoint).
+     */
+    WT_RET(__wt_ovfl_track_wrapup(session, page));
+
+    /*
      * This page may have previously been reconciled, and that information is now about to be
      * replaced. Make sure it's discarded at some point, and clear the underlying modification
      * information, we're creating a new reality.
@@ -2137,21 +2162,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
     /* Reset the reconciliation state. */
     mod->rec_result = 0;
 
-    /*
-     * If using the history store table eviction path and we found updates that weren't globally
-     * visible when reconciling this page, copy them into the database's history store.
-     */
-    if (F_ISSET(r, WT_REC_HS))
-        WT_RET(__rec_hs_wrapup(session, r));
-
-    /*
-     * Wrap up overflow tracking. If we are about to create a checkpoint, the system must be
-     * entirely consistent at that point (the underlying block manager is presumably going to do
-     * some action to resolve the list of allocated/free/whatever blocks that are associated with
-     * the checkpoint).
-     */
-    WT_RET(__wt_ovfl_track_wrapup(session, page));
-
     __wt_verbose(session, WT_VERB_RECONCILE, "%p reconciled into %" PRIu32 " pages", (void *)ref,
       r->multi_next);
 
@@ -2367,7 +2377,7 @@ __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *k
         dsk = tmp->mem;
         memset(dsk, 0, WT_PAGE_HEADER_SIZE);
         dsk->type = WT_PAGE_OVFL;
-        __rec_set_page_write_gen(dsk, btree);
+        __rec_set_page_write_gen(btree, dsk);
         dsk->u.datalen = (uint32_t)kv->buf.size;
         memcpy(WT_PAGE_HEADER_BYTE(btree, dsk), kv->buf.data, kv->buf.size);
         dsk->mem_size = WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size;
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 35eb2e08d6f..e3ab3fecc16 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -139,6 +139,47 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
 }
 
 /*
+ * __wt_txn_user_active --
+ *     Check whether there are any running user transactions. Note that a new transaction may start
+ *     on a session we have already examined and the caller needs to be aware of this limitation.
+ *     Prepared user transactions are excluded from this check.
+ */
+bool
+__wt_txn_user_active(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_SESSION_IMPL *session_in_list;
+    uint32_t i, session_cnt;
+    bool txn_active;
+
+    conn = S2C(session);
+    txn_active = false;
+
+    /*
+     * No lock is required because the session array is fixed size, but it may contain inactive
+     * entries. We must review any active session, so insert a read barrier after reading the active
+     * session count. That way, no matter what sessions come or go, we'll check the slots of all the
+     * user sessions that could have had an active transaction when we started our check.
+     */
+    WT_ORDERED_READ(session_cnt, conn->session_cnt);
+    for (i = 0, session_in_list = conn->sessions; i < session_cnt; i++, session_in_list++) {
+        /* Skip inactive sessions. */
+        if (!session_in_list->active)
+            continue;
+        /* Check if a user session has a running transaction. Ignore prepared transactions. */
+        if (F_ISSET(session_in_list->txn, WT_TXN_RUNNING) &&
+          !F_ISSET(session_in_list, WT_SESSION_INTERNAL) &&
+          !F_ISSET(session_in_list->txn, WT_TXN_PREPARE)) {
+
+            txn_active = true;
+            break;
+        }
+    }
+
+    return (txn_active);
+}
+
+/*
  * __wt_txn_active --
  *     Check if a transaction is still active. If not, it is either committed, prepared, or rolled
  *     back. It is possible that we race with commit, prepare or rollback and a transaction is still
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 1a7090cd2c6..c32cd43b0bc 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -922,6 +922,14 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
     }
 
     /*
+     * As part of recovery, rollback to stable may have left out clearing stale transaction ids.
+     * Update the connection base write generation based on the latest checkpoint write generations
+     * to reset these transaction ids present on the pages when reading them.
+     */
+    if (F_ISSET(conn, WT_CONN_RECOVERING))
+        WT_ERR(__wt_metadata_correct_base_write_gen(session));
+
+    /*
      * Clear the dhandle so the visibility check doesn't get confused about the snap min. Don't
      * bother restoring the handle since it doesn't make sense to carry a handle across a
      * checkpoint.
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 9f32e9346f0..c76e2af3597 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -548,39 +548,6 @@ err:
 }
 
 /*
- * __recovery_correct_write_gen --
- *     Update the connection's base write generation from all files in metadata.
- */
-static int
-__recovery_correct_write_gen(WT_SESSION_IMPL *session)
-{
-    WT_CURSOR *cursor;
-    WT_DECL_RET;
-    char *config, *uri;
-
-    uri = NULL;
-    WT_RET(__wt_metadata_cursor(session, &cursor));
-    while ((ret = cursor->next(cursor)) == 0) {
-        WT_ERR(cursor->get_key(cursor, &uri));
-
-        if (!WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "tiered:"))
-            continue;
-
-        WT_ERR(cursor->get_value(cursor, &config));
-
-        /* Update base write gen to the write gen. */
-        WT_ERR(__wt_metadata_update_base_write_gen(session, config));
-    }
-    WT_ERR_NOTFOUND_OK(ret, false);
-
-err:
-    if (ret != 0 && uri != NULL)
-        __wt_err(session, ret, "unable to correct write gen for %s", uri);
-    WT_TRET(__wt_metadata_cursor_release(session, &cursor));
-    return (ret);
-}
-
-/*
  * __recovery_setup_file --
  *     Set up the recovery slot for a file, track the largest file ID, and update the base write gen
  *     based on the file's configuration.
@@ -1055,16 +1022,11 @@ done:
         WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
 
     /*
-     * Rollback to stable may have left out clearing stale transaction ids. Update the connection
-     * base write generation based on the latest checkpoint write generations to reset them.
-     */
-    if (rts_executed)
-        WT_ERR(__recovery_correct_write_gen(session));
-
-    /*
      * Update the open dhandles write generations and base write generation with the connection's
      * base write generation because the recovery checkpoint writes the pages to disk with new write
-     * generation number which contains transaction ids that are needed to reset later.
+     * generation number which contains transaction ids that are needed to reset later. The
+     * connection level base write generation number is updated at the end of the recovery
+     * checkpoint.
      */
     __wt_dhandle_update_write_gens(session);
 
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 3ec6bb95934..6004ddd3db2 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -281,50 +281,30 @@ err:
 }
 
 /*
- * __rollback_check_if_txnid_non_committed --
- *     Check if the transaction id is non committed.
+ * __rollback_txn_visible_id --
+ *     Check if the transaction id is visible or not.
  */
 static bool
-__rollback_check_if_txnid_non_committed(WT_SESSION_IMPL *session, uint64_t txnid)
+__rollback_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
 {
     WT_CONNECTION_IMPL *conn;
-    bool found;
 
     conn = S2C(session);
 
-    /* If not recovery then assume all the data as committed. */
+    /* If not in recovery, assume all the data is visible. */
     if (!F_ISSET(conn, WT_CONN_RECOVERING))
-        return (false);
+        return (true);
 
     /*
      * Only full checkpoint writes the metadata with snapshot. If the recovered checkpoint snapshot
-     * details are zero then return false i.e, updates are committed.
-     */
-    if (conn->recovery_ckpt_snap_min == 0 && conn->recovery_ckpt_snap_max == 0)
-        return (false);
-
-    /*
-     * Snapshot data:
-     *	ids < recovery_ckpt_snap_min are committed,
-     *	ids > recovery_ckpt_snap_max are non committed,
-     *	everything else is committed unless it is found in the recovery_ckpt_snapshot array.
+     * details are none then return true, i.e., updates are visible.
      */
-    if (txnid < conn->recovery_ckpt_snap_min)
-        return (false);
-    else if (txnid > conn->recovery_ckpt_snap_max)
+    if (conn->recovery_ckpt_snap_min == WT_TXN_NONE && conn->recovery_ckpt_snap_max == WT_TXN_NONE)
         return (true);
 
-    /*
-     * Return false when the recovery snapshot count is 0, which means there is no uncommitted
-     * transaction ids.
-     */
-    if (conn->recovery_ckpt_snapshot_count == 0)
-        return (false);
-
-    WT_BINARY_SEARCH(
-      txnid, conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count, found);
-
-    return (found);
+    return (
+      __wt_txn_visible_id_snapshot(id, conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max,
+        conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count));
 }
 
 /*
@@ -484,7 +464,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page
          * Stop processing when we find a stable update according to the given timestamp and
          * transaction id.
          */
-        if (!__rollback_check_if_txnid_non_committed(session, hs_tw->start_txn) &&
+        if (__rollback_txn_visible_id(session, hs_tw->start_txn) &&
           hs_durable_ts <= rollback_timestamp) {
             __wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
               "history store update valid with start timestamp: %s, durable timestamp: %s, stop "
@@ -562,7 +542,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page
          * We have a tombstone on the original update chain and it is stable according to the
          * timestamp and txnid, we need to restore that as well.
          */
-        if (!__rollback_check_if_txnid_non_committed(session, hs_tw->stop_txn) &&
+        if (__rollback_txn_visible_id(session, hs_tw->stop_txn) &&
           hs_stop_durable_ts <= rollback_timestamp) {
             /*
              * The restoring tombstone timestamp must be zero or less than previous update start
@@ -614,6 +594,9 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page
 
     /* Finally remove that update from history store. */
     if (valid_update_found) {
+        /* Avoid freeing the updates while still in use if hs_cursor->remove fails. */
+        upd = tombstone = NULL;
+
         WT_ERR(hs_cursor->remove(hs_cursor));
         WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
         WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
@@ -692,7 +675,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_COL *cip, W
         } else
             return (0);
     } else if (vpack->tw.durable_start_ts > rollback_timestamp ||
-      __rollback_check_if_txnid_non_committed(session, vpack->tw.start_txn) ||
+      !__rollback_txn_visible_id(session, vpack->tw.start_txn) ||
       (!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) {
         __wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
           "on-disk update aborted with start durable timestamp: %s, commit timestamp: %s, "
@@ -713,7 +696,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_COL *cip, W
         }
     } else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) &&
       (vpack->tw.durable_stop_ts > rollback_timestamp ||
-        __rollback_check_if_txnid_non_committed(session, vpack->tw.stop_txn) || prepared)) {
+        !__rollback_txn_visible_id(session, vpack->tw.stop_txn) || prepared)) {
         /*
          * For prepared transactions, it is possible that both the on-disk key start and stop time
          * windows can be the same. To abort these updates, check for any stable update from history
@@ -805,7 +788,7 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r
     WT_PAGE *page;
     uint64_t recno, rle;
     uint32_t i, j;
-    bool stable_update_found;
+    bool is_ondisk_stable, stable_update_found;
 
     page = ref->page;
     /*
@@ -824,26 +807,46 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r
             WT_RET(__rollback_abort_insert_list(
               session, page, ins, rollback_timestamp, &stable_update_found));
 
-        if (!stable_update_found && page->dsk != NULL) {
+        if (page->dsk != NULL) {
+            /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */
             kcell = WT_COL_PTR(page, cip);
             __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack);
             rle = __wt_cell_rle(&unpack);
-            if (unpack.type != WT_CELL_DEL) {
+
+            /*
+             * If we found a stable update on the insert list, this key needs no further attention.
+             * Any other keys in this cell with stable updates also do not require attention. But
+             * beyond that, the on-disk value must be older than
+             * the update we found. That means it too is stable(*), so any keys in the cell that
+             * _don't_ have stable updates on the update list don't need further attention either.
+             * (And any unstable updates were just handled above.) Thus we can skip iterating over
+             * the cell.
+             *
+             * Furthermore, if the cell is deleted it must be
+             * itself stable, because cells only appear as deleted if there is no older value that
+             * might need to be restored. We can skip iterating over the cell.
+             *
+             * (*) Either that, or the update is not timestamped, in which case the on-disk value
+             * might not be stable but the non-timestamp update will hide it until the next
+             * reconciliation and then overwrite it.
+             */
+            if (stable_update_found)
+                WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
+            else if (unpack.type == WT_CELL_DEL)
+                WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped);
+            else {
                 for (j = 0; j < rle; j++) {
-                    WT_RET(__rollback_abort_ondisk_kv(session, ref, cip, NULL, rollback_timestamp,
-                      recno + j, &stable_update_found));
-                    /* Skip processing all RLE if the on-disk version is stable. */
-                    if (stable_update_found) {
+                    WT_RET(__rollback_abort_ondisk_kv(
+                      session, ref, cip, NULL, rollback_timestamp, recno + j, &is_ondisk_stable));
+                    /* We can stop right away if the on-disk version is stable. */
+                    if (is_ondisk_stable) {
                         if (rle > 1)
                             WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
                         break;
                     }
                 }
-            } else
-                WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped);
+            }
             recno += rle;
-        } else {
-            recno++;
         }
     }
 
@@ -1214,17 +1217,16 @@ __rollback_to_stable_check(WT_SESSION_IMPL *session)
     bool txn_active;
 
     /*
-     * Help the user comply with the requirement that there are no concurrent operations. Protect
-     * against spurious conflicts with the sweep server: we exclude it from running concurrent with
-     * rolling back the history store contents.
+     * Help the user comply with the requirement that there are no concurrent user operations. It is
+     * okay to have a transaction in prepared state.
      */
-    ret = __wt_txn_activity_check(session, &txn_active);
+    txn_active = __wt_txn_user_active(session);
 #ifdef HAVE_DIAGNOSTIC
     if (txn_active)
         WT_TRET(__wt_verbose_dump_txn(session));
 #endif
 
-    if (ret == 0 && txn_active)
+    if (txn_active)
         WT_RET_MSG(session, EINVAL, "rollback_to_stable illegal with active transactions");
 
     return (ret);
@@ -1622,85 +1624,16 @@ err:
 static int
 __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
 {
-    WT_CACHE *cache;
     WT_CONNECTION_IMPL *conn;
     WT_DECL_RET;
     WT_TXN_GLOBAL *txn_global;
     wt_timestamp_t rollback_timestamp;
-    size_t retries;
-    uint32_t cache_flags;
     char ts_string[2][WT_TS_INT_STRING_SIZE];
 
     conn = S2C(session);
-    cache = conn->cache;
     txn_global = &conn->txn_global;
 
     /*
-     * We're about to run a check for active transactions in the system to stop users from shooting
-     * themselves in the foot. Eviction threads may interfere with this check if they involve writes
-     * to the history store so we need to wait until the system is no longer evicting content.
-     *
-     * If we detect active evictions, we should wait a millisecond and check again. If we're waiting
-     * for evictions to quiesce for more than 2 minutes, we should give up on waiting and proceed
-     * with the transaction check anyway.
-     */
-#define WT_RTS_EVICT_MAX_RETRIES (2 * WT_MINUTE * WT_THOUSAND)
-    /*
-     * These are the types of evictions that can result in a history store operation. Since we want
-     * to avoid these happening concurrently with our check, we need to look for these flags.
-     */
-#define WT_CACHE_EVICT_HS_FLAGS \
-    (WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_UPDATES | WT_CACHE_EVICT_URGENT)
-    for (retries = 0; retries < WT_RTS_EVICT_MAX_RETRIES; ++retries) {
-        /*
-         * If we're shutting down or running with an in-memory configuration, we aren't at risk of
-         * racing with history store transactions.
-         */
-        if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP | WT_CONN_IN_MEMORY))
-            break;
-
-        /* Check whether eviction has quiesced. */
-        WT_ORDERED_READ(cache_flags, cache->flags);
-        if (!FLD_ISSET(cache_flags, WT_CACHE_EVICT_HS_FLAGS)) {
-            /*
-             * If we we find that the eviction flags are unset, interrupt the eviction server and
-             * acquire the pass lock to stop the server from setting the eviction flags AFTER this
-             * point and racing with our check.
-             */
-            (void)__wt_atomic_addv32(&cache->pass_intr, 1);
-            __wt_spin_lock(session, &cache->evict_pass_lock);
-            (void)__wt_atomic_subv32(&cache->pass_intr, 1);
-            FLD_SET(session->lock_flags, WT_SESSION_LOCKED_PASS);
-
-            /*
-             * Check that the flags didn't get set in between when we checked and when we acquired
-             * the server lock. If it did get set, release the locks and keep trying. If they're
-             * still unset, break out of this loop and commence our check.
-             */
-            WT_ORDERED_READ(cache_flags, cache->flags);
-            if (!FLD_ISSET(cache_flags, WT_CACHE_EVICT_HS_FLAGS))
-                break;
-            else {
-                __wt_spin_unlock(session, &cache->evict_pass_lock);
-                FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS);
-            }
-        }
-        /* If we're retrying, pause for a millisecond and let eviction make some progress. */
-        __wt_sleep(0, WT_THOUSAND);
-    }
-    if (retries == WT_RTS_EVICT_MAX_RETRIES) {
-        WT_ERR(__wt_msg(
-          session, "timed out waiting for eviction to quiesce, running rollback to stable"));
-        /*
-         * FIXME: WT-7877 RTS fails when there are active transactions running in parallel to it.
-         * Waiting in a loop for eviction to quiesce is not efficient in some scenarios where the
-         * cache is not cleared in 2 minutes. Enable the following assert and
-         * test_rollback_to_stable22.py when the cache issue is addressed.
-         */
-        /* WT_ASSERT(session, false && "Timed out waiting for eviction to quiesce prior to rts"); */
-    }
-
-    /*
      * Rollback to stable should ignore tombstones in the history store since it needs to scan the
      * entire table sequentially.
      */
@@ -1708,11 +1641,6 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
 
     WT_ERR(__rollback_to_stable_check(session));
 
-    if (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_PASS)) {
-        __wt_spin_unlock(session, &cache->evict_pass_lock);
-        FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS);
-    }
-
     /*
      * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
      * though the stable timestamp isn't supposed to be updated while rolling back, accessing it
@@ -1746,10 +1674,6 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
         WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
 
 err:
-    if (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_PASS)) {
-        __wt_spin_unlock(session, &cache->evict_pass_lock);
-        FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS);
-    }
     F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE);
     return (ret);
 }
diff --git a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
index b7e7a9a952b..a2445225e2e 100644
--- a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
+++ b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
@@ -36,12 +36,26 @@ static int real_checkpointer(void);
 static int verify_consistency(WT_SESSION *, char *);
 
 /*
+ * set_stable --
+ *     Set the connection's stable timestamp to g.ts_stable, formatted in hexadecimal.
+ */
+static void
+set_stable(void)
+{
+    char buf[128];
+
+    testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%x", g.ts_stable));
+    testutil_check(g.conn->set_timestamp(g.conn, buf));
+}
+
+/*
  * start_checkpoints --
  *     Responsible for creating the checkpoint thread.
  */
 void
 start_checkpoints(void)
 {
+    set_stable();
     testutil_check(__wt_thread_create(NULL, &g.checkpoint_thread, checkpointer, NULL));
     if (g.use_timestamps) {
         testutil_check(__wt_rwlock_init(NULL, &g.clock_lock));
@@ -74,7 +88,6 @@ clock_thread(void *arg)
     WT_SESSION *wt_session;
     WT_SESSION_IMPL *session;
     uint64_t delay;
-    char buf[128];
 
     WT_UNUSED(arg);
 
@@ -85,8 +98,7 @@ clock_thread(void *arg)
     while (g.running) {
         __wt_writelock(session, &g.clock_lock);
         ++g.ts_stable;
-        testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%x", g.ts_stable));
-        testutil_check(g.conn->set_timestamp(g.conn, buf));
+        set_stable();
         if (g.ts_stable % 997 == 0) {
             /*
              * Random value between 6 and 10 seconds.
diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
index 90378cc0de9..1d4c99a2b03 100644
--- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
+++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
@@ -140,7 +140,8 @@ main(int argc, char *argv[])
 
     testutil_work_dir_from_path(g.home, 512, working_dir);
 
-    g.ts_stable = 0;
+    /* Start time at 1 since 0 is not a valid timestamp. */
+    g.ts_stable = 1;
 
     printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid());
     for (cnt = 1; (runs == 0 || cnt <= runs) && g.status == 0; ++cnt) {
diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am
index f5c3eaed361..f5da81e75ed 100644
--- a/src/third_party/wiredtiger/test/csuite/Makefile.am
+++ b/src/third_party/wiredtiger/test/csuite/Makefile.am
@@ -37,7 +37,7 @@ all_TESTS += timestamp_abort/smoke.sh
 
 test_truncated_log_SOURCES = truncated_log/main.c
 noinst_PROGRAMS += test_truncated_log
-all_TESTS += test_truncated_log
+all_TESTS += truncated_log/smoke.sh
 
 test_wt1965_col_efficiency_SOURCES = wt1965_col_efficiency/main.c
 noinst_PROGRAMS += test_wt1965_col_efficiency
@@ -49,8 +49,7 @@ all_TESTS += test_wt2403_lsm_workload
 
 test_wt2246_col_append_SOURCES = wt2246_col_append/main.c
 noinst_PROGRAMS += test_wt2246_col_append
-# Temporarily disabled (WT-5790)
-# all_TESTS += test_wt2246_col_append
+all_TESTS += test_wt2246_col_append
 
 test_wt2323_join_visibility_SOURCES = wt2323_join_visibility/main.c
 noinst_PROGRAMS += test_wt2323_join_visibility
@@ -146,11 +145,11 @@ all_TESTS += test_wt4891_meta_ckptlist_get_alloc
 
 test_wt6185_modify_ts_SOURCES = wt6185_modify_ts/main.c
 noinst_PROGRAMS += test_wt6185_modify_ts
-all_TESTS += test_wt6185_modify_ts
+all_TESTS += wt6185_modify_ts/smoke.sh
 
 test_wt6616_checkpoint_oldest_ts_SOURCES = wt6616_checkpoint_oldest_ts/main.c
 noinst_PROGRAMS += test_wt6616_checkpoint_oldest_ts
-all_TESTS += test_wt6616_checkpoint_oldest_ts
+all_TESTS += wt6616_checkpoint_oldest_ts/smoke.sh
 
 # Run this during a "make check" smoke test.
 TESTS = $(all_TESTS)
diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
index e16c7d7e79e..acf14348c0a 100644
--- a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
@@ -75,7 +75,7 @@ static const char *const uri_collection = "table:collection";
 
 static const char *const ckpt_file = "checkpoint_done";
 
-static bool compat, inmem, stable_set, use_ts, use_txn;
+static bool compat, inmem, stable_set, use_columns, use_ts, use_txn;
 static volatile uint64_t global_ts = 1;
 static volatile uint64_t uid = 1;
 typedef struct {
@@ -96,9 +96,10 @@ static volatile THREAD_TS th_ts[MAX_TH];
 
 /*
  * A minimum width of 10, along with zero filling, means that all the keys sort according to their
- * integer value, making each thread's key space distinct.
+ * integer value, making each thread's key space distinct. For column-store we just use the integer
+ * values and that has the same effect.
  */
-#define KEY_FORMAT ("%010" PRIu64)
+#define ROW_KEY_FORMAT ("%010" PRIu64)
 
 typedef struct {
     uint64_t absent_key; /* Last absent key */
@@ -670,14 +671,20 @@ thread_run(void *arg)
             }
         if (use_ts)
             stable_ts = __wt_atomic_addv64(&global_ts, 1);
-        testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, i));
 
         testutil_check(session->begin_transaction(session, NULL));
         if (use_prep)
             testutil_check(oplog_session->begin_transaction(oplog_session, NULL));
-        cur_coll->set_key(cur_coll, kname);
-        cur_local->set_key(cur_local, kname);
-        cur_oplog->set_key(cur_oplog, kname);
+        if (use_columns) {
+            cur_coll->set_key(cur_coll, i + 1);
+            cur_local->set_key(cur_local, i + 1);
+            cur_oplog->set_key(cur_oplog, i + 1);
+        } else {
+            testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, i));
+            cur_coll->set_key(cur_coll, kname);
+            cur_local->set_key(cur_local, kname);
+            cur_oplog->set_key(cur_oplog, kname);
+        }
         /*
          * Put an informative string into the value so that it can be viewed well in a binary dump.
          */
@@ -764,7 +771,7 @@ run_workload(uint32_t nth)
     THREAD_DATA *td;
     wt_thread_t *thr;
     uint32_t ckpt_id, i, ts_id;
-    char envconf[512];
+    char envconf[512], tableconf[128];
 
     thr = dcalloc(nth + 2, sizeof(*thr));
     td = dcalloc(nth + 2, sizeof(THREAD_DATA));
@@ -783,10 +790,13 @@ run_workload(uint32_t nth)
     /*
      * Create all the tables.
      */
-    testutil_check(
-      session->create(session, uri_collection, "key_format=S,value_format=u,log=(enabled=false)"));
-    testutil_check(session->create(session, uri_local, "key_format=S,value_format=u"));
-    testutil_check(session->create(session, uri_oplog, "key_format=S,value_format=u"));
+    testutil_check(__wt_snprintf(tableconf, sizeof(tableconf),
+      "key_format=%s,value_format=u,log=(enabled=false)", use_columns ? "r" : "S"));
+    testutil_check(session->create(session, uri_collection, tableconf));
+    testutil_check(__wt_snprintf(
+      tableconf, sizeof(tableconf), "key_format=%s,value_format=u", use_columns ? "r" : "S"));
+    testutil_check(session->create(session, uri_local, tableconf));
+    testutil_check(session->create(session, uri_oplog, tableconf));
     /*
      * Don't log the stable timestamp table so that we know what timestamp was stored at the
      * checkpoint.
@@ -909,11 +919,15 @@ main(int argc, char *argv[])
     verify_only = false;
     working_dir = "WT_TEST.schema-abort";
 
-    while ((ch = __wt_getopt(progname, argc, argv, "Ch:mT:t:vxz")) != EOF)
+    while ((ch = __wt_getopt(progname, argc, argv, "Cch:mT:t:vxz")) != EOF)
         switch (ch) {
         case 'C':
             compat = true;
             break;
+        case 'c':
+            /* Variable-length columns only; fixed would require considerable changes */
+            use_columns = true;
+            break;
         case 'h':
             working_dir = __wt_optarg;
             break;
@@ -1087,10 +1101,16 @@ main(int argc, char *argv[])
                   key, last_key);
                 break;
             }
-            testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, key));
-            cur_coll->set_key(cur_coll, kname);
-            cur_local->set_key(cur_local, kname);
-            cur_oplog->set_key(cur_oplog, kname);
+            if (use_columns) {
+                cur_coll->set_key(cur_coll, key + 1);
+                cur_local->set_key(cur_local, key + 1);
+                cur_oplog->set_key(cur_oplog, key + 1);
+            } else {
+                testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, key));
+                cur_coll->set_key(cur_coll, kname);
+                cur_local->set_key(cur_local, kname);
+                cur_oplog->set_key(cur_oplog, kname);
+            }
             /*
              * The collection table should always only have the data as of the checkpoint.
              */
diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh
index 5e82ae180bc..e7d21ec30e6 100755
--- a/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh
+++ b/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh
@@ -21,6 +21,14 @@ $TEST_WRAPPER $test_bin -t 10 -T 5
 $TEST_WRAPPER $test_bin -m -t 10 -T 5
 $TEST_WRAPPER $test_bin -C -t 10 -T 5
 $TEST_WRAPPER $test_bin -C -m -t 10 -T 5
+
+$TEST_WRAPPER $test_bin -c -t 10 -T 5
+$TEST_WRAPPER $test_bin -c -m -t 10 -T 5
+$TEST_WRAPPER $test_bin -c -C -t 10 -T 5
+$TEST_WRAPPER $test_bin -c -C -m -t 10 -T 5
+
 # FIXME: In WT-6116 the test is failing if timestamps are turned off.
 #$TEST_WRAPPER $test_bin -m -t 10 -T 5 -z
+#$TEST_WRAPPER $test_bin -c -m -t 10 -T 5 -z
 $TEST_WRAPPER $test_bin -m -t 10 -T 5 -x
+$TEST_WRAPPER $test_bin -c -m -t 10 -T 5 -x
diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
index 6fa41f0d82c..1d59222104b 100644
--- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
@@ -79,7 +79,7 @@ static const char *const uri_shadow = "shadow";
 
 static const char *const ckpt_file = "checkpoint_done";
 
-static bool compat, inmem, stress, use_ts;
+static bool columns, compat, inmem, stress, use_ts;
 static volatile uint64_t global_ts = 1;
 
 /*
@@ -107,9 +107,10 @@ static volatile uint64_t global_ts = 1;
 
 /*
  * A minimum width of 10, along with zero filling, means that all the keys sort according to their
- * integer value, making each thread's key space distinct.
+ * integer value, making each thread's key space distinct. For column-store we just use the integer
+ * values and that has the same effect.
  */
-#define KEY_FORMAT ("%010" PRIu64)
+#define KEY_STRINGFORMAT ("%010" PRIu64)
 
 typedef struct {
     uint64_t absent_key; /* Last absent key */
@@ -334,8 +335,6 @@ thread_run(void *arg)
     printf("Thread %" PRIu32 " starts at %" PRIu64 "\n", td->info, td->start);
     active_ts = 0;
     for (i = td->start;; ++i) {
-        testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, i));
-
         testutil_check(session->begin_transaction(session, NULL));
         if (use_prep)
             testutil_check(prepared_session->begin_transaction(prepared_session, NULL));
@@ -354,10 +353,18 @@ thread_run(void *arg)
             testutil_check(pthread_rwlock_unlock(&ts_lock));
         }
 
-        cur_coll->set_key(cur_coll, kname);
-        cur_local->set_key(cur_local, kname);
-        cur_oplog->set_key(cur_oplog, kname);
-        cur_shadow->set_key(cur_shadow, kname);
+        if (columns) {
+            cur_coll->set_key(cur_coll, i + 1);
+            cur_local->set_key(cur_local, i + 1);
+            cur_oplog->set_key(cur_oplog, i + 1);
+            cur_shadow->set_key(cur_shadow, i + 1);
+        } else {
+            testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_STRINGFORMAT, i));
+            cur_coll->set_key(cur_coll, kname);
+            cur_local->set_key(cur_local, kname);
+            cur_oplog->set_key(cur_oplog, kname);
+            cur_shadow->set_key(cur_shadow, kname);
+        }
         /*
          * Put an informative string into the value so that it can be viewed well in a binary dump.
          */
@@ -459,6 +466,7 @@ run_workload(uint32_t nth)
     wt_thread_t *thr;
     uint32_t cache_mb, ckpt_id, i, ts_id;
     char envconf[512], uri[128];
+    const char *table_config, *table_config_nolog;
 
     thr = dcalloc(nth + 2, sizeof(*thr));
     td = dcalloc(nth + 2, sizeof(THREAD_DATA));
@@ -495,19 +503,25 @@ run_workload(uint32_t nth)
     printf("wiredtiger_open configuration: %s\n", envconf);
     testutil_check(wiredtiger_open(NULL, NULL, envconf, &conn));
     testutil_check(conn->open_session(conn, NULL, NULL, &session));
+
     /*
      * Create all the tables.
      */
+    if (columns) {
+        table_config_nolog = "key_format=r,value_format=u,log=(enabled=false)";
+        table_config = "key_format=r,value_format=u";
+    } else {
+        table_config_nolog = "key_format=S,value_format=u,log=(enabled=false)";
+        table_config = "key_format=S,value_format=u";
+    }
     testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_collection));
-    testutil_check(
-      session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)"));
+    testutil_check(session->create(session, uri, table_config_nolog));
     testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow));
-    testutil_check(
-      session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)"));
+    testutil_check(session->create(session, uri, table_config_nolog));
     testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_local));
-    testutil_check(session->create(session, uri, "key_format=S,value_format=u"));
+    testutil_check(session->create(session, uri, table_config));
     testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_oplog));
-    testutil_check(session->create(session, uri, "key_format=S,value_format=u"));
+    testutil_check(session->create(session, uri, table_config));
     /*
      * Don't log the stable timestamp table so that we know what timestamp was stored at the
      * checkpoint.
@@ -616,7 +630,7 @@ main(int argc, char *argv[])
 
     (void)testutil_set_progname(argv);
 
-    compat = inmem = stress = false;
+    columns = compat = inmem = stress = false;
     use_ts = true;
     nth = MIN_TH;
     rand_th = rand_time = true;
@@ -624,11 +638,15 @@ main(int argc, char *argv[])
     verify_only = false;
     working_dir = "WT_TEST.timestamp-abort";
 
-    while ((ch = __wt_getopt(progname, argc, argv, "Ch:LmsT:t:vz")) != EOF)
+    while ((ch = __wt_getopt(progname, argc, argv, "Cch:LmsT:t:vz")) != EOF)
         switch (ch) {
         case 'C':
             compat = true;
             break;
+        case 'c':
+            /* Variable-length columns only (for now) */
+            columns = true;
+            break;
         case 'h':
             working_dir = __wt_optarg;
             break;
@@ -699,9 +717,9 @@ main(int argc, char *argv[])
           compat ? "true" : "false", inmem ? "true" : "false", stress ? "true" : "false",
           use_ts ? "true" : "false");
         printf("Parent: Create %" PRIu32 " threads; sleep %" PRIu32 " seconds\n", nth, timeout);
-        printf("CONFIG: %s%s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname,
-          compat ? " -C" : "", inmem ? " -m" : "", stress ? " -s" : "", !use_ts ? " -z" : "",
-          working_dir, nth, timeout);
+        printf("CONFIG: %s%s%s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname,
+          compat ? " -C" : "", columns ? " -c" : "", inmem ? " -m" : "", stress ? " -s" : "",
+          !use_ts ? " -z" : "", working_dir, nth, timeout);
         /*
          * Fork a child to insert as many items. We will then randomly kill the child, run recovery
          * and make sure all items we wrote exist after recovery runs.
@@ -823,11 +841,20 @@ main(int argc, char *argv[])
                   key, last_key);
                 break;
             }
-            testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, key));
-            cur_coll->set_key(cur_coll, kname);
-            cur_local->set_key(cur_local, kname);
-            cur_oplog->set_key(cur_oplog, kname);
-            cur_shadow->set_key(cur_shadow, kname);
+
+            if (columns) {
+                cur_coll->set_key(cur_coll, key + 1);
+                cur_local->set_key(cur_local, key + 1);
+                cur_oplog->set_key(cur_oplog, key + 1);
+                cur_shadow->set_key(cur_shadow, key + 1);
+            } else {
+                testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_STRINGFORMAT, key));
+                cur_coll->set_key(cur_coll, kname);
+                cur_local->set_key(cur_local, kname);
+                cur_oplog->set_key(cur_oplog, kname);
+                cur_shadow->set_key(cur_shadow, kname);
+            }
+
             /*
              * The collection table should always only have the data as of the checkpoint. The
              * shadow table should always have the exact same data (or not) as the collection table,
diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
index 18d7f9b8dae..b2c70340f4c 100755
--- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
+++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
@@ -23,8 +23,12 @@ then
 fi
 
 $TEST_WRAPPER $test_bin $default_test_args
+$TEST_WRAPPER $test_bin $default_test_args -c
 #$TEST_WRAPPER $test_bin $default_test_args -L
 $TEST_WRAPPER $test_bin -m $default_test_args
+$TEST_WRAPPER $test_bin -m $default_test_args -c
 #$TEST_WRAPPER $test_bin -m $default_test_args -L
 $TEST_WRAPPER $test_bin -C $default_test_args
+$TEST_WRAPPER $test_bin -C $default_test_args -c
 $TEST_WRAPPER $test_bin -C -m $default_test_args
+$TEST_WRAPPER $test_bin -C -m $default_test_args -c
diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
index d46b75d48c9..fa45e573781 100644
--- a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
+++ b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
@@ -32,6 +32,7 @@
 
 static char home[1024]; /* Program working dir */
 static const char *const uri = "table:main";
+static bool use_columns = false;
 
 #define RECORDS_FILE "records"
 
@@ -128,8 +129,14 @@ fill_db(void)
     WT_SESSION *session;
     uint32_t i, max_key, min_key, units, unused;
     char k[K_SIZE], v[V_SIZE];
+    const char *table_config;
     bool first;
 
+    if (use_columns)
+        table_config = "key_format=r,value_format=S";
+    else
+        table_config = "key_format=S,value_format=S";
+
     /*
      * Run in the home directory so that the records file is in there too.
      */
@@ -137,7 +144,7 @@ fill_db(void)
         testutil_die(errno, "chdir: %s", home);
     testutil_check(wiredtiger_open(NULL, NULL, ENV_CONFIG, &conn));
     testutil_check(conn->open_session(conn, NULL, NULL, &session));
-    testutil_check(session->create(session, uri, "key_format=S,value_format=S"));
+    testutil_check(session->create(session, uri, table_config));
     testutil_check(session->open_cursor(session, uri, NULL, NULL, &cursor));
 
     /*
@@ -164,10 +171,14 @@ fill_db(void)
     max_key = min_key * 2;
     first = true;
     for (i = 0; i < max_key; ++i) {
-        testutil_check(__wt_snprintf(k, sizeof(k), "key%03d", (int)i));
+        if (use_columns)
+            cursor->set_key(cursor, i + 1);
+        else {
+            testutil_check(__wt_snprintf(k, sizeof(k), "key%03" PRIu32, i));
+            cursor->set_key(cursor, k);
+        }
         testutil_check(
-          __wt_snprintf(v, sizeof(v), "value%0*d", (int)(V_SIZE - (strlen("value") + 1)), (int)i));
-        cursor->set_key(cursor, k);
+          __wt_snprintf(v, sizeof(v), "value%0*" PRIu32, (int)(V_SIZE - (strlen("value") + 1)), i));
         cursor->set_value(cursor, v);
         testutil_check(cursor->insert(cursor));
 
@@ -230,8 +241,12 @@ main(int argc, char *argv[])
     (void)testutil_set_progname(argv);
 
     working_dir = "WT_TEST.truncated-log";
-    while ((ch = __wt_getopt(progname, argc, argv, "h:")) != EOF)
+    while ((ch = __wt_getopt(progname, argc, argv, "ch:")) != EOF)
         switch (ch) {
+        case 'c':
+            /* Variable-length columns only (for now) */
+            use_columns = true;
+            break;
         case 'h':
             working_dir = __wt_optarg;
             break;
diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh b/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh
new file mode 100755
index 00000000000..0079adf0340
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh
@@ -0,0 +1,20 @@
+#! /bin/sh
+
+set -e
+
+# Smoke-test truncated_log for "make check": once with string keys, once with -c (column-store).
+
+if [ -n "$1" ]
+then
+    # If the test binary is passed in manually.
+    test_bin=$1
+else
+    # If $top_builddir/$top_srcdir aren't set, default to building in build_posix
+    # and running in test/csuite.
+    top_builddir=${top_builddir:-../../build_posix}
+    top_srcdir=${top_srcdir:-../..}
+    test_bin=$top_builddir/test/csuite/test_truncated_log
+fi
+
+$TEST_WRAPPER $test_bin
+$TEST_WRAPPER $test_bin -c
diff --git a/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c b/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c
index 6b54402fe66..307f47578e3 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c
@@ -89,8 +89,8 @@ int
 main(int argc, char *argv[])
 {
     WT_SESSION *session;
+    wt_thread_t idlist[100];
     clock_t ce, cs;
-    pthread_t idlist[100];
     uint64_t i, id;
     char buf[100];
 
@@ -125,15 +125,16 @@ main(int argc, char *argv[])
 
     (void)signal(SIGINT, onsig);
 
+    memset(idlist, 0, sizeof(idlist));
     cs = clock();
     id = 0;
     for (i = 0; i < opts->n_append_threads; ++i, ++id) {
         printf("append: %" PRIu64 "\n", id);
-        testutil_check(pthread_create(&idlist[id], NULL, thread_append, opts));
+        testutil_check(__wt_thread_create(NULL, &idlist[id], thread_append, opts));
     }
 
     for (i = 0; i < id; ++i)
-        testutil_check(pthread_join(idlist[i], NULL));
+        testutil_check(__wt_thread_join(NULL, &idlist[i]));
 
     ce = clock();
     printf("%" PRIu64 "M records: %.2lf processor seconds\n", opts->max_inserted_id / MILLION,
diff --git a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c
index 1f2e824047b..02205c88429 100644
--- a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c
@@ -50,7 +50,10 @@ static u_int tnext;
 
 static uint64_t ts; /* Current timestamp. */
 
-static char key[100], modify_repl[256], tmp[4 * 1024];
+static char keystr[100], modify_repl[256], tmp[4 * 1024];
+static uint64_t keyrecno;
+
+static bool use_columns = false;
 
 /*
  * trace --
@@ -117,6 +120,32 @@ mmrand(u_int min, u_int max)
 }
 
 /*
+ * change_key --
+ *     Select key n: keyrecno = n + 1 when use_columns is set, else keystr = "%010u.key" of n.
+ */
+static void
+change_key(u_int n)
+{
+    if (use_columns)
+        keyrecno = n + 1;
+    else
+        testutil_check(__wt_snprintf(keystr, sizeof(keystr), "%010u.key", n));
+}
+
+/*
+ * set_key --
+ *     Set the cursor's key from keyrecno when use_columns is set, otherwise from keystr.
+ */
+static void
+set_key(WT_CURSOR *c)
+{
+    if (use_columns)
+        c->set_key(c, keyrecno);
+    else
+        c->set_key(c, keystr);
+}
+
+/*
  * modify_repl_init --
  *     Initialize the replacement information.
  */
@@ -181,13 +210,13 @@ modify(WT_SESSION *session, WT_CURSOR *c)
     for (cnt = loop = 1; loop < 5; ++cnt, ++loop)
         if (mmrand(1, 10) <= 8) {
             modify_build(entries, &nentries, cnt);
-            c->set_key(c, key);
+            set_key(c);
             testutil_check(c->modify(c, entries, nentries));
         }
 
     /* Commit 90% of the time, else rollback. */
     if (mmrand(1, 10) != 1) {
-        c->set_key(c, key);
+        set_key(c);
         testutil_check(c->search(c));
         testutil_check(c->get_value(c, &v));
         free(list[lnext].v);
@@ -223,7 +252,7 @@ repeat(WT_SESSION *session, WT_CURSOR *c)
         testutil_check(__wt_snprintf(tmp, sizeof(tmp), "read_timestamp=%" PRIx64, list[i].ts));
         testutil_check(session->timestamp_transaction(session, tmp));
 
-        c->set_key(c, key);
+        set_key(c);
         testutil_check(c->search(c));
         testutil_check(c->get_value(c, &v));
 
@@ -246,7 +275,7 @@ evict(WT_CURSOR *c)
 {
     trace("%s", "eviction");
 
-    c->set_key(c, key);
+    set_key(c);
     testutil_check(c->search(c));
     F_SET(c, WT_CURSTD_DEBUG_RESET_EVICT);
     testutil_check(c->reset(c));
@@ -286,7 +315,7 @@ main(int argc, char *argv[])
     WT_SESSION *session;
     u_int i, j;
     int ch;
-    char path[1024], value[VALUE_SIZE];
+    char path[1024], table_config[128], value[VALUE_SIZE];
     const char *home, *v;
     bool no_checkpoint, no_eviction;
 
@@ -298,8 +327,12 @@ main(int argc, char *argv[])
 
     no_checkpoint = no_eviction = false;
     home = "WT_TEST.wt6185_modify_ts";
-    while ((ch = __wt_getopt(progname, argc, argv, "ceh:S:")) != EOF)
+    while ((ch = __wt_getopt(progname, argc, argv, "Cceh:S:")) != EOF)
         switch (ch) {
+        case 'C':
+            /* Variable-length columns only (for now anyway) */
+            use_columns = true;
+            break;
         case 'c':
             no_checkpoint = true;
             break;
@@ -322,14 +355,17 @@ main(int argc, char *argv[])
     testutil_work_dir_from_path(path, sizeof(path), home);
     testutil_make_work_dir(path);
 
+    testutil_check(__wt_snprintf(
+      table_config, sizeof(table_config), "key_format=%s,value_format=S", use_columns ? "r" : "S"));
+
     /* Load 100 records. */
     testutil_check(wiredtiger_open(path, NULL, "create", &conn));
     testutil_check(conn->open_session(conn, NULL, NULL, &session));
-    testutil_check(session->create(session, "file:xxx", "key_format=S,value_format=S"));
+    testutil_check(session->create(session, "file:xxx", table_config));
     testutil_check(session->open_cursor(session, "file:xxx", NULL, NULL, &c));
     for (i = 0; i <= 100; ++i) {
-        testutil_check(__wt_snprintf(key, sizeof(key), "%010u.key", i));
-        c->set_key(c, key);
+        change_key(i);
+        set_key(c);
         SET_VALUE(i, value);
         c->set_value(c, value);
         testutil_check(c->insert(c));
@@ -341,8 +377,8 @@ main(int argc, char *argv[])
     testutil_check(conn->open_session(conn, NULL, NULL, &session));
     testutil_check(session->create(session, "file:xxx", NULL));
     testutil_check(session->open_cursor(session, "file:xxx", NULL, NULL, &c));
-    testutil_check(__wt_snprintf(key, sizeof(key), "%010d.key", KEYNO));
-    c->set_key(c, key);
+    change_key(KEYNO);
+    set_key(c);
     testutil_check(c->search(c));
     testutil_check(c->get_value(c, &v));
     SET_VALUE(KEYNO, value);
diff --git a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh
new file mode 100755
index 00000000000..b317eeeb2ed
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh
@@ -0,0 +1,21 @@
+#! /bin/sh
+
+set -e
+
+# Smoke-test wt6185_modify_ts as part of running "make check".
+
+if [ -n "$1" ]
+then
+    # If the test binary is passed in manually.
+    test_bin=$1
+else
+    # If $top_builddir/$top_srcdir aren't set, default to building in build_posix
+    # and running in test/csuite.
+    top_builddir=${top_builddir:-../../build_posix}
+    top_srcdir=${top_srcdir:-../..}
+    test_bin=$top_builddir/test/csuite/test_wt6185_modify_ts
+fi
+
+$TEST_WRAPPER $test_bin 
+$TEST_WRAPPER $test_bin -C
+
diff --git a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c
index 5a75777aa78..2e9648efea0 100644
--- a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c
@@ -32,6 +32,7 @@
 #include <signal.h>
 
 static char home[1024]; /* Program working dir */
+static bool use_columns = false;
 
 /*
  * Spin up a child process to do operations and checkpoint. For each set of operations on a key,
@@ -48,7 +49,7 @@ static char home[1024]; /* Program working dir */
  * recovery by reading without a timestamp. Whether it is possible to read historical versions based
  * on timestamps from a logged table after recovery is not defined and implemented yet.
  */
-#define KEY_FORMAT ("%010" PRIu64)
+#define ROW_KEY_FORMAT ("%010" PRIu64)
 
 #define MAX_CKPT_INVL 5 /* Maximum interval between checkpoints */
 #define MAX_DATA 1000
@@ -147,11 +148,14 @@ thread_run(void *arg)
     /* Insert and then delete the keys until we're killed. */
     printf("Worker thread started.\n");
     for (oldest_ts = 0, ts = 1;; ++ts) {
-        testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, ts));
+        testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, ts));
 
         /* Insert the same value for key and value. */
         testutil_check(session->begin_transaction(session, NULL));
-        cursor->set_key(cursor, kname);
+        if (use_columns)
+            cursor->set_key(cursor, ts);
+        else
+            cursor->set_key(cursor, kname);
         data.data = kname;
         data.size = sizeof(kname);
         cursor->set_value(cursor, &data);
@@ -193,7 +197,7 @@ run_workload(void)
     WT_SESSION *session;
     wt_thread_t *thr;
     uint32_t i;
-    char envconf[512];
+    char envconf[512], tableconf[512];
 
     thr = dcalloc(2, sizeof(*thr));
 
@@ -206,8 +210,9 @@ run_workload(void)
     testutil_check(conn->open_session(conn, NULL, NULL, &session));
 
     /* Create the table. */
-    testutil_check(
-      session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)"));
+    testutil_check(__wt_snprintf(tableconf, sizeof(tableconf),
+      "key_format=%s,value_format=u,log=(enabled=false)", use_columns ? "r" : "S"));
+    testutil_check(session->create(session, uri, tableconf));
     testutil_check(session->close(session, NULL));
 
     /* The checkpoint thread is added at the end. */
@@ -268,8 +273,12 @@ main(int argc, char *argv[])
     timeout = MIN_TIME;
     working_dir = "WT_TEST.wt6616-checkpoint-oldest-ts";
 
-    while ((ch = __wt_getopt(progname, argc, argv, "h:t:")) != EOF)
+    while ((ch = __wt_getopt(progname, argc, argv, "ch:t:")) != EOF)
         switch (ch) {
+        case 'c':
+            /* Variable-length columns only (for now) */
+            use_columns = true;
+            break;
         case 'h':
             working_dir = __wt_optarg;
             break;
@@ -363,8 +372,11 @@ main(int argc, char *argv[])
     for (ts = oldest_ts; ts <= stable_ts; ++ts) {
         testutil_check(__wt_snprintf(tscfg, sizeof(tscfg), "read_timestamp=%" PRIx64, ts));
         testutil_check(session->begin_transaction(session, tscfg));
-        testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, ts));
-        cursor->set_key(cursor, kname);
+        testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, ts));
+        if (use_columns)
+            cursor->set_key(cursor, ts);
+        else
+            cursor->set_key(cursor, kname);
         ret = cursor->search(cursor);
         if (ret == WT_NOTFOUND) {
             fatal = true;
diff --git a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh
new file mode 100755
index 00000000000..9b9cc997026
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh
@@ -0,0 +1,21 @@
+#! /bin/sh
+
+set -e
+
+# Smoke-test wt6616_checkpoint_oldest_ts as part of running "make check".
+
+if [ -n "$1" ]
+then
+    # If the test binary is passed in manually.
+    test_bin=$1
+else
+    # If $top_builddir/$top_srcdir aren't set, default to building in build_posix
+    # and running in test/csuite.
+    top_builddir=${top_builddir:-../../build_posix}
+    top_srcdir=${top_srcdir:-../..}
+    test_bin=$top_builddir/test/csuite/test_wt6616_checkpoint_oldest_ts
+fi
+
+$TEST_WRAPPER $test_bin 
+$TEST_WRAPPER $test_bin -c
+
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index 5bc5fa6580b..41362d91097 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -2580,6 +2580,17 @@ tasks:
     name: recovery-stress-test-3
     tags: ["stress-test-3", "stress-test-zseries-3"]
 
+  - name: format-abort-recovery-stress-test
+    commands:
+      - command: timeout.update
+        params:
+          exec_timeout_secs: 2500
+      - func: "get project"
+      - func: "compile wiredtiger with builtins"
+      - func: "format test script"
+        vars:
+          format_test_script_args: -a -t 30
+
   - name: many-dhandle-stress-test
     commands:
       - func: "get project"
@@ -2830,6 +2841,7 @@ buildvariants:
     - name: ".stress-test-2"
     - name: ".stress-test-3"
     - name: ".stress-test-4"
+    - name: format-abort-recovery-stress-test
 
 - name: large-scale-tests
   display_name: "Large scale tests"
@@ -2856,13 +2868,11 @@ buildvariants:
   run_on:
   - ubuntu1804-test
   expansions:
-    test_env_vars: LD_LIBRARY_PATH=$(pwd)/../../.libs
+    test_env_vars: LD_LIBRARY_PATH=$(pwd)/../../.libs PATH=/opt/mongodbtoolchain/v3/bin:$PATH
     make_command: PATH=/opt/mongodbtoolchain/v3/bin:$PATH make
     posix_configure_flags:
       --enable-silent-rules --enable-python --enable-zlib --enable-snappy
       --enable-strict --enable-static
-    test_env_vars:
-      PATH=/opt/mongodbtoolchain/v3/bin:$PATH
   tasks:
     - name: compile
     - name: cppsuite-hs-cleanup-stress
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 88964902d98..33b8d7a9112 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -715,6 +715,8 @@ config_in_memory(void)
         return;
     if (config_is_perm("checkpoint"))
         return;
+    if (config_is_perm("format.abort"))
+        return;
     if (config_is_perm("import"))
         return;
     if (config_is_perm("logging"))
diff --git a/src/third_party/wiredtiger/test/suite/test_hs18.py b/src/third_party/wiredtiger/test/suite/test_hs18.py
index bcef53e4d17..5ed21e3c90a 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs18.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs18.py
@@ -438,8 +438,6 @@ class test_hs18(wttest.WiredTigerTestCase):
         session_ts_reader = self.setUpSessionOpen(self.conn)
         cursor_ts_reader = session_ts_reader.open_cursor(uri)
 
-        self.skipTest('Skip this part of test_hs18 until WT-7931 is resolved')
-
         # The ID of the session corresponds the value it should see.
         sessions = []
         cursors = []
@@ -448,8 +446,6 @@ class test_hs18(wttest.WiredTigerTestCase):
             sessions.append(self.setUpSessionOpen(self.conn))
             cursors.append(sessions[i].open_cursor(uri))
 
-        value_junk = 'aaaaa' * 100
-
         values.append('f' * 10)
         values.append('a' + values[0])
         values.append('b' + values[1])
@@ -485,11 +481,13 @@ class test_hs18(wttest.WiredTigerTestCase):
         # Start a long running transaction which could see modify 1.
         self.start_txn(sessions, cursors, values, 2)
 
-        # Insert a bunch of contents to fill the cache
-        for i in range(2000, 10000):
-            self.session.begin_transaction()
-            cursor[self.create_key(i)] = value_junk
-            self.session.commit_transaction()
+        # Evict the update using a debug cursor
+        cursor.reset()
+        evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+        evict_cursor.set_key(self.create_key(1))
+        self.assertEqual(evict_cursor.search(), 0)
+        evict_cursor.reset()
+        evict_cursor.close()
 
         # Commit a modify without a timestamp on our original key
         self.session.begin_transaction()
@@ -511,11 +509,13 @@ class test_hs18(wttest.WiredTigerTestCase):
         for i in range(0, 5):
             self.check_value(cursors[i], values[i])
 
-        # Insert a bunch of other contents to trigger eviction
-        for i in range(10001, 11000):
-            self.session.begin_transaction()
-            cursor[self.create_key(i)] = value_junk
-            self.session.commit_transaction()
+        # Evict the update using a debug cursor
+        cursor.reset()
+        evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+        evict_cursor.set_key(self.create_key(1))
+        self.assertEqual(evict_cursor.search(), 0)
+        evict_cursor.reset()
+        evict_cursor.close()
 
         # Check our values are still correct.
         for i in range(0, 5):
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py
index f6ef52cc388..7c85800b070 100644
--- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py
@@ -43,8 +43,6 @@ class test_rollback_to_stable22(test_rollback_to_stable_base):
         nrows = 1000
         nds = 10
 
-        self.skipTest('Skip it until the fix is provided to handle concurrent internal transactions running in parallel.')
-
         # Create a few tables and populate them with some initial data.
         #
         # Our way of preventing history store operations from interfering with rollback to stable's
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py
new file mode 100644
index 00000000000..ea690506dc9
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+# test_rollback_to_stable24.py
+# Exercise a recno-counting bug in column store.
+#
+# Prior to August 2021 a cell for which there's a pending stable update was counted (in the
+# column-store RTS code) as having RLE count 1 regardless of what the actual count was.
+#
+# In order to exploit this we have to do janky things with timestamps, but I think they're
+# allowable.
+#
+# Construct a cell with RLE count of 3 by writing 3 copies of aaaaaa at timestamp 10.
+# Then at the next key write bbbbbb at timestamp 10 and cccccc at timestamp 50.
+# Evict the page to reconcile it and produce the RLE cell.
+#
+# Then post an update to the first key of the RLE cell at timestamp 30 (to dddddd), and roll
+# back to 40.
+#
+# Reading at 40, we should at that point see dddddd and two aaaaaa's followed by bbbbbb, but
+# with the bad counting we get a key error on the second key.
+#
+# This happens because it goes to process key 4 but thinks it's on key 2; it finds that it
+# needs to roll back the value it's looking at (the cccccc from timestamp 50) but because it
+# thinks it's on key 2 it asks the history store for key 2 and finds nothing. (The bbbbbb
+# from timestamp 10 is in the history store, but under key 4; there's nothing in the history
+# store for key 2.) So it issues a tombstone, and issues it for key 2, so key 2 improperly
+# disappears.
+#
+# Run this test on rows as well as columns to help make sure the test itself is valid (and
+# stays so over time...)
+class test_rollback_to_stable24(wttest.WiredTigerTestCase):
+    session_config = 'isolation=snapshot'
+    conn_config = 'in_memory=false'
+
+    key_format_values = [
+        ('column', dict(key_format='r')),
+        ('integer_row', dict(key_format='i')),
+    ]
+
+    scenarios = make_scenarios(key_format_values)
+
+    def test_rollback_to_stable24(self):
+        # Create a table without logging, keyed per the current scenario's key_format.
+        uri = "table:rollback_to_stable24"
+        format = 'key_format={},value_format=S'.format(self.key_format)
+        self.session.create(uri, format + ', log=(enabled=false)')
+
+        # Pin oldest timestamp to 10.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10))
+
+        # Start stable timestamp at 10.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+
+        value_a = "aaaaa" * 100
+        value_b = "bbbbb" * 100
+        value_c = "ccccc" * 100
+        value_d = "ddddd" * 100
+
+        s = self.conn.open_session()
+        cursor = s.open_cursor(uri)
+
+        # Write keys 1-3 (three copies of value_a) and key 4 (value_b) at time 10.
+        s.begin_transaction()
+        cursor[1] = value_a
+        cursor[2] = value_a
+        cursor[3] = value_a
+        cursor[4] = value_b
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+
+        # Update key 4 at time 50.
+        s.begin_transaction()
+        cursor[4] = value_c
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(50))
+
+        cursor.close()
+
+        # Evict the page to force reconciliation.
+        evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+        s.begin_transaction()
+        # Search the key to evict it. NOTE(review): txn is on s but cursor is on self.session -- confirm.
+        v = evict_cursor[1]
+        self.assertEqual(v, value_a)
+        self.assertEqual(evict_cursor.reset(), 0)
+        s.rollback_transaction()
+        evict_cursor.close()
+
+        # Now update key 1 (the first of the three value_a keys) at time 30.
+        cursor = s.open_cursor(uri)
+        s.begin_transaction()
+        cursor[1] = value_d
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(30))
+        cursor.close()
+
+        # Roll back to 40: the time-50 write to key 4 is newer than stable, the time-30 write is not.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40))
+        self.conn.rollback_to_stable()
+
+        # Now read at 40: key 1 keeps value_d, keys 2-3 keep value_a, key 4 is back to value_b.
+        cursor = s.open_cursor(uri)
+        s.begin_transaction('read_timestamp=' + self.timestamp_str(40))
+        self.assertEqual(cursor[1], value_d)
+        self.assertEqual(cursor[2], value_a)
+        self.assertEqual(cursor[3], value_a)
+        self.assertEqual(cursor[4], value_b)
+        s.rollback_transaction()
+        cursor.close()
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py
new file mode 100644
index 00000000000..2d800a17d32
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import make_scenarios, filter_scenarios
+
+# test_rollback_to_stable25.py
+# Check various scenarios relating to RLE cells in column-store.
+#
+# We write at three different timestamps:
+#    10 - aaaaaa or none
+#    20 - bbbbbb or delete or none
+#    30 - cccccc or delete or none
+#
+# and we evict to push things to disk after any of these,
+# and we roll back to either 15 or 25.
+#
+# The writes can be either uniform, heterogeneous, first key, middle key, or last key.
+#
+# We do this with a group of 5 keys 2..6. Keys 1 and 7 are written with zzzzzz at
+# timestamp 5 and evicted to ensure that the group of keys we're using is isolated
+# from other unused keys.
+#
+# This generates a lot of cases, but we filter pointless combinations and they run fast.
+
+# Put these bits outside the class definition so they can be referred to both in class
+# instances and in the scenario setup logic, which doesn't have a class instance yet.
+
+my_rle_size = 5
+
+def keys_of_write(write):
+    if write == 'u' or write == 'h':
+        return range(2, 2 + my_rle_size)
+    elif write == 'f':
+        return [2]
+    elif write == 'm':
+        return [2 + my_rle_size // 2]
+    else:
+        return [2 + my_rle_size - 1]
+
+class test_rollback_to_stable25(wttest.WiredTigerTestCase):
+    session_config = 'isolation=snapshot'
+    conn_config = 'in_memory=false'
+
+    write_10_values = [
+        ('10u', dict(write_10='u')),
+        ('10h', dict(write_10='h')),
+        ('10f', dict(write_10='f')),
+        ('10m', dict(write_10='m')),
+        ('10l', dict(write_10='l')),
+    ]
+    type_10_values = [
+        ('nil', dict(type_10=None)),
+        ('upd', dict(type_10='upd')),
+    ]
+
+    write_20_values = [
+        ('20u', dict(write_20='u')),
+        ('20h', dict(write_20='h')),
+        ('20f', dict(write_20='f')),
+        ('20m', dict(write_20='m')),
+        ('20l', dict(write_20='l')),
+    ]
+    type_20_values = [
+        ('nil', dict(type_20=None)),
+        ('upd', dict(type_20='upd')),
+        ('del', dict(type_20='del')),
+    ]
+
+    write_30_values = [
+        ('30u', dict(write_30='u')),
+        ('30h', dict(write_30='h')),
+        ('30f', dict(write_30='f')),
+        ('30m', dict(write_30='m')),
+        ('30l', dict(write_30='l')),
+    ]
+    type_30_values = [
+        ('nil', dict(type_30=None)),
+        ('upd', dict(type_30='upd')),
+        ('del', dict(type_30='del')),
+    ]
+
+    evict_time_values = [
+        ('chk10', dict(evict_time=10)),
+        ('chk20', dict(evict_time=20)),
+        ('chk30', dict(evict_time=30)),
+    ]
+
+    rollback_time_values = [
+        ('roll15', dict(rollback_time=15)),
+        ('roll25', dict(rollback_time=25)),
+    ]
+
+    def is_meaningful(name, vals):
+        # The last write at evict time should be uniform, to get an RLE cell.
+        if vals['evict_time'] == 10 and vals['write_10'] != 'u':
+            return False
+        if vals['evict_time'] == 20 and vals['write_20'] != 'u':
+            return False
+        if vals['evict_time'] == 30 and vals['write_30'] != 'u':
+            return False
+        # If the type is nil, the value must be uniform.
+        if vals['type_10'] is None and vals['write_10'] != 'u':
+            return False
+        if vals['type_20'] is None and vals['write_20'] != 'u':
+            return False
+        if vals['type_30'] is None and vals['write_30'] != 'u':
+            return False
+        # Similarly, delete and heterogeneous doesn't make sense.
+        if vals['type_10'] == 'del' and vals['write_10'] == 'h':
+            return False
+        if vals['type_20'] == 'del' and vals['write_20'] == 'h':
+            return False
+        if vals['type_20'] == 'del' and vals['write_30'] == 'h':
+            return False
+        # Both 10 and 20 shouldn't be nil. That's equivalent to 10 and 30 being nil.
+        if vals['type_10'] is None and vals['type_20'] is None:
+            return False
+
+        # Avoid cases that delete nonexistent values.
+        def deletes_nonexistent():
+            present = {}
+            for k in range(2, 2 + my_rle_size):
+                present[k] = False
+            def adjust(ty, write):
+                if ty is None:
+                    return
+                for k in keys_of_write(write):
+                    if ty == 'upd':
+                         present[k] = True
+                    elif ty == 'del':
+                        if present[k]:
+                            present[k] = False
+                        else:
+                            raise KeyError
+
+            adjust(vals['type_10'], vals['write_10'])
+            adjust(vals['type_20'], vals['write_20'])
+            adjust(vals['type_30'], vals['write_30'])
+        try:
+            deletes_nonexistent()
+        except KeyError:
+            return False
+        return True
+
+    scenarios = filter_scenarios(make_scenarios(write_10_values, type_10_values,
+                                                write_20_values, type_20_values,
+                                                write_30_values, type_30_values,
+                                                evict_time_values,
+                                                rollback_time_values),
+                                 is_meaningful)
+
+    value_z = "zzzzz" * 10
+
+    def writes(self, uri, s, expected, ty, write, value, ts):
+        if ty is None:
+             # do nothing at all
+             return
+        cursor = s.open_cursor(uri)
+        s.begin_transaction()
+        for k in keys_of_write(write):
+            if ty == 'upd':
+                myval = value + str(k) if write == 'h' else value
+                cursor[k] = myval
+                expected[k] = myval
+            else:
+                cursor.set_key(k)
+                cursor.remove()
+                del expected[k]
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+        cursor.close()
+
+    def evict(self, uri, s):
+        # Evict the page to force reconciliation.
+        evict_cursor = s.open_cursor(uri, None, "debug=(release_evict)")
+        s.begin_transaction()
+        # Search the key to evict it. Use both bookends.
+        v = evict_cursor[1]
+        self.assertEqual(v, self. value_z)
+        v = evict_cursor[2 + my_rle_size]
+        self.assertEqual(v, self. value_z)
+        self.assertEqual(evict_cursor.reset(), 0)
+        s.rollback_transaction()
+        evict_cursor.close()
+
+    def check(self, uri, s, ts, expected):
+        cursor = s.open_cursor(uri)
+        s.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
+        # endpoints should still be in place
+        self.assertEqual(cursor[1], self.value_z)
+        self.assertEqual(cursor[2 + my_rle_size], self.value_z)
+
+        for k in range(2, 2 + my_rle_size):
+            if k in expected:
+                self.assertEqual(cursor[k], expected[k])
+            else:
+                cursor.set_key(k)
+                r = cursor.search()
+                self.assertEqual(r, wiredtiger.WT_NOTFOUND)
+        s.rollback_transaction()
+        cursor.close()
+
+    def test_rollback_to_stable25(self):
+        # Create a table without logging.
+        uri = "table:rollback_to_stable25"
+        format = 'key_format=r,value_format=S'
+        self.session.create(uri, format + ', log=(enabled=false)')
+
+        # Pin oldest timestamp to 5.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5))
+
+        # Start stable timestamp at 5.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(5))
+
+        value_a = "aaaaa" * 10
+        value_b = "bbbbb" * 10
+        value_c = "ccccc" * 10
+
+        s = self.conn.open_session()
+
+        # Write the endpoints at time 5.
+        cursor = s.open_cursor(uri)
+        s.begin_transaction()
+        cursor[1] = self.value_z
+        cursor[2 + my_rle_size] = self.value_z
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(5))
+        self.evict(uri, s)
+        cursor.close()
+
+        # Do writes at time 10.
+        expected = {}
+        self.writes(uri, s, expected, self.type_10, self.write_10, value_a, 10)
+        expected10 = expected.copy()
+
+        # Evict at time 10 if requested.
+        if self.evict_time == 10:
+            self.evict(uri, s)
+
+        # Do more writes at time 20.
+        self.writes(uri, s, expected, self.type_20, self.write_20, value_b, 20)
+        expected20 = expected.copy()
+
+        # Evict at time 20 if requested.
+        if self.evict_time == 20:
+            self.evict(uri, s)
+
+        # Do still more writes at time 30.
+        self.writes(uri, s, expected, self.type_30, self.write_30, value_c, 30)
+        expected30 = expected.copy()
+
+        # Evict at time 30 if requested.
+        if self.evict_time == 30:
+            self.evict(uri, s)
+
+        # Now roll back.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(self.rollback_time))
+        self.conn.rollback_to_stable()
+
+        if self.rollback_time < 20:
+            expected20 = expected10
+            expected30 = expected10
+        elif self.rollback_time < 30:
+            expected30 = expected20
+
+        # Now make sure we see what we expect.
+        self.check(uri, s, 10, expected10)
+        self.check(uri, s, 20, expected20)
+        self.check(uri, s, 30, expected30)
author	Luke Chen <luke.chen@mongodb.com>	2021-08-26 13:54:52 +1000
committer	Luke Chen <luke.chen@mongodb.com>	2021-08-26 13:54:52 +1000
commit	ba6c7287e5ad13a65ef35e2468694219160775ce (patch)
tree	4d8ca6a8837fa82fe21e4eab430eb887becbd38c
parent	e15a429d9c1c2c6d5fe5d186b866ae7d8e7c6e60 (diff)
download	mongo-ba6c7287e5ad13a65ef35e2468694219160775ce.tar.gz