summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2021-08-26 13:54:52 +1000
committerLuke Chen <luke.chen@mongodb.com>2021-08-26 13:54:52 +1000
commitba6c7287e5ad13a65ef35e2468694219160775ce (patch)
tree4d8ca6a8837fa82fe21e4eab430eb887becbd38c
parente15a429d9c1c2c6d5fe5d186b866ae7d8e7c6e60 (diff)
downloadmongo-ba6c7287e5ad13a65ef35e2468694219160775ce.tar.gz
Import wiredtiger: c6ea0d18b5bcd7a6e7d91eece81ada238904c80e from branch mongodb-5.0
ref: b385c98487..c6ea0d18b5 for: 5.0.3 WT-6908 Write "cache" subpage for Architecture Guide WT-6911 Write "block manager" subpage for Architecture Guide WT-7005 Write "session" subpage for Architecture Guide WT-7006 Write Connection subpage for Architecture Guide WT-7905 Fix incorrect builtin behaviour for builds in CMake WT-7909 Create a new method to check for running user transactions before starting rollback-to-stable operation WT-7917 Add evergreen validation to s_all WT-7931 Evicting modifies using the evict cursor in test_multiple_older_readers_with_multiple_mixed_mode() to ensure that eviction happens. WT-7941 Add an Evergreen task to test abort/recovery using test/format WT-7964 Fix rollback to stable incorrectly not rolling back updates at snap_max WT-7965 Update connection base write generation number at the end of recovery checkpoint WT-7970 Set the stable timestamp before starting the checkpointer and clock threads WT-7974 More column-store fixes and tests WT-7984 Fix a bug that could cause a checkpoint to omit a page of data WT-7995 Fix the global visibility that it cannot go beyond checkpoint visibility WT-7998 Minor fixes on Cache subpage of Architecture Guide
-rw-r--r--src/third_party/wiredtiger/dist/docs_data.py12
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_all9
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-block.dox249
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-cache.dox121
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-connection.dox60
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-index.dox8
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-session.dox53
-rw-r--r--src/third_party/wiredtiger/src/docs/spell.ok7
-rw-r--r--src/third_party/wiredtiger/src/include/cell_inline.h18
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h6
-rw-r--r--src/third_party/wiredtiger/src/include/txn.h2
-rw-r--r--src/third_party/wiredtiger/src/include/txn_inline.h71
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_ckpt.c34
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_col.c1
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_row.c5
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c48
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c41
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c8
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c44
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c180
-rw-r--r--src/third_party/wiredtiger/test/checkpoint/checkpointer.c18
-rw-r--r--src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c3
-rw-r--r--src/third_party/wiredtiger/test/csuite/Makefile.am9
-rw-r--r--src/third_party/wiredtiger/test/csuite/schema_abort/main.c54
-rwxr-xr-xsrc/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh8
-rw-r--r--src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c77
-rwxr-xr-xsrc/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh4
-rw-r--r--src/third_party/wiredtiger/test/csuite/truncated_log/main.c25
-rwxr-xr-xsrc/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh20
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c7
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c60
-rwxr-xr-xsrc/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh21
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c30
-rwxr-xr-xsrc/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh21
-rwxr-xr-xsrc/third_party/wiredtiger/test/evergreen.yml16
-rw-r--r--src/third_party/wiredtiger/test/format/config.c2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs18.py28
-rw-r--r--src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py135
-rw-r--r--src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py293
41 files changed, 1448 insertions, 364 deletions
diff --git a/src/third_party/wiredtiger/dist/docs_data.py b/src/third_party/wiredtiger/dist/docs_data.py
index 831cdc61f30..2d0ba1e6068 100644
--- a/src/third_party/wiredtiger/dist/docs_data.py
+++ b/src/third_party/wiredtiger/dist/docs_data.py
@@ -20,13 +20,18 @@ arch_doc_pages = [
['src/include/block.h', 'src/include/block_inline.h',
'src/block/']),
ArchDocPage('arch-cache',
- ['WT_CACHE', 'WT_CACHE_POOL'],
- ['src/include/cache.h', 'src/include/cache_inline.h']),
+ ['WT_CACHE', 'WT_CACHE_POOL', 'WT_COL', 'WT_COL_RLE', 'WT_INSERT', 'WT_PAGE',
+ 'WT_PAGE_MODIFY', 'WT_REF', 'WT_ROW', 'WT_UPDATE'],
+ ['src/include/btmem.h', 'src/include/cache.h', 'src/include/cache_inline.h',
+ 'src/conn/conn_cache.c', 'src/conn/conn_cache_pool.c']),
ArchDocPage('arch-checkpoint',
['WT_CONNECTION'],
['src/block/block_ckpt.c', 'src/block/block_ckpt_scan.c',
'src/conn/conn_ckpt.c', 'src/meta/meta_ckpt.c',
'src/txn/txn_ckpt.c']),
+ ArchDocPage('arch-connection',
+ ['WT_CONNECTION'],
+ ['src/include/connection.h']),
ArchDocPage('arch-cursor',
['WT_CURSOR', 'WT_CURSOR_BACKUP', 'WT_CURSOR_BTREE', 'WT_CURSOR_BULK',
'WT_CURSOR_DATA_SOURCE', 'WT_CURSOR_DUMP', 'WT_CURSOR_INDEX',
@@ -78,6 +83,9 @@ arch_doc_pages = [
['src/include/intpack_inline.h', 'src/include/packing_inline.h',
'src/include/schema.h',
'src/lsm/', 'src/packing/', 'src/schema/']),
+ ArchDocPage('arch-session',
+ ['WT_SESSION'],
+ ['src/include/session.h']),
ArchDocPage('arch-snapshot',
['WT_TXN'],
['src/include/txn.h']),
diff --git a/src/third_party/wiredtiger/dist/s_all b/src/third_party/wiredtiger/dist/s_all
index 50d1909eea0..4c39bc8e321 100755
--- a/src/third_party/wiredtiger/dist/s_all
+++ b/src/third_party/wiredtiger/dist/s_all
@@ -47,6 +47,12 @@ errchk()
return
fi
+ # Return if evergreen validate runs successfully.
+ if echo "$1" | grep -q evergreen && [ "$(cat "$2")" = "../test/evergreen.yml is valid" ] ; then
+ rm -f "$2"
+ return
+ fi
+
echo "####################### MESSAGE ############################"
echo "s_all run of: \"$1\" resulted in:"
sed -e 's/^/ /' $2
@@ -81,6 +87,9 @@ run "./s_clang-format"
run "python prototypes.py"
run "sh ./s_typedef -b"
run "python test_tag.py"
+if command -v evergreen > /dev/null; then
+ run "evergreen validate ../test/evergreen.yml"
+fi
COMMANDS="
2>&1 ./s_define > ${t_pfx}s_define
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 52e281c34f8..d19319d772a 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
- "commit": "b385c984870fec2d693dece8e79876fd5e8bf867"
+ "commit": "c6ea0d18b5bcd7a6e7d91eece81ada238904c80e"
}
diff --git a/src/third_party/wiredtiger/src/docs/arch-block.dox b/src/third_party/wiredtiger/src/docs/arch-block.dox
index 7c8fdf5d72b..1738d2fdb9b 100644
--- a/src/third_party/wiredtiger/src/docs/arch-block.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-block.dox
@@ -1,9 +1,248 @@
/*! @arch_page arch-block Block Manager
-The Block Manager manages the reading and writing of disk blocks
-in WiredTiger. It does compression and encryption when these
-are configured.
+The WiredTiger block manager subsystem manages the reading and writing of data
+from the disk. It is designed to facilitate high performance, economic use of
+disk space and customizability.
+
+@section block What is a block?
+
+A block is a chunk of data that is stored on the disk and operated on as a
+single unit. Each WiredTiger data file (any file in the home directory with the
+\c .wt suffix) is made up of these blocks. Each block consists of a page header,
+a block header and contains a single page of the btree from which it was
+generated. WiredTiger is a no-overwrite storage engine, and when blocks are
+re-written, they are written to new locations in the file. The size of a block
+is a multiple of the allocation size, which is set during creation of the
+associated WiredTiger data file (see WT_SESSION::create).
+
+Once a block is written an address cookie is returned. This address cookie is
+stored as the \c addr on the associated page ref. The \c WT_REF structure can
+be found in \c btmem.h. The address cookie is opaque to other parts of the
+system and cannot be interpreted meaningfully.
+
+The address cookie is made up of 4 components:
+ - offset: The offset in the file. In order to avoid storing large offsets this
+ value is divided by the allocation size.
+ - file_id: Optional and only relevant to the tiered storage type, the file_id
+ is maintained in the address.
+ - size: The size of the block, also divided by the allocation size.
+ - checksum: The checksum of the block for validation purposes.
+
+The block header contains the following fields:
+ - size: The size of the block on disk, used when salvaging data from a corrupt
+ file.
+ - checksum: The checksum of the block, again used for salvaging.
+ - flags: Flags set on the block itself.
+ - padding
+
+The page header is not described in this document but can be found in
+\c btmem.h.
+
+@section block_implementation Block manager implementation details
+
+@subsection write_once Writing
+
+The block manager decides where in the file a block will be written. It has two
+forms of writing modes, "first fit" and "best fit". The default behavior is best
+fit. While operating in best fit mode the block manager will search a skip list
+of extents sorted by size, returning either an exact match or the next largest.
+This is done to avoid fragmenting the file when possible. In first fit mode the
+block manager will place the newly created block in the first available extent.
+First fit mode is used for all root pages.
+
+Additionally the block manager is a no-overwrite system. As such once a block is
+written it cannot be modified. This is for crash recovery reasons, because if
+the system were to crash during an overwrite the block state would be unknown.
+This doesn't mean that the associated page cannot be modified, once the
+associated page is modified a subsequent reconciliation will result in a new
+block being created.
+
+@subsection desc_block Descriptor blocks
+
+A file is divided up into blocks. The first block in a file is special as it
+contains metadata about the file and is referred to as the "descriptor block".
+It contains the WiredTiger major and minor version, a checksum of the block
+contents as well as a "magic" number to check against.
+
+The descriptor block serves as a safety check to ensure that the file being
+loaded into the block manager is actually a WiredTiger data file, that it
+belongs to a compatible version of WiredTiger and that the entire file has not
+been corrupted. WiredTiger also uses checksums to defend against file corruption
+which is described in the @ref checksum section.
+
+@subsection block_lists Extent lists
+
+Internally, the block manager uses a data structure called an extent list or a
+\c WT_EXTLIST to track file usage. An extent list consists of a series of
+extents (or \c WT_EXT elements). Each extent uses a file offset and size to
+track a portion of the file.
+
+There are three extent lists that are maintained per checkpoint:
+
+- \c alloc: The file ranges allocated for a given checkpoint.
+- \c avail: The file ranges that are unused and available for allocation.
+- \c discard: The file ranges freed in the current checkpoint.
+
+The alloc and discard extent lists are maintained as a skiplist sorted by file
+offset. The avail extent list also maintains an extra skiplist sorted by the
+extent size to aid with allocating new blocks.
+
+@section configuration Configuration options
+
+There are a number of configuration options that affect the block manager's
+behavior. This does not aim to be an exhaustive list, however, these are the
+configuration options that are more commonly of interest to developers.
+
+All of the configuration options below are passed into the \c WT_SESSION::create
+API at the time of file creation.
+
+@subsection alloc_size Allocation size
+
+The allocation_size configuration controls the file unit allocation_size.
+Any blocks allocated by the block manager must be a multiple of this value.
+
+For example, if we specify an allocation_size of \c 4KB, blocks of size
+\c 8KB and \c 12KB would be permitted but NOT \c 10KB. The allocation_size
+is set to \c 4KB by default which is a good choice unless the OS or storage
+device has special requirements.
+
+@subsection checksum Checksum
+
+The \c checksum "on" configuration can be provided during creation of the file.
+This configuration instructs the block manager to checksum the full length of
+the buffer provided to be written into the block. By default it is enabled.
+When disabled the block manager still does perform a checksum operation but only
+the first 64 bytes of the buffer are included.
+
+The checksum is used when reading blocks to validate their contents, it
+is compared with the checksum extracted from the address cookie and it is
+compared with a checksum generated from the buffer that was held in the block
+being read. In both cases the checksum has to match.
+
+There are other options that can be provided for this configuration option,
+they are not discussed here.
+
+@section block_usage How WiredTiger uses the block manager
+
+@subsection creation File creation and the block manager
+
+When a new file is created in WiredTiger via WT_SESSION::create, the file is
+created on disk and the associated \c allocation_size is written out to the
+metadata file. However the block manager itself only exists on the btree
+structure and is allocated when opening a closed btree.
+
+@subsection read Reading files and pages
+
+When an existing btree is opened for the first time, the location of the root
+block is contained in the metadata file \c WiredTiger.wt. The block manager will
+read the block at the location specified and return the page image as a buffer
+to the layer above. This will then be instantiated as a page in memory.
+
+From there subsequent page addresses can be read from the root page and the
+process repeated as required. If a cursor traverses to a page which hasn't been
+read into memory the same process will take place.
+
+@subsection Writing
+
+Two cases exist for writing out data using the block manager: checkpoint and
+eviction. When a page image is written out, the block manager's \c bm->write
+API is called. See \c bt_io.c for more detail.
+
+@subsubsection Checkpoint
+
+For details on checkpoint at the WiredTiger level see: @ref arch-checkpoint.
+
+At the block manager level, a checkpoint corresponds with a set of blocks that
+are stored in the file associated with the given URI. Typically each file will
+contain a minimum of two checkpoints. Upon opening an existing file the most
+recent checkpoint is read.
+
+During a checkpoint new blocks are only written out for dirty pages. A block can
+be included in multiple checkpoints. Assuming a page \c X is dirty and gets
+checkpointed in checkpoint \c A, it will be created as a new block on disk. Now
+the same page \c X isn't modified and another checkpoint is taken. The page is
+clean and as such will not require a new block to be written for it. The address
+of the original block is still valid.
+
+Checkpoints are created in depth first order, leaf blocks are created, then
+the parent blocks. This is a requirement as the parent blocks contain the
+addresses of the leaf blocks.
+
+The block manager doesn't guarantee that calling \c bm->write will result in
+the data being flushed to disk. In the checkpoint scenario WiredTiger will also
+call \c bm->sync once all blocks have been written which will call the file
+system dependent flush function.
+
+* Checkpoint deletion and merging *
+
+As a checkpoint progresses it takes a snapshot of the three extent lists kept
+by the block manager, these extent lists are written out to disk as part of
+the checkpoint in blocks. Between checkpoints these extent lists are being
+updated via normal operation of WiredTiger.
+
+Suppose we have a checkpoint \c A, which has an alloc list which contains 3
+blocks \c I, \c J, \c K as such its extent lists are as follows:
+
+Alloc: \c I, \c J, \c K
+Avail: \c L, \c M
+Discard: Empty
+
+A second checkpoint \c B completes and has removed a page which corresponds with
+block \c J, it also has allocated an additional block \c L.
+
+Checkpoint \c B's extent lists are as follows:
+Alloc: \c L
+Avail: \c M
+Discard: \c J
+
+Finally we complete a 3rd checkpoint \c C which allocates an additional block
+\c M. Upon completion of this checkpoint we are able to remove checkpoint \c A,
+to do that, the block manager will merge checkpoint \c A's extent lists into
+checkpoint \c B's.
+
+What's important here is that if a block appears in both the alloc list and
+the discard list it can be freed which means it goes on the avail list.
+
+Which gives us the following lists for checkpoint \c C:
+Alloc: \c M
+Avail: Empty
+Discard: Empty
+
+And the following lists for checkpoint \c B:
+Alloc: \c I, \c K, \c L
+Avail: \c J
+Discard: Empty
+
+These extent lists are written out with the checkpoint \c C. Anything on the
+avail list is considered free space and can be reused as of the completion
+of checkpoint \c C.
+
+We don't want to list each block individually in the extent lists, so instead of
+listing each block separately in the list, we use extents, which can describe a
+range in the file, that is, any number of contiguous blocks.
+
+@subsubsection Eviction
+
+For more detail on how WiredTiger eviction works see: @ref arch-eviction.
+
+Eviction also utilizes the block manager. When a page is evicted and contains
+data that needs to be maintained, logically a block needs to be written.
+Eviction calls \c bm->write however it does not instruct the block manager to
+sync the data.
+
+@subsection Compaction
+
+As new blocks are written, the block manager will place them where they fit best.
+Because of this it's common that removal of data will not result in the file
+shrinking. The file can only be shrunk when there are available blocks at the
+end of the file.
+
+To manage this, WiredTiger provides a compaction API call WT_SESSION::compact.
+The block manager operates in first fit mode during compaction to maximize block
+movement towards the beginning of the file. WiredTiger walks the btree and asks
+the block manager if relocating that page will reduce the file size. If so, the
+page is marked dirty, forcing the block to be rewritten. WiredTiger then
+performs two checkpoints, as at least two checkpoints are required to delete the
+checkpoint originally containing the block.
-The state of the block manager is represented by the \c WT_BM structure.
-Individual blocks being tracked are in \c WT_BLOCK structures.
*/
diff --git a/src/third_party/wiredtiger/src/docs/arch-cache.dox b/src/third_party/wiredtiger/src/docs/arch-cache.dox
index 94888260cfe..716c4966d45 100644
--- a/src/third_party/wiredtiger/src/docs/arch-cache.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-cache.dox
@@ -1,13 +1,118 @@
/*! @arch_page arch-cache Cache
-Cache in WiredTiger is represented by the various shared data structures
-that make up in-memory Btrees and subordinate data structures.
+The WiredTiger cache is memory used to hold copies of recently accessed or modified data.
+WiredTiger reads Btree pages into the cache on demand. When the cache runs low on space, Eviction
+removes unneeded pages. Updates modify data in the cache and
+are flushed to storage asynchronously, either by @ref arch-checkpoint "Checkpoint" or
+@ref arch-eviction "Eviction".
-Memory used to read in and write out the on-disk representations of Btrees
-is not cached, it only exists temporarily during the I/O operation and
-while the data is transferred to or from the on-disk format.
+The page layout in the WiredTiger cache is optimized for fast, concurrent access by multiple
+application threads. In contrast, WiredTiger organizes pages in storage to minimize storage space.
+As a result, WiredTiger has to convert between the in-memory and on-storage representations of a
+page whenever it reads or writes the page.
-Internally, the current cache state is represented by the WT_CACHE structure,
-which contains various counters that drive statistics and information
-used for eviction.
+@section arch_cache_basics Basic operation
+
+Cached Btree pages point to each other, mirroring the structure of the on-disk Btree.
+When WiredTiger opens a file, it loads the root page of the Btree into memory along with the first
+level of internal pages. To look up an entry in a Btree, WiredTiger starts from the root page
+and searches the Btree until it finds the entry. If WiredTiger encounters a page that is not in
+memory, it loads that page from storage and continues the search.
+
+To load a page into the cache, WiredTiger passes the page's address cookie to the
+@ref arch-block "Block Manager"
+and gets back a buffer containing the corresponding block from the underlying file.
+If necessary, WiredTiger decrypts and decompresses the block. Then it allocates indexing
+structures to facilitate quick binary search of the keys in the page. The first time WiredTiger
+needs to modify or insert an entry on a page, it allocates additional structures to track these
+changes.
+
+WiredTiger tracks the total amount of data in the cache. It also tracks the space used by
+_clean_ (unmodified) pages and by _dirty_ (modified) pages. When the cache becomes too
+full or contains too much dirty data, WiredTiger invokes @ref arch-eviction "Eviction" to
+remove data from the
+cache. To remove a clean page from the cache, WiredTiger simply frees the page's memory.
+To remove a dirty page, WiredTiger must first _reconcile_ the page (converting it from
+in-memory format to on-disk format) and then write it to storage.
+
+@section arch_cache_structure Cache structure
+
+Internally, WiredTiger's cache state is represented by the \c WT_CACHE structure, which contains
+counters and parameter settings for tracking cache usage and controlling eviction policy.
+The \c WT_CACHE also includes state WiredTiger uses to track the progress of eviction. There
+is a single \c WT_CACHE for each connection, accessed via the \c WT_CONNECTION_IMPL structure.
+
+Each page in the cache is accessed via a \c WT_REF structure. When WiredTiger opens a Btree,
+it places a \c WT_REF for the cached root page in the corresponding \c WT_BTREE structure.
+A \c WT_REF can represent either
+a page in the cache or one that has not been loaded yet.
+The page itself is represented by a \c WT_PAGE structure. This includes a pointer to a buffer
+that contains the on-disk page image (decrypted and uncompressed). It also holds the supplemental
+structures that WiredTiger uses to access and update the page while it is cached.
+
+When WiredTiger loads a page into the cache, it allocates an internal table with one entry
+for each entry on the page. The type and content of these entries depends on the page type. An
+internal Btree page will have an array of \c WT_REF structures. A row-store leaf page will have
+an array of \c WT_ROW structures representing the KV pairs stored on the page. A variable-length
+column-store leaf page will have an array of \c WT_COL structures along with a parallel array
+of \c WT_COL_RLE structures indicating run lengths for items that are repeated more than once
+on the page. Both of these leaf page formats support binary search to quickly find an entry.
+In a fixed-length column-store leaf page, values will be packed into a simple byte array, allowing
+WiredTiger to access entries using bit operations based on the value length.
+
+The first time an entry on a leaf page is inserted or modified, WiredTiger adds a
+\c WT_PAGE_MODIFY structure to the corresponding \c WT_PAGE in the cache. For a row-store leaf
+page the \c WT_PAGE_MODIFY tracks changes using an array of \c WT_UPDATE pointers with one element
+for each
+KV pair on the leaf page. When WiredTiger updates an entry, it inserts a \c WT_UPDATE in
+this array. If there are multiple updates to the same item, WiredTiger chains them together
+in a linked list. When a record is deleted,
+WiredTiger adds an update with a special tombstone value. WiredTiger stores newly inserted
+elements in a similar array of skip lists represented by \c WT_INSERT structures. There is a
+separate skiplist for the gap between each pair of keys on the page, as well as skiplists for
+the gaps between the beginning and end of the page and the first and last keys, respectively.
+
+For a column-store leaf page the \c WT_PAGE_MODIFY structure tracks changes using a pair of
+skip lists, one for appended items and one for updated items.
+
+Almost all operations on these data structures are lock-free, allowing a high level of
+concurrency in the cache.
+
+@section arch_cache_size Cache size and content
+
+The amount of memory used by the WiredTiger cache is controlled by the \c cache_size configuration
+parameter, which defaults to 100 MB. (Note that MongoDB sets the cache size, by default, to be
+half the size of RAM.) WiredTiger does not explicitly manage this memory, relying instead on
+the C memory allocator to acquire and free memory as needed. Since the cache is
+allocated from the heap, evicting data from the cache simply returns the memory to the allocator;
+it does not reduce the application's memory footprint.
+
+The WiredTiger cache is only used for Btree data, including associated in-memory structures such
+as indexes, insert lists, and update chains. Other WiredTiger data structures, such as
+dhandles, cursors, and sessions, are not considered part of the cache and do not count against
+the cache size. Similarly, memory used to read in and write out the on-disk representations of
+Btree pages is not cached; it is only allocated temporarily during the I/O operation and
+while the data is converted to or from the on-disk format.
+
+@section arch_cache_shared Shared caches
+
+WiredTiger supports sharing a single cache among multiple databases within a process. Normally
+if a process opens connections to multiple different databases, each connection would use a
+separate fixed-size cache. With a shared cache, WiredTiger dynamically partitions a fixed
+amount of cache space between participating connections.
+
+When shared caching is enabled, WiredTiger creates a cache pool server thread to manage the
+shared cache. It also allocates a global \c WT_CACHE_POOL structure, which stores settings
+and statistics for the shared cache. These settings include a minimum and
+maximum cache size for connections participating in the shared cache.
+
+The cache pool server thread wakes up periodically and adjusts the sizes of the individual
+per-connection caches. Adjustments are based on a pressure metric for each cache computed
+using a weighted average of the amount of data read into the cache (i.e., cache misses)
+and how often application threads have evicted data from the cache or waited while
+performing eviction. If a cache has higher pressure than average and is not yet at the maximum
+size, WiredTiger grows that cache. Conversely, if a cache has low pressure, WiredTiger shrinks
+it, subject to the minimum cache size. To change the size of a cache, the cache pool server
+simply changes the cache size parameters in the corresponding \c WT_CACHE structure. WiredTiger's
+eviction code will adjust the amount of data in the cache accordingly.
*/
diff --git a/src/third_party/wiredtiger/src/docs/arch-connection.dox b/src/third_party/wiredtiger/src/docs/arch-connection.dox
new file mode 100644
index 00000000000..cdc3eeecb9a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/arch-connection.dox
@@ -0,0 +1,60 @@
+/*! @arch_page arch-connection Connection
+
+@section arch_conn_def Definition
+
+A connection is a handle to a WiredTiger database instance. The connection has exclusive access to
+the database through a file lock, hence only one connection can be opened at a time. Internally, a
+connection is represented by \c WT_CONNECTION.
+
+@section arch_conn_lifecycle Life cycle
+
+@subsection arch_conn_init Initialization
+
+A connection is initialized when WT_CONNECTION::wiredtiger_open is called by the user application.
+WT_CONNECTION::wiredtiger_open accepts a list of configuration items (see @ref database_config) that
+can be used to enable different WiredTiger features and tune their behavior. Those features are for
+example related to @ref arch-eviction, @ref arch-logging, @ref arch-checkpoint, @ref arch-cache,
+statistics, etc. All the different available configuration settings are described in the
+documentation for WT_CONNECTION::wiredtiger_open.
+
+WT_CONNECTION::wiredtiger_open also performs different sanity checks depending on the configuration
+item "create". When "create" is specified and a database does not already exist, a new database is
+created along with specific WiredTiger files such as the turtle file and other metadata files. If a
+database already exists, whether "create" is specified or not, WiredTiger will try to open it and
+check for the existence of the different required WiredTiger files. If "create" is not specified,
+WiredTiger expects a previously created database where it is executed. If the existing database is
+corrupted and cannot be opened, either \c WT_RUN_RECOVERY or \c WT_TRY_SALVAGE error (see @ref
+error_handling) is returned to the user application and the connection is not created. In this case,
+a recovery operation will be required to bring the database to a consistent state (see @ref
+command_line for more details) before a connection can be successfully established with the
+database.
+
+Once the database has been successfully opened, internal worker threads are started to provide
+global services used at runtime. Those services consist of different threads to handle statistics,
+logging, eviction, checkpoint and cache management. The sweeping server that manages the active and
+inactive dhandles is started too, see @ref arch-dhandle for more information.
+
+Finally, before the connection is completely initialized, the database is set to a consistent state
+by running rollback to stable, see @ref arch-rts for more details.
+
+@subsection arch_conn_runtime Runtime
+
+At runtime, database-wide operations can be executed using the connection interface. For instance,
+it is possible to reconfigure WiredTiger features and behavior using WT_CONNECTION::reconfigure
+instead of closing the connection and calling ::wiredtiger_open again. However, almost all CRUD
+operations on the database are executed in the context of a session (see @ref arch-session) which
+can be created using WT_CONNECTION::open_session. See the WT_CONNECTION:: documentation to discover
+other available APIs related to WiredTiger connections.
+
+A connection also keeps track of global information, see \c WT_CONNECTION_IMPL defined in \c
+connection.h. Finally, a \c WT_CONNECTION handle may be shared between threads, see @ref threads for
+more information.
+
+@subsection arch_conn_closure Closure
+
+When a connection is no longer required, it can be closed using WT_CONNECTION::close. As a result,
+any resource held by the connection (e.g., sessions) is freed unless configured differently and the
+database is restored to a consistent state if necessary. It is worth noting that this final step
+might take some time as it may involve running the rollback to stable operation.
+
+*/
diff --git a/src/third_party/wiredtiger/src/docs/arch-index.dox b/src/third_party/wiredtiger/src/docs/arch-index.dox
index 3d916e675aa..906e6959be8 100644
--- a/src/third_party/wiredtiger/src/docs/arch-index.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-index.dox
@@ -139,6 +139,10 @@ make up in-memory Btrees and subordinate data structures.
A checkpoint is created by WiredTiger to serve as a point from which it can recover.
+@subpage arch-connection
+
+A connection is a handle to a WiredTiger database instance.
+
@subpage arch-cursor
Cursors are used to get and modify data.
@@ -196,6 +200,10 @@ Rollback to stable to remove the unstable updates from the database.
A schema defines the format of the application data in WiredTiger.
+@subpage arch-session
+
+A session defines the context for most operations performed in WiredTiger.
+
@subpage arch-snapshot
Snapshots are implemented by storing transaction ids committed before
diff --git a/src/third_party/wiredtiger/src/docs/arch-session.dox b/src/third_party/wiredtiger/src/docs/arch-session.dox
new file mode 100644
index 00000000000..e17eca10b1b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/arch-session.dox
@@ -0,0 +1,53 @@
+/*! @arch_page arch-session Session
+
+@section arch_session_def Definition
+After a @ref arch-connection has been established between the application and WiredTiger, the
+application can start sending requests to WiredTiger using a session. A session is internally
+represented by WT_SESSION and plays an important role since almost all operations are performed
+under the context of a session.
+
+A session can only be created through an existing connection with the API
+WT_CONNECTION::open_session and it is possible to create multiple sessions through the same
+connection. In fact, one connection can have multiple sessions but one session can only be
+associated with one connection. The maximum number of sessions is set through the configuration item
+\c session_max as part of the configuration string in ::wiredtiger_open.
+
+Sessions created by the calling application are called "user sessions". WiredTiger also performs
+some internal operations such as @ref arch-eviction through self-created sessions. These sessions
+are called "internal sessions". The usage rules and guidelines for both internal sessions and user
+sessions are the same and the only difference between them is their origin of creation.
+
+@section arch_session_ops Operations
+The different operations that can be performed on a WiredTiger session are related to cursors,
+tables and transactions. You can read the complete description of each possible operation in the
+documentation related to WT_SESSION.
+
+@section arch_session_txn Transactions
+It is possible to group several operations within a session, in other words, multiple operations can
+be treated as a single atomic operation. This can be done using @ref arch-transaction. Furthermore,
+a session can hold only one running transaction at any given time and this transaction only belongs
+to that session.
+
+@section arch_session_cur Cursors
+A session can perform multiple data operations on one or several collections using multiple cursors
+(see @ref arch-cursor for more details). All the cursors associated with a session share that
+session's transaction context. It is also possible to cache those cursors if required through the
+configuration string given to WT_CONNECTION::open_session or ::wiredtiger_open. The configuration
+item for this purpose is \c cache_cursors.
+
+@section arch_session_dhandles Data Handles
+During its lifetime, a session can accumulate a list of data handles (see @ref arch-dhandle).
+Indeed, when a session accesses a table for the first time, the data handle of that table is
+acquired and cached. Once a session no longer needs to operate on a table, it marks the associated
+data handle as idle. This helps the sweep server release data handles that are inactive, see @ref
+arch-dhandle-lifecycle for more details.
+
+@section arch_session_closure Closure
+A session can be closed using WT_SESSION::close. Closing the connection will also close all open
+sessions. When a session is closed, it releases all the resources associated with it including
+rolling back any active transaction and closing the cursors that are still open.
+
+@section arch_session_thread Multithreading
+A session is always executed as a single thread, see @ref threads for more details.
+
+*/
diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok
index 1d302409f2e..182a9af00f3 100644
--- a/src/third_party/wiredtiger/src/docs/spell.ok
+++ b/src/third_party/wiredtiger/src/docs/spell.ok
@@ -48,6 +48,7 @@ ECMA
EINVAL
ENCRYPTOR
ENOTSUP
+EXTLIST
EmpId
Encryptors
Facebook
@@ -180,7 +181,9 @@ bokeh
bool
boolean
booleans
+bm
br
+bt
btmem
btree
btrees
@@ -236,6 +239,7 @@ curhs
cursortype
curtable
customerABC
+customizability
cv
cyclomatic
dN
@@ -257,6 +261,7 @@ decrement
decrementing
decrypt
decrypted
+decrypts
del
desc
destructor
@@ -574,6 +579,7 @@ sess
sid
skinparam
skiplist
+skiplists
sortable
spinlock
spinlocks
@@ -596,6 +602,7 @@ subdatabases
subdirectory
subpage
substring
+subsubsection
sudo
superset
svg
diff --git a/src/third_party/wiredtiger/src/include/cell_inline.h b/src/third_party/wiredtiger/src/include/cell_inline.h
index 2d91b9a8ee6..97b856e8a62 100644
--- a/src/third_party/wiredtiger/src/include/cell_inline.h
+++ b/src/third_party/wiredtiger/src/include/cell_inline.h
@@ -1045,24 +1045,6 @@ __cell_unpack_window_cleanup(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk
}
/*
- * __cell_pack_kv_window_cleanup --
- * Clean up cells loaded from a previous run while writing to disk.
- */
-static inline void
-__cell_pack_kv_window_cleanup(
- WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK_KV *unpack_kv)
-{
- /*
- * If the page came from a previous run, reset the transaction ids to "none" and timestamps to 0
- * as appropriate when the cell information is used for packing the new cell.
- */
- if (F_ISSET(S2C(session), WT_CONN_RECOVERING) &&
- dsk->write_gen > S2BT(session)->base_write_gen &&
- dsk->write_gen < S2BT(session)->run_write_gen)
- __cell_kv_window_cleanup(session, unpack_kv);
-}
-
-/*
* __wt_cell_unpack_addr --
* Unpack an address WT_CELL into a structure.
*/
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 1892edd09a3..8061ce88008 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -39,6 +39,8 @@ extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_txn_active(WT_SESSION_IMPL *session, uint64_t txnid)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern bool __wt_txn_user_active(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern char *__wt_time_aggregate_to_string(WT_TIME_AGGREGATE *ta, char *ta_string)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern char *__wt_time_point_to_string(wt_timestamp_t ts, wt_timestamp_t durable_ts,
@@ -1088,6 +1090,8 @@ extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_btree_id_to_uri(WT_SESSION_IMPL *session, uint32_t btree_id, char **uri)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_metadata_correct_base_write_gen(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_cursor_close(WT_SESSION_IMPL *session)
@@ -1966,6 +1970,8 @@ static inline bool __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id, wt_ti
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id,
wt_timestamp_t timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline bool __wt_txn_visible_id_snapshot(uint64_t id, uint64_t snap_min, uint64_t snap_max,
+ uint64_t *snapshot, uint32_t snapshot_count) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline double __wt_eviction_dirty_target(WT_CACHE *cache)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_btree_block_free(WT_SESSION_IMPL *session, const uint8_t *addr,
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 3258cafb29f..6b84061ff82 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -244,8 +244,8 @@ struct __wt_txn {
/*
* Snapshot data:
+ * ids >= snap_max are invisible,
* ids < snap_min are visible,
- * ids > snap_max are invisible,
* everything else is visible unless it is in the snapshot.
*/
uint64_t snap_min, snap_max;
diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h
index 6d3b4e8fa55..b57f0d8203f 100644
--- a/src/third_party/wiredtiger/src/include/txn_inline.h
+++ b/src/third_party/wiredtiger/src/include/txn_inline.h
@@ -449,13 +449,10 @@ err:
static inline uint64_t
__wt_txn_oldest_id(WT_SESSION_IMPL *session)
{
- WT_BTREE *btree;
WT_TXN_GLOBAL *txn_global;
uint64_t checkpoint_pinned, oldest_id;
- bool include_checkpoint_txn;
txn_global = &S2C(session)->txn_global;
- btree = S2BT_SAFE(session);
/*
* The metadata is tracked specially because of optimizations for checkpoints.
@@ -467,10 +464,6 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
* Take a local copy of these IDs in case they are updated while we are checking visibility.
*/
oldest_id = txn_global->oldest_id;
- include_checkpoint_txn =
- btree == NULL || (btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
- if (!include_checkpoint_txn)
- return (oldest_id);
/*
* The read of the transaction ID pinned by a checkpoint needs to be carefully ordered: if a
@@ -501,14 +494,11 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
static inline void
__wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
{
- WT_BTREE *btree;
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t checkpoint_ts, pinned_ts;
- bool include_checkpoint_txn;
*pinned_tsp = WT_TS_NONE;
- btree = S2BT_SAFE(session);
txn_global = &S2C(session)->txn_global;
/*
@@ -520,19 +510,6 @@ __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
*pinned_tsp = pinned_ts = txn_global->pinned_timestamp;
/*
- * Checkpoint transactions often fall behind ordinary application threads. Take special effort
- * to not keep changes pinned in cache if they are only required for the checkpoint and it has
- * already seen them.
- *
- * If there is no active checkpoint or this handle is up to date with the active checkpoint then
- * it's safe to ignore the checkpoint ID in the visibility check.
- */
- include_checkpoint_txn =
- btree == NULL || (btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
- if (!include_checkpoint_txn)
- return;
-
- /*
* The read of checkpoint timestamp needs to be carefully ordered: it needs to be after we have
* read the pinned timestamp and the checkpoint generation, otherwise, we may read earlier
* checkpoint timestamp before the checkpoint generation that is read resulting more data being
@@ -680,6 +657,37 @@ __wt_txn_tw_stop_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
}
/*
+ * __wt_txn_visible_id_snapshot --
+ * Is the id visible in terms of the given snapshot?
+ */
+static inline bool
+__wt_txn_visible_id_snapshot(
+ uint64_t id, uint64_t snap_min, uint64_t snap_max, uint64_t *snapshot, uint32_t snapshot_count)
+{
+ bool found;
+
+ /*
+ * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a
+ * concurrent transaction, that is, if it was committed before the snapshot was taken.
+ *
+ * The order here is important: anything newer than or equal to the maximum ID we saw when
+ * taking the snapshot should be invisible, even if the snapshot is empty.
+ *
+ * Snapshot data:
+ * ids >= snap_max are not visible,
+ * ids < snap_min are visible,
+ * everything else is visible unless it is found in the snapshot.
+ */
+ if (WT_TXNID_LE(snap_max, id))
+ return (false);
+ if (snapshot_count == 0 || WT_TXNID_LT(id, snap_min))
+ return (true);
+
+ WT_BINARY_SEARCH(id, snapshot, snapshot_count, found);
+ return (!found);
+}
+
+/*
* __txn_visible_id --
* Can the current transaction see the given ID?
*/
@@ -687,7 +695,6 @@ static inline bool
__txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
{
WT_TXN *txn;
- bool found;
txn = session->txn;
@@ -710,20 +717,8 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
/* Otherwise, we should be called with a snapshot. */
WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) || session->dhandle->checkpoint != NULL);
- /*
- * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a
- * concurrent transaction, that is, if was committed before the snapshot was taken.
- *
- * The order here is important: anything newer than the maximum ID we saw when taking the
- * snapshot should be invisible, even if the snapshot is empty.
- */
- if (WT_TXNID_LE(txn->snap_max, id))
- return (false);
- if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min))
- return (true);
-
- WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found);
- return (!found);
+ return (__wt_txn_visible_id_snapshot(
+ id, txn->snap_min, txn->snap_max, txn->snapshot, txn->snapshot_count));
}
/*
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index b4c7c933c62..d58d0149658 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -974,6 +974,40 @@ err:
}
/*
+ * __wt_metadata_correct_base_write_gen --
+ * Update the connection's base write generation from all files in metadata at the end of the
+ * recovery checkpoint.
+ */
+int
+__wt_metadata_correct_base_write_gen(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ char *config, *uri;
+
+ uri = NULL;
+ WT_RET(__wt_metadata_cursor(session, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+
+ if (!WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "tiered:"))
+ continue;
+
+ WT_ERR(cursor->get_value(cursor, &config));
+
+ /* Update base write gen to the write gen. */
+ WT_ERR(__wt_metadata_update_base_write_gen(session, config));
+ }
+ WT_ERR_NOTFOUND_OK(ret, false);
+
+err:
+ if (ret != 0 && uri != NULL)
+ __wt_err(session, ret, "unable to correct write gen for %s", uri);
+ WT_TRET(__wt_metadata_cursor_release(session, &cursor));
+ return (ret);
+}
+
+/*
* __wt_meta_ckptlist_to_meta --
* Convert a checkpoint list into its metadata representation.
*/
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index 4ea57b8acc7..8000026c58b 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -744,7 +744,6 @@ record_loop:
twp = &clear_tw;
goto compare;
}
- __cell_pack_kv_window_cleanup(session, page->dsk, vpack);
twp = &vpack->tw;
/*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
index a72bc170245..99d887da573 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_row.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -808,10 +808,9 @@ __wt_rec_row_leaf(
upd = upd_select.upd;
/* Take the timestamp from the update or the cell. */
- if (upd == NULL) {
- __cell_pack_kv_window_cleanup(session, page->dsk, vpack);
+ if (upd == NULL)
twp = &vpack->tw;
- } else
+ else
twp = &upd_select.tw;
/*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 9ad305792b9..cf9d1be3175 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -159,6 +159,9 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u
WT_DECL_RET;
WT_PAGE *page;
WT_RECONCILE *r;
+#ifdef HAVE_DIAGNOSTIC
+ void *addr;
+#endif
btree = S2BT(session);
page = ref->page;
@@ -215,11 +218,17 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u
F_ISSET(r, WT_REC_CALL_URGENT) && !r->update_used && r->cache_write_restore)
ret = __wt_set_return(session, EBUSY);
+#ifdef HAVE_DIAGNOSTIC
+ addr = ref->addr;
+#endif
/* Wrap up the page reconciliation. */
if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0)
__rec_write_page_status(session, r);
- else
+ else {
+ /* Make sure that reconciliation doesn't free the page that has been written to disk. */
+ WT_ASSERT(session, addr == NULL || ref->addr != NULL);
WT_TRET(__rec_write_wrapup_err(session, r, page));
+ }
/* Release the reconciliation lock. */
*page_lockedp = false;
@@ -1516,7 +1525,7 @@ err:
* Initialize the page write generation number.
*/
static void
-__rec_set_page_write_gen(WT_PAGE_HEADER *dsk, WT_BTREE *btree)
+__rec_set_page_write_gen(WT_BTREE *btree, WT_PAGE_HEADER *dsk)
{
/*
* We increment the block's write generation so it's easy to identify newer versions of blocks
@@ -1553,7 +1562,7 @@ __rec_split_write_header(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK
dsk->recno = btree->type == BTREE_ROW ? WT_RECNO_OOB : multi->key.recno;
- __rec_set_page_write_gen(dsk, btree);
+ __rec_set_page_write_gen(btree, dsk);
dsk->mem_size = multi->size;
dsk->u.entries = chunk->entries;
dsk->type = page->type;
@@ -2088,6 +2097,22 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_TIME_AGGREGATE_INIT(&ta);
/*
+ * If using the history store table eviction path and we found updates that weren't globally
+ * visible when reconciling this page, copy them into the database's history store. This can
+ * fail, so try before clearing the page's previous reconciliation state.
+ */
+ if (F_ISSET(r, WT_REC_HS))
+ WT_RET(__rec_hs_wrapup(session, r));
+
+ /*
+ * Wrap up overflow tracking. If we are about to create a checkpoint, the system must be
+ * entirely consistent at that point (the underlying block manager is presumably going to do
+ * some action to resolve the list of allocated/free/whatever blocks that are associated with
+ * the checkpoint).
+ */
+ WT_RET(__wt_ovfl_track_wrapup(session, page));
+
+ /*
* This page may have previously been reconciled, and that information is now about to be
* replaced. Make sure it's discarded at some point, and clear the underlying modification
* information, we're creating a new reality.
@@ -2137,21 +2162,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Reset the reconciliation state. */
mod->rec_result = 0;
- /*
- * If using the history store table eviction path and we found updates that weren't globally
- * visible when reconciling this page, copy them into the database's history store.
- */
- if (F_ISSET(r, WT_REC_HS))
- WT_RET(__rec_hs_wrapup(session, r));
-
- /*
- * Wrap up overflow tracking. If we are about to create a checkpoint, the system must be
- * entirely consistent at that point (the underlying block manager is presumably going to do
- * some action to resolve the list of allocated/free/whatever blocks that are associated with
- * the checkpoint).
- */
- WT_RET(__wt_ovfl_track_wrapup(session, page));
-
__wt_verbose(session, WT_VERB_RECONCILE, "%p reconciled into %" PRIu32 " pages", (void *)ref,
r->multi_next);
@@ -2367,7 +2377,7 @@ __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *k
dsk = tmp->mem;
memset(dsk, 0, WT_PAGE_HEADER_SIZE);
dsk->type = WT_PAGE_OVFL;
- __rec_set_page_write_gen(dsk, btree);
+ __rec_set_page_write_gen(btree, dsk);
dsk->u.datalen = (uint32_t)kv->buf.size;
memcpy(WT_PAGE_HEADER_BYTE(btree, dsk), kv->buf.data, kv->buf.size);
dsk->mem_size = WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size;
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 35eb2e08d6f..e3ab3fecc16 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -139,6 +139,47 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
}
/*
+ * __wt_txn_user_active --
+ * Check whether there are any running user transactions. Note that new transactions may start
+ * on a session we have already examined and the caller needs to be aware of this limitation.
+ * Exclude prepared user transactions from this check.
+ */
+bool
+__wt_txn_user_active(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session_in_list;
+ uint32_t i, session_cnt;
+ bool txn_active;
+
+ conn = S2C(session);
+ txn_active = false;
+
+ /*
+ * No lock is required because the session array is fixed size, but it may contain inactive
+ * entries. We must review any active session, so insert a read barrier after reading the active
+ * session count. That way, no matter what sessions come or go, we'll check the slots for all of
+ * the user sessions present when we started our check for active transactions.
+ */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, session_in_list = conn->sessions; i < session_cnt; i++, session_in_list++) {
+ /* Skip inactive sessions. */
+ if (!session_in_list->active)
+ continue;
+ /* Check if a user session has a running transaction. Ignore prepared transactions. */
+ if (F_ISSET(session_in_list->txn, WT_TXN_RUNNING) &&
+ !F_ISSET(session_in_list, WT_SESSION_INTERNAL) &&
+ !F_ISSET(session_in_list->txn, WT_TXN_PREPARE)) {
+
+ txn_active = true;
+ break;
+ }
+ }
+
+ return (txn_active);
+}
+
+/*
* __wt_txn_active --
* Check if a transaction is still active. If not, it is either committed, prepared, or rolled
* back. It is possible that we race with commit, prepare or rollback and a transaction is still
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 1a7090cd2c6..c32cd43b0bc 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -922,6 +922,14 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
}
/*
+ * As part of recovery, rollback to stable may have left out clearing stale transaction ids.
+ * Update the connection base write generation based on the latest checkpoint write generations
+ * to reset these transaction ids present on the pages when reading them.
+ */
+ if (F_ISSET(conn, WT_CONN_RECOVERING))
+ WT_ERR(__wt_metadata_correct_base_write_gen(session));
+
+ /*
* Clear the dhandle so the visibility check doesn't get confused about the snap min. Don't
* bother restoring the handle since it doesn't make sense to carry a handle across a
* checkpoint.
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 9f32e9346f0..c76e2af3597 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -548,39 +548,6 @@ err:
}
/*
- * __recovery_correct_write_gen --
- * Update the connection's base write generation from all files in metadata.
- */
-static int
-__recovery_correct_write_gen(WT_SESSION_IMPL *session)
-{
- WT_CURSOR *cursor;
- WT_DECL_RET;
- char *config, *uri;
-
- uri = NULL;
- WT_RET(__wt_metadata_cursor(session, &cursor));
- while ((ret = cursor->next(cursor)) == 0) {
- WT_ERR(cursor->get_key(cursor, &uri));
-
- if (!WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "tiered:"))
- continue;
-
- WT_ERR(cursor->get_value(cursor, &config));
-
- /* Update base write gen to the write gen. */
- WT_ERR(__wt_metadata_update_base_write_gen(session, config));
- }
- WT_ERR_NOTFOUND_OK(ret, false);
-
-err:
- if (ret != 0 && uri != NULL)
- __wt_err(session, ret, "unable to correct write gen for %s", uri);
- WT_TRET(__wt_metadata_cursor_release(session, &cursor));
- return (ret);
-}
-
-/*
* __recovery_setup_file --
* Set up the recovery slot for a file, track the largest file ID, and update the base write gen
* based on the file's configuration.
@@ -1055,16 +1022,11 @@ done:
WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
/*
- * Rollback to stable may have left out clearing stale transaction ids. Update the connection
- * base write generation based on the latest checkpoint write generations to reset them.
- */
- if (rts_executed)
- WT_ERR(__recovery_correct_write_gen(session));
-
- /*
* Update the open dhandles write generations and base write generation with the connection's
* base write generation because the recovery checkpoint writes the pages to disk with new write
- * generation number which contains transaction ids that are needed to reset later.
+ * generation number which contains transaction ids that are needed to reset later. The
+ * connection level base write generation number is updated at the end of the recovery
+ * checkpoint.
*/
__wt_dhandle_update_write_gens(session);
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 3ec6bb95934..6004ddd3db2 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -281,50 +281,30 @@ err:
}
/*
- * __rollback_check_if_txnid_non_committed --
- * Check if the transaction id is non committed.
+ * __rollback_txn_visible_id --
+ * Check if the transaction id is visible or not.
*/
static bool
-__rollback_check_if_txnid_non_committed(WT_SESSION_IMPL *session, uint64_t txnid)
+__rollback_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
{
WT_CONNECTION_IMPL *conn;
- bool found;
conn = S2C(session);
- /* If not recovery then assume all the data as committed. */
+ /* If not recovery then assume all the data as visible. */
if (!F_ISSET(conn, WT_CONN_RECOVERING))
- return (false);
+ return (true);
/*
* Only full checkpoint writes the metadata with snapshot. If the recovered checkpoint snapshot
- * details are zero then return false i.e, updates are committed.
- */
- if (conn->recovery_ckpt_snap_min == 0 && conn->recovery_ckpt_snap_max == 0)
- return (false);
-
- /*
- * Snapshot data:
- * ids < recovery_ckpt_snap_min are committed,
- * ids > recovery_ckpt_snap_max are non committed,
- * everything else is committed unless it is found in the recovery_ckpt_snapshot array.
+ * details are none then return true, i.e., updates are visible.
*/
- if (txnid < conn->recovery_ckpt_snap_min)
- return (false);
- else if (txnid > conn->recovery_ckpt_snap_max)
+ if (conn->recovery_ckpt_snap_min == WT_TXN_NONE && conn->recovery_ckpt_snap_max == WT_TXN_NONE)
return (true);
- /*
- * Return false when the recovery snapshot count is 0, which means there is no uncommitted
- * transaction ids.
- */
- if (conn->recovery_ckpt_snapshot_count == 0)
- return (false);
-
- WT_BINARY_SEARCH(
- txnid, conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count, found);
-
- return (found);
+ return (
+ __wt_txn_visible_id_snapshot(id, conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max,
+ conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count));
}
/*
@@ -484,7 +464,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page
* Stop processing when we find a stable update according to the given timestamp and
* transaction id.
*/
- if (!__rollback_check_if_txnid_non_committed(session, hs_tw->start_txn) &&
+ if (__rollback_txn_visible_id(session, hs_tw->start_txn) &&
hs_durable_ts <= rollback_timestamp) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update valid with start timestamp: %s, durable timestamp: %s, stop "
@@ -562,7 +542,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page
* We have a tombstone on the original update chain and it is stable according to the
* timestamp and txnid, we need to restore that as well.
*/
- if (!__rollback_check_if_txnid_non_committed(session, hs_tw->stop_txn) &&
+ if (__rollback_txn_visible_id(session, hs_tw->stop_txn) &&
hs_stop_durable_ts <= rollback_timestamp) {
/*
* The restoring tombstone timestamp must be zero or less than previous update start
@@ -614,6 +594,9 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page
/* Finally remove that update from history store. */
if (valid_update_found) {
+ /* Avoid freeing the updates while still in use if hs_cursor->remove fails. */
+ upd = tombstone = NULL;
+
WT_ERR(hs_cursor->remove(hs_cursor));
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
@@ -692,7 +675,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_COL *cip, W
} else
return (0);
} else if (vpack->tw.durable_start_ts > rollback_timestamp ||
- __rollback_check_if_txnid_non_committed(session, vpack->tw.start_txn) ||
+ !__rollback_txn_visible_id(session, vpack->tw.start_txn) ||
(!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"on-disk update aborted with start durable timestamp: %s, commit timestamp: %s, "
@@ -713,7 +696,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_COL *cip, W
}
} else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) &&
(vpack->tw.durable_stop_ts > rollback_timestamp ||
- __rollback_check_if_txnid_non_committed(session, vpack->tw.stop_txn) || prepared)) {
+ !__rollback_txn_visible_id(session, vpack->tw.stop_txn) || prepared)) {
/*
* For prepared transactions, it is possible that both the on-disk key start and stop time
* windows can be the same. To abort these updates, check for any stable update from history
@@ -805,7 +788,7 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r
WT_PAGE *page;
uint64_t recno, rle;
uint32_t i, j;
- bool stable_update_found;
+ bool is_ondisk_stable, stable_update_found;
page = ref->page;
/*
@@ -824,26 +807,46 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r
WT_RET(__rollback_abort_insert_list(
session, page, ins, rollback_timestamp, &stable_update_found));
- if (!stable_update_found && page->dsk != NULL) {
+ if (page->dsk != NULL) {
+ /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */
kcell = WT_COL_PTR(page, cip);
__wt_cell_unpack_kv(session, page->dsk, kcell, &unpack);
rle = __wt_cell_rle(&unpack);
- if (unpack.type != WT_CELL_DEL) {
+
+ /*
+ * If we found a stable update on the insert list, this key needs no further attention.
+ * Any other keys in this cell with stable updates also do not require attention. But
+ * beyond that, the on-disk value must be older than
+ * the update we found. That means it too is stable(*), so any keys in the cell that
+ * _don't_ have stable updates on the update list don't need further attention either.
+ * (And any unstable updates were just handled above.) Thus we can skip iterating over
+ * the cell.
+ *
+ * Furthermore, if the cell is deleted it must be
+ * itself stable, because cells only appear as deleted if there is no older value that
+ * might need to be restored. We can skip iterating over the cell.
+ *
+ * (*) Either that, or the update is not timestamped, in which case the on-disk value
+ * might not be stable but the non-timestamp update will hide it until the next
+ * reconciliation and then overwrite it.
+ */
+ if (stable_update_found)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
+ else if (unpack.type == WT_CELL_DEL)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped);
+ else {
for (j = 0; j < rle; j++) {
- WT_RET(__rollback_abort_ondisk_kv(session, ref, cip, NULL, rollback_timestamp,
- recno + j, &stable_update_found));
- /* Skip processing all RLE if the on-disk version is stable. */
- if (stable_update_found) {
+ WT_RET(__rollback_abort_ondisk_kv(
+ session, ref, cip, NULL, rollback_timestamp, recno + j, &is_ondisk_stable));
+ /* We can stop right away if the on-disk version is stable. */
+ if (is_ondisk_stable) {
if (rle > 1)
WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
break;
}
}
- } else
- WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped);
+ }
recno += rle;
- } else {
- recno++;
}
}
@@ -1214,17 +1217,16 @@ __rollback_to_stable_check(WT_SESSION_IMPL *session)
bool txn_active;
/*
- * Help the user comply with the requirement that there are no concurrent operations. Protect
- * against spurious conflicts with the sweep server: we exclude it from running concurrent with
- * rolling back the history store contents.
+ * Help the user comply with the requirement that there are no concurrent user operations. It is
+ * okay to have a transaction in prepared state.
*/
- ret = __wt_txn_activity_check(session, &txn_active);
+ txn_active = __wt_txn_user_active(session);
#ifdef HAVE_DIAGNOSTIC
if (txn_active)
WT_TRET(__wt_verbose_dump_txn(session));
#endif
- if (ret == 0 && txn_active)
+ if (txn_active)
WT_RET_MSG(session, EINVAL, "rollback_to_stable illegal with active transactions");
return (ret);
@@ -1622,85 +1624,16 @@ err:
static int
__rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
{
- WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t rollback_timestamp;
- size_t retries;
- uint32_t cache_flags;
char ts_string[2][WT_TS_INT_STRING_SIZE];
conn = S2C(session);
- cache = conn->cache;
txn_global = &conn->txn_global;
/*
- * We're about to run a check for active transactions in the system to stop users from shooting
- * themselves in the foot. Eviction threads may interfere with this check if they involve writes
- * to the history store so we need to wait until the system is no longer evicting content.
- *
- * If we detect active evictions, we should wait a millisecond and check again. If we're waiting
- * for evictions to quiesce for more than 2 minutes, we should give up on waiting and proceed
- * with the transaction check anyway.
- */
-#define WT_RTS_EVICT_MAX_RETRIES (2 * WT_MINUTE * WT_THOUSAND)
- /*
- * These are the types of evictions that can result in a history store operation. Since we want
- * to avoid these happening concurrently with our check, we need to look for these flags.
- */
-#define WT_CACHE_EVICT_HS_FLAGS \
- (WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_UPDATES | WT_CACHE_EVICT_URGENT)
- for (retries = 0; retries < WT_RTS_EVICT_MAX_RETRIES; ++retries) {
- /*
- * If we're shutting down or running with an in-memory configuration, we aren't at risk of
- * racing with history store transactions.
- */
- if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP | WT_CONN_IN_MEMORY))
- break;
-
- /* Check whether eviction has quiesced. */
- WT_ORDERED_READ(cache_flags, cache->flags);
- if (!FLD_ISSET(cache_flags, WT_CACHE_EVICT_HS_FLAGS)) {
- /*
- * If we we find that the eviction flags are unset, interrupt the eviction server and
- * acquire the pass lock to stop the server from setting the eviction flags AFTER this
- * point and racing with our check.
- */
- (void)__wt_atomic_addv32(&cache->pass_intr, 1);
- __wt_spin_lock(session, &cache->evict_pass_lock);
- (void)__wt_atomic_subv32(&cache->pass_intr, 1);
- FLD_SET(session->lock_flags, WT_SESSION_LOCKED_PASS);
-
- /*
- * Check that the flags didn't get set in between when we checked and when we acquired
- * the server lock. If it did get set, release the locks and keep trying. If they're
- * still unset, break out of this loop and commence our check.
- */
- WT_ORDERED_READ(cache_flags, cache->flags);
- if (!FLD_ISSET(cache_flags, WT_CACHE_EVICT_HS_FLAGS))
- break;
- else {
- __wt_spin_unlock(session, &cache->evict_pass_lock);
- FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS);
- }
- }
- /* If we're retrying, pause for a millisecond and let eviction make some progress. */
- __wt_sleep(0, WT_THOUSAND);
- }
- if (retries == WT_RTS_EVICT_MAX_RETRIES) {
- WT_ERR(__wt_msg(
- session, "timed out waiting for eviction to quiesce, running rollback to stable"));
- /*
- * FIXME: WT-7877 RTS fails when there are active transactions running in parallel to it.
- * Waiting in a loop for eviction to quiesce is not efficient in some scenarios where the
- * cache is not cleared in 2 minutes. Enable the following assert and
- * test_rollback_to_stable22.py when the cache issue is addressed.
- */
- /* WT_ASSERT(session, false && "Timed out waiting for eviction to quiesce prior to rts"); */
- }
-
- /*
* Rollback to stable should ignore tombstones in the history store since it needs to scan the
* entire table sequentially.
*/
@@ -1708,11 +1641,6 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
WT_ERR(__rollback_to_stable_check(session));
- if (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_PASS)) {
- __wt_spin_unlock(session, &cache->evict_pass_lock);
- FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS);
- }
-
/*
* Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
* though the stable timestamp isn't supposed to be updated while rolling back, accessing it
@@ -1746,10 +1674,6 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt)
WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
err:
- if (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_PASS)) {
- __wt_spin_unlock(session, &cache->evict_pass_lock);
- FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS);
- }
F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE);
return (ret);
}
diff --git a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
index b7e7a9a952b..a2445225e2e 100644
--- a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
+++ b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
@@ -36,12 +36,26 @@ static int real_checkpointer(void);
static int verify_consistency(WT_SESSION *, char *);
/*
+ * set_stable --
+ * Set the stable timestamp from g.ts_stable.
+ */
+static void
+set_stable(void)
+{
+ char buf[128];
+
+ testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%x", g.ts_stable));
+ testutil_check(g.conn->set_timestamp(g.conn, buf));
+}
+
+/*
* start_checkpoints --
* Responsible for creating the checkpoint thread.
*/
void
start_checkpoints(void)
{
+ set_stable();
testutil_check(__wt_thread_create(NULL, &g.checkpoint_thread, checkpointer, NULL));
if (g.use_timestamps) {
testutil_check(__wt_rwlock_init(NULL, &g.clock_lock));
@@ -74,7 +88,6 @@ clock_thread(void *arg)
WT_SESSION *wt_session;
WT_SESSION_IMPL *session;
uint64_t delay;
- char buf[128];
WT_UNUSED(arg);
@@ -85,8 +98,7 @@ clock_thread(void *arg)
while (g.running) {
__wt_writelock(session, &g.clock_lock);
++g.ts_stable;
- testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%x", g.ts_stable));
- testutil_check(g.conn->set_timestamp(g.conn, buf));
+ set_stable();
if (g.ts_stable % 997 == 0) {
/*
* Random value between 6 and 10 seconds.
diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
index 90378cc0de9..1d4c99a2b03 100644
--- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
+++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
@@ -140,7 +140,8 @@ main(int argc, char *argv[])
testutil_work_dir_from_path(g.home, 512, working_dir);
- g.ts_stable = 0;
+ /* Start time at 1 since 0 is not a valid timestamp. */
+ g.ts_stable = 1;
printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid());
for (cnt = 1; (runs == 0 || cnt <= runs) && g.status == 0; ++cnt) {
diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am
index f5c3eaed361..f5da81e75ed 100644
--- a/src/third_party/wiredtiger/test/csuite/Makefile.am
+++ b/src/third_party/wiredtiger/test/csuite/Makefile.am
@@ -37,7 +37,7 @@ all_TESTS += timestamp_abort/smoke.sh
test_truncated_log_SOURCES = truncated_log/main.c
noinst_PROGRAMS += test_truncated_log
-all_TESTS += test_truncated_log
+all_TESTS += truncated_log/smoke.sh
test_wt1965_col_efficiency_SOURCES = wt1965_col_efficiency/main.c
noinst_PROGRAMS += test_wt1965_col_efficiency
@@ -49,8 +49,7 @@ all_TESTS += test_wt2403_lsm_workload
test_wt2246_col_append_SOURCES = wt2246_col_append/main.c
noinst_PROGRAMS += test_wt2246_col_append
-# Temporarily disabled (WT-5790)
-# all_TESTS += test_wt2246_col_append
+all_TESTS += test_wt2246_col_append
test_wt2323_join_visibility_SOURCES = wt2323_join_visibility/main.c
noinst_PROGRAMS += test_wt2323_join_visibility
@@ -146,11 +145,11 @@ all_TESTS += test_wt4891_meta_ckptlist_get_alloc
test_wt6185_modify_ts_SOURCES = wt6185_modify_ts/main.c
noinst_PROGRAMS += test_wt6185_modify_ts
-all_TESTS += test_wt6185_modify_ts
+all_TESTS += wt6185_modify_ts/smoke.sh
test_wt6616_checkpoint_oldest_ts_SOURCES = wt6616_checkpoint_oldest_ts/main.c
noinst_PROGRAMS += test_wt6616_checkpoint_oldest_ts
-all_TESTS += test_wt6616_checkpoint_oldest_ts
+all_TESTS += wt6616_checkpoint_oldest_ts/smoke.sh
# Run this during a "make check" smoke test.
TESTS = $(all_TESTS)
diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
index e16c7d7e79e..acf14348c0a 100644
--- a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
@@ -75,7 +75,7 @@ static const char *const uri_collection = "table:collection";
static const char *const ckpt_file = "checkpoint_done";
-static bool compat, inmem, stable_set, use_ts, use_txn;
+static bool compat, inmem, stable_set, use_columns, use_ts, use_txn;
static volatile uint64_t global_ts = 1;
static volatile uint64_t uid = 1;
typedef struct {
@@ -96,9 +96,10 @@ static volatile THREAD_TS th_ts[MAX_TH];
/*
* A minimum width of 10, along with zero filling, means that all the keys sort according to their
- * integer value, making each thread's key space distinct.
+ * integer value, making each thread's key space distinct. For column-store we just use the integer
+ * values and that has the same effect.
*/
-#define KEY_FORMAT ("%010" PRIu64)
+#define ROW_KEY_FORMAT ("%010" PRIu64)
typedef struct {
uint64_t absent_key; /* Last absent key */
@@ -670,14 +671,20 @@ thread_run(void *arg)
}
if (use_ts)
stable_ts = __wt_atomic_addv64(&global_ts, 1);
- testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, i));
testutil_check(session->begin_transaction(session, NULL));
if (use_prep)
testutil_check(oplog_session->begin_transaction(oplog_session, NULL));
- cur_coll->set_key(cur_coll, kname);
- cur_local->set_key(cur_local, kname);
- cur_oplog->set_key(cur_oplog, kname);
+ if (use_columns) {
+ cur_coll->set_key(cur_coll, i + 1);
+ cur_local->set_key(cur_local, i + 1);
+ cur_oplog->set_key(cur_oplog, i + 1);
+ } else {
+ testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, i));
+ cur_coll->set_key(cur_coll, kname);
+ cur_local->set_key(cur_local, kname);
+ cur_oplog->set_key(cur_oplog, kname);
+ }
/*
* Put an informative string into the value so that it can be viewed well in a binary dump.
*/
@@ -764,7 +771,7 @@ run_workload(uint32_t nth)
THREAD_DATA *td;
wt_thread_t *thr;
uint32_t ckpt_id, i, ts_id;
- char envconf[512];
+ char envconf[512], tableconf[128];
thr = dcalloc(nth + 2, sizeof(*thr));
td = dcalloc(nth + 2, sizeof(THREAD_DATA));
@@ -783,10 +790,13 @@ run_workload(uint32_t nth)
/*
* Create all the tables.
*/
- testutil_check(
- session->create(session, uri_collection, "key_format=S,value_format=u,log=(enabled=false)"));
- testutil_check(session->create(session, uri_local, "key_format=S,value_format=u"));
- testutil_check(session->create(session, uri_oplog, "key_format=S,value_format=u"));
+ testutil_check(__wt_snprintf(tableconf, sizeof(tableconf),
+ "key_format=%s,value_format=u,log=(enabled=false)", use_columns ? "r" : "S"));
+ testutil_check(session->create(session, uri_collection, tableconf));
+ testutil_check(__wt_snprintf(
+ tableconf, sizeof(tableconf), "key_format=%s,value_format=u", use_columns ? "r" : "S"));
+ testutil_check(session->create(session, uri_local, tableconf));
+ testutil_check(session->create(session, uri_oplog, tableconf));
/*
* Don't log the stable timestamp table so that we know what timestamp was stored at the
* checkpoint.
@@ -909,11 +919,15 @@ main(int argc, char *argv[])
verify_only = false;
working_dir = "WT_TEST.schema-abort";
- while ((ch = __wt_getopt(progname, argc, argv, "Ch:mT:t:vxz")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "Cch:mT:t:vxz")) != EOF)
switch (ch) {
case 'C':
compat = true;
break;
+ case 'c':
+ /* Variable-length columns only; fixed would require considerable changes */
+ use_columns = true;
+ break;
case 'h':
working_dir = __wt_optarg;
break;
@@ -1087,10 +1101,16 @@ main(int argc, char *argv[])
key, last_key);
break;
}
- testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, key));
- cur_coll->set_key(cur_coll, kname);
- cur_local->set_key(cur_local, kname);
- cur_oplog->set_key(cur_oplog, kname);
+ if (use_columns) {
+ cur_coll->set_key(cur_coll, key + 1);
+ cur_local->set_key(cur_local, key + 1);
+ cur_oplog->set_key(cur_oplog, key + 1);
+ } else {
+ testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, key));
+ cur_coll->set_key(cur_coll, kname);
+ cur_local->set_key(cur_local, kname);
+ cur_oplog->set_key(cur_oplog, kname);
+ }
/*
* The collection table should always only have the data as of the checkpoint.
*/
diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh
index 5e82ae180bc..e7d21ec30e6 100755
--- a/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh
+++ b/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh
@@ -21,6 +21,14 @@ $TEST_WRAPPER $test_bin -t 10 -T 5
$TEST_WRAPPER $test_bin -m -t 10 -T 5
$TEST_WRAPPER $test_bin -C -t 10 -T 5
$TEST_WRAPPER $test_bin -C -m -t 10 -T 5
+
+$TEST_WRAPPER $test_bin -c -t 10 -T 5
+$TEST_WRAPPER $test_bin -c -m -t 10 -T 5
+$TEST_WRAPPER $test_bin -c -C -t 10 -T 5
+$TEST_WRAPPER $test_bin -c -C -m -t 10 -T 5
+
# FIXME: In WT-6116 the test is failing if timestamps are turned off.
#$TEST_WRAPPER $test_bin -m -t 10 -T 5 -z
+#$TEST_WRAPPER $test_bin -c -m -t 10 -T 5 -z
$TEST_WRAPPER $test_bin -m -t 10 -T 5 -x
+$TEST_WRAPPER $test_bin -c -m -t 10 -T 5 -x
diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
index 6fa41f0d82c..1d59222104b 100644
--- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
@@ -79,7 +79,7 @@ static const char *const uri_shadow = "shadow";
static const char *const ckpt_file = "checkpoint_done";
-static bool compat, inmem, stress, use_ts;
+static bool columns, compat, inmem, stress, use_ts;
static volatile uint64_t global_ts = 1;
/*
@@ -107,9 +107,10 @@ static volatile uint64_t global_ts = 1;
/*
* A minimum width of 10, along with zero filling, means that all the keys sort according to their
- * integer value, making each thread's key space distinct.
+ * integer value, making each thread's key space distinct. For column-store we just use the integer
+ * values and that has the same effect.
*/
-#define KEY_FORMAT ("%010" PRIu64)
+#define KEY_STRINGFORMAT ("%010" PRIu64)
typedef struct {
uint64_t absent_key; /* Last absent key */
@@ -334,8 +335,6 @@ thread_run(void *arg)
printf("Thread %" PRIu32 " starts at %" PRIu64 "\n", td->info, td->start);
active_ts = 0;
for (i = td->start;; ++i) {
- testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, i));
-
testutil_check(session->begin_transaction(session, NULL));
if (use_prep)
testutil_check(prepared_session->begin_transaction(prepared_session, NULL));
@@ -354,10 +353,18 @@ thread_run(void *arg)
testutil_check(pthread_rwlock_unlock(&ts_lock));
}
- cur_coll->set_key(cur_coll, kname);
- cur_local->set_key(cur_local, kname);
- cur_oplog->set_key(cur_oplog, kname);
- cur_shadow->set_key(cur_shadow, kname);
+ if (columns) {
+ cur_coll->set_key(cur_coll, i + 1);
+ cur_local->set_key(cur_local, i + 1);
+ cur_oplog->set_key(cur_oplog, i + 1);
+ cur_shadow->set_key(cur_shadow, i + 1);
+ } else {
+ testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_STRINGFORMAT, i));
+ cur_coll->set_key(cur_coll, kname);
+ cur_local->set_key(cur_local, kname);
+ cur_oplog->set_key(cur_oplog, kname);
+ cur_shadow->set_key(cur_shadow, kname);
+ }
/*
* Put an informative string into the value so that it can be viewed well in a binary dump.
*/
@@ -459,6 +466,7 @@ run_workload(uint32_t nth)
wt_thread_t *thr;
uint32_t cache_mb, ckpt_id, i, ts_id;
char envconf[512], uri[128];
+ const char *table_config, *table_config_nolog;
thr = dcalloc(nth + 2, sizeof(*thr));
td = dcalloc(nth + 2, sizeof(THREAD_DATA));
@@ -495,19 +503,25 @@ run_workload(uint32_t nth)
printf("wiredtiger_open configuration: %s\n", envconf);
testutil_check(wiredtiger_open(NULL, NULL, envconf, &conn));
testutil_check(conn->open_session(conn, NULL, NULL, &session));
+
/*
* Create all the tables.
*/
+ if (columns) {
+ table_config_nolog = "key_format=r,value_format=u,log=(enabled=false)";
+ table_config = "key_format=r,value_format=u";
+ } else {
+ table_config_nolog = "key_format=S,value_format=u,log=(enabled=false)";
+ table_config = "key_format=S,value_format=u";
+ }
testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_collection));
- testutil_check(
- session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)"));
+ testutil_check(session->create(session, uri, table_config_nolog));
testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow));
- testutil_check(
- session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)"));
+ testutil_check(session->create(session, uri, table_config_nolog));
testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_local));
- testutil_check(session->create(session, uri, "key_format=S,value_format=u"));
+ testutil_check(session->create(session, uri, table_config));
testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_oplog));
- testutil_check(session->create(session, uri, "key_format=S,value_format=u"));
+ testutil_check(session->create(session, uri, table_config));
/*
* Don't log the stable timestamp table so that we know what timestamp was stored at the
* checkpoint.
@@ -616,7 +630,7 @@ main(int argc, char *argv[])
(void)testutil_set_progname(argv);
- compat = inmem = stress = false;
+ columns = compat = inmem = stress = false;
use_ts = true;
nth = MIN_TH;
rand_th = rand_time = true;
@@ -624,11 +638,15 @@ main(int argc, char *argv[])
verify_only = false;
working_dir = "WT_TEST.timestamp-abort";
- while ((ch = __wt_getopt(progname, argc, argv, "Ch:LmsT:t:vz")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "Cch:LmsT:t:vz")) != EOF)
switch (ch) {
case 'C':
compat = true;
break;
+ case 'c':
+ /* Variable-length columns only (for now) */
+ columns = true;
+ break;
case 'h':
working_dir = __wt_optarg;
break;
@@ -699,9 +717,9 @@ main(int argc, char *argv[])
compat ? "true" : "false", inmem ? "true" : "false", stress ? "true" : "false",
use_ts ? "true" : "false");
printf("Parent: Create %" PRIu32 " threads; sleep %" PRIu32 " seconds\n", nth, timeout);
- printf("CONFIG: %s%s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname,
- compat ? " -C" : "", inmem ? " -m" : "", stress ? " -s" : "", !use_ts ? " -z" : "",
- working_dir, nth, timeout);
+ printf("CONFIG: %s%s%s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname,
+ compat ? " -C" : "", columns ? " -c" : "", inmem ? " -m" : "", stress ? " -s" : "",
+ !use_ts ? " -z" : "", working_dir, nth, timeout);
/*
* Fork a child to insert as many items. We will then randomly kill the child, run recovery
* and make sure all items we wrote exist after recovery runs.
@@ -823,11 +841,20 @@ main(int argc, char *argv[])
key, last_key);
break;
}
- testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, key));
- cur_coll->set_key(cur_coll, kname);
- cur_local->set_key(cur_local, kname);
- cur_oplog->set_key(cur_oplog, kname);
- cur_shadow->set_key(cur_shadow, kname);
+
+ if (columns) {
+ cur_coll->set_key(cur_coll, key + 1);
+ cur_local->set_key(cur_local, key + 1);
+ cur_oplog->set_key(cur_oplog, key + 1);
+ cur_shadow->set_key(cur_shadow, key + 1);
+ } else {
+ testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_STRINGFORMAT, key));
+ cur_coll->set_key(cur_coll, kname);
+ cur_local->set_key(cur_local, kname);
+ cur_oplog->set_key(cur_oplog, kname);
+ cur_shadow->set_key(cur_shadow, kname);
+ }
+
/*
* The collection table should always only have the data as of the checkpoint. The
* shadow table should always have the exact same data (or not) as the collection table,
diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
index 18d7f9b8dae..b2c70340f4c 100755
--- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
+++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
@@ -23,8 +23,12 @@ then
fi
$TEST_WRAPPER $test_bin $default_test_args
+$TEST_WRAPPER $test_bin $default_test_args -c
#$TEST_WRAPPER $test_bin $default_test_args -L
$TEST_WRAPPER $test_bin -m $default_test_args
+$TEST_WRAPPER $test_bin -m $default_test_args -c
#$TEST_WRAPPER $test_bin -m $default_test_args -L
$TEST_WRAPPER $test_bin -C $default_test_args
+$TEST_WRAPPER $test_bin -C $default_test_args -c
$TEST_WRAPPER $test_bin -C -m $default_test_args
+$TEST_WRAPPER $test_bin -C -m $default_test_args -c
diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
index d46b75d48c9..fa45e573781 100644
--- a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
+++ b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
@@ -32,6 +32,7 @@
static char home[1024]; /* Program working dir */
static const char *const uri = "table:main";
+static bool use_columns = false;
#define RECORDS_FILE "records"
@@ -128,8 +129,14 @@ fill_db(void)
WT_SESSION *session;
uint32_t i, max_key, min_key, units, unused;
char k[K_SIZE], v[V_SIZE];
+ const char *table_config;
bool first;
+ if (use_columns)
+ table_config = "key_format=r,value_format=S";
+ else
+ table_config = "key_format=S,value_format=S";
+
/*
* Run in the home directory so that the records file is in there too.
*/
@@ -137,7 +144,7 @@ fill_db(void)
testutil_die(errno, "chdir: %s", home);
testutil_check(wiredtiger_open(NULL, NULL, ENV_CONFIG, &conn));
testutil_check(conn->open_session(conn, NULL, NULL, &session));
- testutil_check(session->create(session, uri, "key_format=S,value_format=S"));
+ testutil_check(session->create(session, uri, table_config));
testutil_check(session->open_cursor(session, uri, NULL, NULL, &cursor));
/*
@@ -164,10 +171,14 @@ fill_db(void)
max_key = min_key * 2;
first = true;
for (i = 0; i < max_key; ++i) {
- testutil_check(__wt_snprintf(k, sizeof(k), "key%03d", (int)i));
+ if (use_columns)
+ cursor->set_key(cursor, i + 1);
+ else {
+ testutil_check(__wt_snprintf(k, sizeof(k), "key%03" PRIu32, i));
+ cursor->set_key(cursor, k);
+ }
testutil_check(
- __wt_snprintf(v, sizeof(v), "value%0*d", (int)(V_SIZE - (strlen("value") + 1)), (int)i));
- cursor->set_key(cursor, k);
+ __wt_snprintf(v, sizeof(v), "value%0*" PRIu32, (int)(V_SIZE - (strlen("value") + 1)), i));
cursor->set_value(cursor, v);
testutil_check(cursor->insert(cursor));
@@ -230,8 +241,12 @@ main(int argc, char *argv[])
(void)testutil_set_progname(argv);
working_dir = "WT_TEST.truncated-log";
- while ((ch = __wt_getopt(progname, argc, argv, "h:")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "ch:")) != EOF)
switch (ch) {
+ case 'c':
+ /* Variable-length columns only (for now) */
+ use_columns = true;
+ break;
case 'h':
working_dir = __wt_optarg;
break;
diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh b/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh
new file mode 100755
index 00000000000..0079adf0340
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh
@@ -0,0 +1,20 @@
+#! /bin/sh
+
+set -e
+
+# Smoke-test truncated_log as part of running "make check".
+
+if [ -n "$1" ]
+then
+ # If the test binary is passed in manually.
+ test_bin=$1
+else
+ # If $top_builddir/$top_srcdir aren't set, default to building in build_posix
+ # and running in test/csuite.
+ top_builddir=${top_builddir:-../../build_posix}
+ top_srcdir=${top_srcdir:-../..}
+ test_bin=$top_builddir/test/csuite/test_truncated_log
+fi
+
+$TEST_WRAPPER $test_bin
+$TEST_WRAPPER $test_bin -c
diff --git a/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c b/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c
index 6b54402fe66..307f47578e3 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c
@@ -89,8 +89,8 @@ int
main(int argc, char *argv[])
{
WT_SESSION *session;
+ wt_thread_t idlist[100];
clock_t ce, cs;
- pthread_t idlist[100];
uint64_t i, id;
char buf[100];
@@ -125,15 +125,16 @@ main(int argc, char *argv[])
(void)signal(SIGINT, onsig);
+ memset(idlist, 0, sizeof(idlist));
cs = clock();
id = 0;
for (i = 0; i < opts->n_append_threads; ++i, ++id) {
printf("append: %" PRIu64 "\n", id);
- testutil_check(pthread_create(&idlist[id], NULL, thread_append, opts));
+ testutil_check(__wt_thread_create(NULL, &idlist[id], thread_append, opts));
}
for (i = 0; i < id; ++i)
- testutil_check(pthread_join(idlist[i], NULL));
+ testutil_check(__wt_thread_join(NULL, &idlist[i]));
ce = clock();
printf("%" PRIu64 "M records: %.2lf processor seconds\n", opts->max_inserted_id / MILLION,
diff --git a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c
index 1f2e824047b..02205c88429 100644
--- a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c
@@ -50,7 +50,10 @@ static u_int tnext;
static uint64_t ts; /* Current timestamp. */
-static char key[100], modify_repl[256], tmp[4 * 1024];
+static char keystr[100], modify_repl[256], tmp[4 * 1024];
+static uint64_t keyrecno;
+
+static bool use_columns = false;
/*
* trace --
@@ -117,6 +120,32 @@ mmrand(u_int min, u_int max)
}
/*
+ * change_key --
+ * Switch to a different key.
+ */
+static void
+change_key(u_int n)
+{
+ if (use_columns)
+ keyrecno = n + 1;
+ else
+ testutil_check(__wt_snprintf(keystr, sizeof(keystr), "%010u.key", n));
+}
+
+/*
+ * set_key --
+ * Set the current key in the cursor.
+ */
+static void
+set_key(WT_CURSOR *c)
+{
+ if (use_columns)
+ c->set_key(c, keyrecno);
+ else
+ c->set_key(c, keystr);
+}
+
+/*
* modify_repl_init --
* Initialize the replacement information.
*/
@@ -181,13 +210,13 @@ modify(WT_SESSION *session, WT_CURSOR *c)
for (cnt = loop = 1; loop < 5; ++cnt, ++loop)
if (mmrand(1, 10) <= 8) {
modify_build(entries, &nentries, cnt);
- c->set_key(c, key);
+ set_key(c);
testutil_check(c->modify(c, entries, nentries));
}
/* Commit 90% of the time, else rollback. */
if (mmrand(1, 10) != 1) {
- c->set_key(c, key);
+ set_key(c);
testutil_check(c->search(c));
testutil_check(c->get_value(c, &v));
free(list[lnext].v);
@@ -223,7 +252,7 @@ repeat(WT_SESSION *session, WT_CURSOR *c)
testutil_check(__wt_snprintf(tmp, sizeof(tmp), "read_timestamp=%" PRIx64, list[i].ts));
testutil_check(session->timestamp_transaction(session, tmp));
- c->set_key(c, key);
+ set_key(c);
testutil_check(c->search(c));
testutil_check(c->get_value(c, &v));
@@ -246,7 +275,7 @@ evict(WT_CURSOR *c)
{
trace("%s", "eviction");
- c->set_key(c, key);
+ set_key(c);
testutil_check(c->search(c));
F_SET(c, WT_CURSTD_DEBUG_RESET_EVICT);
testutil_check(c->reset(c));
@@ -286,7 +315,7 @@ main(int argc, char *argv[])
WT_SESSION *session;
u_int i, j;
int ch;
- char path[1024], value[VALUE_SIZE];
+ char path[1024], table_config[128], value[VALUE_SIZE];
const char *home, *v;
bool no_checkpoint, no_eviction;
@@ -298,8 +327,12 @@ main(int argc, char *argv[])
no_checkpoint = no_eviction = false;
home = "WT_TEST.wt6185_modify_ts";
- while ((ch = __wt_getopt(progname, argc, argv, "ceh:S:")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "Cceh:S:")) != EOF)
switch (ch) {
+ case 'C':
+ /* Variable-length columns only (for now anyway) */
+ use_columns = true;
+ break;
case 'c':
no_checkpoint = true;
break;
@@ -322,14 +355,17 @@ main(int argc, char *argv[])
testutil_work_dir_from_path(path, sizeof(path), home);
testutil_make_work_dir(path);
+ testutil_check(__wt_snprintf(
+ table_config, sizeof(table_config), "key_format=%s,value_format=S", use_columns ? "r" : "S"));
+
/* Load 100 records. */
testutil_check(wiredtiger_open(path, NULL, "create", &conn));
testutil_check(conn->open_session(conn, NULL, NULL, &session));
- testutil_check(session->create(session, "file:xxx", "key_format=S,value_format=S"));
+ testutil_check(session->create(session, "file:xxx", table_config));
testutil_check(session->open_cursor(session, "file:xxx", NULL, NULL, &c));
for (i = 0; i <= 100; ++i) {
- testutil_check(__wt_snprintf(key, sizeof(key), "%010u.key", i));
- c->set_key(c, key);
+ change_key(i);
+ set_key(c);
SET_VALUE(i, value);
c->set_value(c, value);
testutil_check(c->insert(c));
@@ -341,8 +377,8 @@ main(int argc, char *argv[])
testutil_check(conn->open_session(conn, NULL, NULL, &session));
testutil_check(session->create(session, "file:xxx", NULL));
testutil_check(session->open_cursor(session, "file:xxx", NULL, NULL, &c));
- testutil_check(__wt_snprintf(key, sizeof(key), "%010d.key", KEYNO));
- c->set_key(c, key);
+ change_key(KEYNO);
+ set_key(c);
testutil_check(c->search(c));
testutil_check(c->get_value(c, &v));
SET_VALUE(KEYNO, value);
diff --git a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh
new file mode 100755
index 00000000000..b317eeeb2ed
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh
@@ -0,0 +1,21 @@
+#! /bin/sh
+
+set -e
+
+# Smoke-test wt6185_modify_ts as part of running "make check".
+
+if [ -n "$1" ]
+then
+ # If the test binary is passed in manually.
+ test_bin=$1
+else
+ # If $top_builddir/$top_srcdir aren't set, default to building in build_posix
+ # and running in test/csuite.
+ top_builddir=${top_builddir:-../../build_posix}
+ top_srcdir=${top_srcdir:-../..}
+ test_bin=$top_builddir/test/csuite/test_wt6185_modify_ts
+fi
+
+$TEST_WRAPPER $test_bin
+$TEST_WRAPPER $test_bin -C
+
diff --git a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c
index 5a75777aa78..2e9648efea0 100644
--- a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c
@@ -32,6 +32,7 @@
#include <signal.h>
static char home[1024]; /* Program working dir */
+static bool use_columns = false;
/*
* Spin up a child process to do operations and checkpoint. For each set of operations on a key,
@@ -48,7 +49,7 @@ static char home[1024]; /* Program working dir */
* recovery by reading without a timestamp. Whether it is possible to read historical versions based
* on timestamps from a logged table after recovery is not defined and implemented yet.
*/
-#define KEY_FORMAT ("%010" PRIu64)
+#define ROW_KEY_FORMAT ("%010" PRIu64)
#define MAX_CKPT_INVL 5 /* Maximum interval between checkpoints */
#define MAX_DATA 1000
@@ -147,11 +148,14 @@ thread_run(void *arg)
/* Insert and then delete the keys until we're killed. */
printf("Worker thread started.\n");
for (oldest_ts = 0, ts = 1;; ++ts) {
- testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, ts));
+ testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, ts));
/* Insert the same value for key and value. */
testutil_check(session->begin_transaction(session, NULL));
- cursor->set_key(cursor, kname);
+ if (use_columns)
+ cursor->set_key(cursor, ts);
+ else
+ cursor->set_key(cursor, kname);
data.data = kname;
data.size = sizeof(kname);
cursor->set_value(cursor, &data);
@@ -193,7 +197,7 @@ run_workload(void)
WT_SESSION *session;
wt_thread_t *thr;
uint32_t i;
- char envconf[512];
+ char envconf[512], tableconf[512];
thr = dcalloc(2, sizeof(*thr));
@@ -206,8 +210,9 @@ run_workload(void)
testutil_check(conn->open_session(conn, NULL, NULL, &session));
/* Create the table. */
- testutil_check(
- session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)"));
+ testutil_check(__wt_snprintf(tableconf, sizeof(tableconf),
+ "key_format=%s,value_format=u,log=(enabled=false)", use_columns ? "r" : "S"));
+ testutil_check(session->create(session, uri, tableconf));
testutil_check(session->close(session, NULL));
/* The checkpoint thread is added at the end. */
@@ -268,8 +273,12 @@ main(int argc, char *argv[])
timeout = MIN_TIME;
working_dir = "WT_TEST.wt6616-checkpoint-oldest-ts";
- while ((ch = __wt_getopt(progname, argc, argv, "h:t:")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "ch:t:")) != EOF)
switch (ch) {
+ case 'c':
+ /* Variable-length columns only (for now) */
+ use_columns = true;
+ break;
case 'h':
working_dir = __wt_optarg;
break;
@@ -363,8 +372,11 @@ main(int argc, char *argv[])
for (ts = oldest_ts; ts <= stable_ts; ++ts) {
testutil_check(__wt_snprintf(tscfg, sizeof(tscfg), "read_timestamp=%" PRIx64, ts));
testutil_check(session->begin_transaction(session, tscfg));
- testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, ts));
- cursor->set_key(cursor, kname);
+ testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, ts));
+ if (use_columns)
+ cursor->set_key(cursor, ts);
+ else
+ cursor->set_key(cursor, kname);
ret = cursor->search(cursor);
if (ret == WT_NOTFOUND) {
fatal = true;
diff --git a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh
new file mode 100755
index 00000000000..9b9cc997026
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh
@@ -0,0 +1,21 @@
+#! /bin/sh
+
+set -e
+
+# Smoke-test wt6616_checkpoint_oldest_ts as part of running "make check".
+
+if [ -n "$1" ]
+then
+    # If the test binary is passed in manually.
+    test_bin=$1
+else
+    # If $top_builddir/$top_srcdir aren't set, default to building in build_posix
+    # and running in test/csuite.
+    top_builddir=${top_builddir:-../../build_posix}
+    top_srcdir=${top_srcdir:-../..}
+    test_bin=$top_builddir/test/csuite/test_wt6616_checkpoint_oldest_ts
+fi
+
+# Run once with the default (row-store) table and once with -c (column-store).
+# $test_bin is quoted so a build path containing whitespace is not word-split.
+# $TEST_WRAPPER is left unquoted on purpose: presumably it may be empty or hold a
+# command plus arguments, both of which rely on word splitting.
+$TEST_WRAPPER "$test_bin"
+$TEST_WRAPPER "$test_bin" -c
+
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index 5bc5fa6580b..41362d91097 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -2580,6 +2580,17 @@ tasks:
name: recovery-stress-test-3
tags: ["stress-test-3", "stress-test-zseries-3"]
+ - name: format-abort-recovery-stress-test
+ commands:
+ - command: timeout.update
+ params:
+ exec_timeout_secs: 2500
+ - func: "get project"
+ - func: "compile wiredtiger with builtins"
+ - func: "format test script"
+ vars:
+ format_test_script_args: -a -t 30
+
- name: many-dhandle-stress-test
commands:
- func: "get project"
@@ -2830,6 +2841,7 @@ buildvariants:
- name: ".stress-test-2"
- name: ".stress-test-3"
- name: ".stress-test-4"
+ - name: format-abort-recovery-stress-test
- name: large-scale-tests
display_name: "Large scale tests"
@@ -2856,13 +2868,11 @@ buildvariants:
run_on:
- ubuntu1804-test
expansions:
- test_env_vars: LD_LIBRARY_PATH=$(pwd)/../../.libs
+ test_env_vars: LD_LIBRARY_PATH=$(pwd)/../../.libs PATH=/opt/mongodbtoolchain/v3/bin:$PATH
make_command: PATH=/opt/mongodbtoolchain/v3/bin:$PATH make
posix_configure_flags:
--enable-silent-rules --enable-python --enable-zlib --enable-snappy
--enable-strict --enable-static
- test_env_vars:
- PATH=/opt/mongodbtoolchain/v3/bin:$PATH
tasks:
- name: compile
- name: cppsuite-hs-cleanup-stress
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 88964902d98..33b8d7a9112 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -715,6 +715,8 @@ config_in_memory(void)
return;
if (config_is_perm("checkpoint"))
return;
+ if (config_is_perm("format.abort"))
+ return;
if (config_is_perm("import"))
return;
if (config_is_perm("logging"))
diff --git a/src/third_party/wiredtiger/test/suite/test_hs18.py b/src/third_party/wiredtiger/test/suite/test_hs18.py
index bcef53e4d17..5ed21e3c90a 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs18.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs18.py
@@ -438,8 +438,6 @@ class test_hs18(wttest.WiredTigerTestCase):
session_ts_reader = self.setUpSessionOpen(self.conn)
cursor_ts_reader = session_ts_reader.open_cursor(uri)
- self.skipTest('Skip this part of test_hs18 until WT-7931 is resolved')
-
# The ID of the session corresponds the value it should see.
sessions = []
cursors = []
@@ -448,8 +446,6 @@ class test_hs18(wttest.WiredTigerTestCase):
sessions.append(self.setUpSessionOpen(self.conn))
cursors.append(sessions[i].open_cursor(uri))
- value_junk = 'aaaaa' * 100
-
values.append('f' * 10)
values.append('a' + values[0])
values.append('b' + values[1])
@@ -485,11 +481,13 @@ class test_hs18(wttest.WiredTigerTestCase):
# Start a long running transaction which could see modify 1.
self.start_txn(sessions, cursors, values, 2)
- # Insert a bunch of contents to fill the cache
- for i in range(2000, 10000):
- self.session.begin_transaction()
- cursor[self.create_key(i)] = value_junk
- self.session.commit_transaction()
+ # Evict the update using a debug cursor
+ cursor.reset()
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+ evict_cursor.set_key(self.create_key(1))
+ self.assertEqual(evict_cursor.search(), 0)
+ evict_cursor.reset()
+ evict_cursor.close()
# Commit a modify without a timestamp on our original key
self.session.begin_transaction()
@@ -511,11 +509,13 @@ class test_hs18(wttest.WiredTigerTestCase):
for i in range(0, 5):
self.check_value(cursors[i], values[i])
- # Insert a bunch of other contents to trigger eviction
- for i in range(10001, 11000):
- self.session.begin_transaction()
- cursor[self.create_key(i)] = value_junk
- self.session.commit_transaction()
+ # Evict the update using a debug cursor
+ cursor.reset()
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+ evict_cursor.set_key(self.create_key(1))
+ self.assertEqual(evict_cursor.search(), 0)
+ evict_cursor.reset()
+ evict_cursor.close()
# Check our values are still correct.
for i in range(0, 5):
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py
index f6ef52cc388..7c85800b070 100644
--- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py
@@ -43,8 +43,6 @@ class test_rollback_to_stable22(test_rollback_to_stable_base):
nrows = 1000
nds = 10
- self.skipTest('Skip it until the fix is provided to handle concurrent internal transactions running in parallel.')
-
# Create a few tables and populate them with some initial data.
#
# Our way of preventing history store operations from interfering with rollback to stable's
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py
new file mode 100644
index 00000000000..ea690506dc9
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+# test_rollback_to_stable24.py
+# Exercise a recno-counting bug in column store.
+#
+# Prior to August 2021 a cell for which there's a pending stable update was counted (in the
+# column-store RTS code) as having RLE count 1 regardless of what the actual count was.
+#
+# In order to exploit this we have to do janky things with timestamps, but I think they're
+# allowable.
+#
+# Construct a cell with RLE count of 3 by writing 3 copies of aaaaaa at timestamp 10.
+# Then at the next key write bbbbbb at timestamp 10 and cccccc at timestamp 50.
+# Evict the page to reconcile it and produce the RLE cell.
+#
+# Then post an update to the first key of the RLE cell at timestamp 30 (to dddddd), and roll
+# back to 40.
+#
+# Reading at 40, we should at that point see dddddd and two aaaaaa's followed by bbbbbb, but
+# with the bad counting we get a key error on the second key.
+#
+# This happens because it goes to process key 4 but thinks it's on key 2; it finds that it
+# needs to roll back the value it's looking at (the cccccc from timestamp 50) but because it
+# thinks it's on key 2 it asks the history store for key 2 and finds nothing. (The bbbbbb
+# from timestamp 10 is in the history store, but under key 4; there's nothing in the history
+# store for key 2.) So it issues a tombstone, and issues it for key 2, so key 2 improperly
+# disappears.
+#
+# Run this test on rows as well as columns to help make sure the test itself is valid (and
+# stays so over time...)
+class test_rollback_to_stable24(wttest.WiredTigerTestCase):
+    # Sessions run at snapshot isolation and the connection is explicitly not in-memory.
+    session_config = 'isolation=snapshot'
+    conn_config = 'in_memory=false'
+
+    # Column store is the case under test; the integer-keyed row store scenario is a
+    # sanity check that the test logic itself is valid.
+    key_format_values = [
+        ('column', dict(key_format='r')),
+        ('integer_row', dict(key_format='i')),
+    ]
+
+    scenarios = make_scenarios(key_format_values)
+
+    def test_rollback_to_stable24(self):
+        # Create a table without logging.
+        uri = "table:rollback_to_stable24"
+        table_config = 'key_format={},value_format=S'.format(self.key_format)
+        self.session.create(uri, table_config + ', log=(enabled=false)')
+
+        # Pin oldest timestamp to 10.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10))
+
+        # Start stable timestamp at 10.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+
+        value_a = "aaaaa" * 100
+        value_b = "bbbbb" * 100
+        value_c = "ccccc" * 100
+        value_d = "ddddd" * 100
+
+        s = self.conn.open_session()
+        cursor = s.open_cursor(uri)
+
+        # Write some keys at time 10. Keys 1..3 share a value so they can form one RLE cell.
+        s.begin_transaction()
+        cursor[1] = value_a
+        cursor[2] = value_a
+        cursor[3] = value_a
+        cursor[4] = value_b
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(10))
+
+        # Update key 4 at time 50.
+        s.begin_transaction()
+        cursor[4] = value_c
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(50))
+
+        cursor.close()
+
+        # Evict the page to force reconciliation. The evict cursor is opened on the same
+        # session that runs the transaction, so the read below happens inside it.
+        evict_cursor = s.open_cursor(uri, None, "debug=(release_evict)")
+        s.begin_transaction()
+        # Search the key to evict it.
+        v = evict_cursor[1]
+        self.assertEqual(v, value_a)
+        self.assertEqual(evict_cursor.reset(), 0)
+        s.rollback_transaction()
+        evict_cursor.close()
+
+        # Now update key 1 at time 30.
+        cursor = s.open_cursor(uri)
+        s.begin_transaction()
+        cursor[1] = value_d
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(30))
+        cursor.close()
+
+        # Roll back to 40. This must discard the update at 50 and keep the one at 30.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40))
+        self.conn.rollback_to_stable()
+
+        # Now read at 40.
+        cursor = s.open_cursor(uri)
+        s.begin_transaction('read_timestamp=' + self.timestamp_str(40))
+        self.assertEqual(cursor[1], value_d)
+        self.assertEqual(cursor[2], value_a)
+        self.assertEqual(cursor[3], value_a)
+        self.assertEqual(cursor[4], value_b)
+        s.rollback_transaction()
+        cursor.close()
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py
new file mode 100644
index 00000000000..2d800a17d32
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import make_scenarios, filter_scenarios
+
+# test_rollback_to_stable25.py
+# Check various scenarios relating to RLE cells in column-store.
+#
+# We write at three different timestamps:
+# 10 - aaaaaa or none
+# 20 - bbbbbb or delete or none
+# 30 - cccccc or delete or none
+#
+# and we evict to push things to disk after any of these,
+# and we roll back to either 15 or 25.
+#
+# The writes can be either uniform, heterogeneous, first key, middle key, or last key.
+#
+# We do this with a group of 5 keys 2..6. Keys 1 and 7 are written with zzzzzz at
+# timestamp 5 and evicted to ensure that the group of keys we're using is isolated
+# from other unused keys.
+#
+# This generates a lot of cases, but we filter pointless combinations and they run fast.
+
+# Put these bits outside the class definition so they can be referred to both in class
+# instances and in the scenario setup logic, which doesn't have a class instance yet.
+
+my_rle_size = 5
+
+def keys_of_write(write):
+    # Map a write-pattern code to the keys it touches. The group of interest occupies keys
+    # 2 .. 2 + my_rle_size - 1: 'u' and 'h' cover the whole group, 'f' only the first key,
+    # 'm' only the middle key, and any other code only the last key.
+    first_key = 2
+    last_key = first_key + my_rle_size - 1
+    if write in ('u', 'h'):
+        return range(first_key, last_key + 1)
+    if write == 'f':
+        return [first_key]
+    if write == 'm':
+        return [first_key + my_rle_size // 2]
+    return [last_key]
+
+class test_rollback_to_stable25(wttest.WiredTigerTestCase):
+    # Sessions run at snapshot isolation and the connection is explicitly not in-memory.
+    session_config = 'isolation=snapshot'
+    conn_config = 'in_memory=false'
+
+    # For each of the timestamps 10, 20 and 30 a scenario picks:
+    #   write_NN - which keys are written (see keys_of_write): 'u' and 'h' write every key
+    #              ('h' gives each key a distinct value), 'f'/'m'/'l' write the first,
+    #              middle or last key only;
+    #   type_NN  - the operation: None (no write at all), 'upd' (update) or 'del' (remove).
+    write_10_values = [
+        ('10u', dict(write_10='u')),
+        ('10h', dict(write_10='h')),
+        ('10f', dict(write_10='f')),
+        ('10m', dict(write_10='m')),
+        ('10l', dict(write_10='l')),
+    ]
+    type_10_values = [
+        ('nil', dict(type_10=None)),
+        ('upd', dict(type_10='upd')),
+    ]
+
+    write_20_values = [
+        ('20u', dict(write_20='u')),
+        ('20h', dict(write_20='h')),
+        ('20f', dict(write_20='f')),
+        ('20m', dict(write_20='m')),
+        ('20l', dict(write_20='l')),
+    ]
+    type_20_values = [
+        ('nil', dict(type_20=None)),
+        ('upd', dict(type_20='upd')),
+        ('del', dict(type_20='del')),
+    ]
+
+    write_30_values = [
+        ('30u', dict(write_30='u')),
+        ('30h', dict(write_30='h')),
+        ('30f', dict(write_30='f')),
+        ('30m', dict(write_30='m')),
+        ('30l', dict(write_30='l')),
+    ]
+    type_30_values = [
+        ('nil', dict(type_30=None)),
+        ('upd', dict(type_30='upd')),
+        ('del', dict(type_30='del')),
+    ]
+
+    # The timestamp after whose writes the page is evicted.
+    evict_time_values = [
+        ('chk10', dict(evict_time=10)),
+        ('chk20', dict(evict_time=20)),
+        ('chk30', dict(evict_time=30)),
+    ]
+
+    # The stable timestamp to roll back to.
+    rollback_time_values = [
+        ('roll15', dict(rollback_time=15)),
+        ('roll25', dict(rollback_time=25)),
+    ]
+
+    # Scenario filter. This is a plain function (no self): it is called while the class
+    # body is being evaluated, before any instance exists. Returns True to keep a scenario.
+    def is_meaningful(name, vals):
+        # The last write at evict time should be uniform, to get an RLE cell.
+        if vals['evict_time'] == 10 and vals['write_10'] != 'u':
+            return False
+        if vals['evict_time'] == 20 and vals['write_20'] != 'u':
+            return False
+        if vals['evict_time'] == 30 and vals['write_30'] != 'u':
+            return False
+        # If the type is nil nothing is written, so the write pattern is irrelevant; keep
+        # only the uniform variant to avoid running identical scenarios.
+        if vals['type_10'] is None and vals['write_10'] != 'u':
+            return False
+        if vals['type_20'] is None and vals['write_20'] != 'u':
+            return False
+        if vals['type_30'] is None and vals['write_30'] != 'u':
+            return False
+        # Similarly, delete and heterogeneous doesn't make sense. Each check must pair the
+        # type and the write pattern of the same timestamp.
+        if vals['type_10'] == 'del' and vals['write_10'] == 'h':
+            return False
+        if vals['type_20'] == 'del' and vals['write_20'] == 'h':
+            return False
+        if vals['type_30'] == 'del' and vals['write_30'] == 'h':
+            return False
+        # Both 10 and 20 shouldn't be nil. That's equivalent to 10 and 30 being nil.
+        if vals['type_10'] is None and vals['type_20'] is None:
+            return False
+
+        # Avoid cases that delete nonexistent values: replay the three writes in order,
+        # tracking which keys exist, and report whether any delete hits a missing key.
+        def deletes_nonexistent():
+            present = {k: False for k in range(2, 2 + my_rle_size)}
+            steps = (
+                (vals['type_10'], vals['write_10']),
+                (vals['type_20'], vals['write_20']),
+                (vals['type_30'], vals['write_30']),
+            )
+            for ty, write in steps:
+                if ty is None:
+                    continue
+                for k in keys_of_write(write):
+                    if ty == 'upd':
+                        present[k] = True
+                    elif ty == 'del':
+                        if not present[k]:
+                            return True
+                        present[k] = False
+            return False
+
+        return not deletes_nonexistent()
+
+    scenarios = filter_scenarios(make_scenarios(write_10_values, type_10_values,
+                                                write_20_values, type_20_values,
+                                                write_30_values, type_30_values,
+                                                evict_time_values,
+                                                rollback_time_values),
+                                 is_meaningful)
+
+    # Value stored in the two bookend keys on either side of the group.
+    value_z = "zzzzz" * 10
+
+    # Apply one timestamped write step on session s and mirror it into the expected dict.
+    #   ty    - None (do nothing), 'upd', or anything else meaning delete
+    #   write - write-pattern code passed to keys_of_write
+    #   value - base value; for the 'h' pattern the key number is appended to it
+    #   ts    - commit timestamp
+    def writes(self, uri, s, expected, ty, write, value, ts):
+        if ty is None:
+            # do nothing at all
+            return
+        cursor = s.open_cursor(uri)
+        s.begin_transaction()
+        for k in keys_of_write(write):
+            if ty == 'upd':
+                myval = value + str(k) if write == 'h' else value
+                cursor[k] = myval
+                expected[k] = myval
+            else:
+                cursor.set_key(k)
+                cursor.remove()
+                del expected[k]
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+        cursor.close()
+
+    # Read both bookend keys through a release_evict debug cursor on session s.
+    def evict(self, uri, s):
+        # Evict the page to force reconciliation.
+        evict_cursor = s.open_cursor(uri, None, "debug=(release_evict)")
+        s.begin_transaction()
+        # Search the key to evict it. Use both bookends.
+        v = evict_cursor[1]
+        self.assertEqual(v, self.value_z)
+        v = evict_cursor[2 + my_rle_size]
+        self.assertEqual(v, self.value_z)
+        self.assertEqual(evict_cursor.reset(), 0)
+        s.rollback_transaction()
+        evict_cursor.close()
+
+    # Read at timestamp ts and verify that the bookends are intact, that every key in
+    # expected has its expected value, and that every other key of the group is not found.
+    def check(self, uri, s, ts, expected):
+        cursor = s.open_cursor(uri)
+        s.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
+        # endpoints should still be in place
+        self.assertEqual(cursor[1], self.value_z)
+        self.assertEqual(cursor[2 + my_rle_size], self.value_z)
+
+        for k in range(2, 2 + my_rle_size):
+            if k in expected:
+                self.assertEqual(cursor[k], expected[k])
+            else:
+                cursor.set_key(k)
+                r = cursor.search()
+                self.assertEqual(r, wiredtiger.WT_NOTFOUND)
+        s.rollback_transaction()
+        cursor.close()
+
+    def test_rollback_to_stable25(self):
+        # Create a table without logging.
+        uri = "table:rollback_to_stable25"
+        table_config = 'key_format=r,value_format=S'
+        self.session.create(uri, table_config + ', log=(enabled=false)')
+
+        # Pin oldest timestamp to 5.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5))
+
+        # Start stable timestamp at 5.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(5))
+
+        value_a = "aaaaa" * 10
+        value_b = "bbbbb" * 10
+        value_c = "ccccc" * 10
+
+        s = self.conn.open_session()
+
+        # Write the endpoints at time 5.
+        cursor = s.open_cursor(uri)
+        s.begin_transaction()
+        cursor[1] = self.value_z
+        cursor[2 + my_rle_size] = self.value_z
+        s.commit_transaction('commit_timestamp=' + self.timestamp_str(5))
+        self.evict(uri, s)
+        cursor.close()
+
+        # Do writes at time 10. A snapshot of the expected contents is kept after each step.
+        expected = {}
+        self.writes(uri, s, expected, self.type_10, self.write_10, value_a, 10)
+        expected10 = expected.copy()
+
+        # Evict at time 10 if requested.
+        if self.evict_time == 10:
+            self.evict(uri, s)
+
+        # Do more writes at time 20.
+        self.writes(uri, s, expected, self.type_20, self.write_20, value_b, 20)
+        expected20 = expected.copy()
+
+        # Evict at time 20 if requested.
+        if self.evict_time == 20:
+            self.evict(uri, s)
+
+        # Do still more writes at time 30.
+        self.writes(uri, s, expected, self.type_30, self.write_30, value_c, 30)
+        expected30 = expected.copy()
+
+        # Evict at time 30 if requested.
+        if self.evict_time == 30:
+            self.evict(uri, s)
+
+        # Now roll back.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(self.rollback_time))
+        self.conn.rollback_to_stable()
+
+        # Writes newer than the rollback time are gone, so reads at those later timestamps
+        # should see the newest surviving state instead.
+        if self.rollback_time < 20:
+            expected20 = expected10
+            expected30 = expected10
+        elif self.rollback_time < 30:
+            expected30 = expected20
+
+        # Now make sure we see what we expect.
+        self.check(uri, s, 10, expected10)
+        self.check(uri, s, 20, expected20)
+        self.check(uri, s, 30, expected30)