diff options
author | Luke Chen <luke.chen@mongodb.com> | 2021-08-26 13:54:52 +1000 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2021-08-26 13:54:52 +1000 |
commit | ba6c7287e5ad13a65ef35e2468694219160775ce (patch) | |
tree | 4d8ca6a8837fa82fe21e4eab430eb887becbd38c | |
parent | e15a429d9c1c2c6d5fe5d186b866ae7d8e7c6e60 (diff) | |
download | mongo-ba6c7287e5ad13a65ef35e2468694219160775ce.tar.gz |
Import wiredtiger: c6ea0d18b5bcd7a6e7d91eece81ada238904c80e from branch mongodb-5.0
ref: b385c98487..c6ea0d18b5
for: 5.0.3
WT-6908 Write "cache" subpage for Architecture Guide
WT-6911 Write "block manager" subpage for Architecture Guide
WT-7005 Write "session" subpage for Architecture Guide
WT-7006 Write Connection subpage for Architecture Guide
WT-7905 Fix incorrect builtin behaviour for builds in CMake
WT-7909 Create a new method to check for running user transactions before starting rollback-to-stable operation
WT-7917 Add evergreen validation to s_all
WT-7931 Evicting modifies using the evict cursor in test_multiple_older_readers_with_multiple_mixed_mode() to ensure that eviction happens.
WT-7941 Add an Evergreen task to test abort/recovery using test/format
WT-7964 Fix rollback to stable incorrectly not rolling back updates at snap_max
WT-7965 Update connection base write generation number at the end of recovery checkpoint
WT-7970 Set the stable timestamp before starting the checkpointer and clock threads
WT-7974 More column-store fixes and tests
WT-7984 Fix a bug that could cause a checkpoint to omit a page of data
WT-7995 Fix the global visibility that it cannot go beyond checkpoint visibility
WT-7998 Minor fixes on Cache subpage of Architecture Guide
41 files changed, 1448 insertions, 364 deletions
diff --git a/src/third_party/wiredtiger/dist/docs_data.py b/src/third_party/wiredtiger/dist/docs_data.py index 831cdc61f30..2d0ba1e6068 100644 --- a/src/third_party/wiredtiger/dist/docs_data.py +++ b/src/third_party/wiredtiger/dist/docs_data.py @@ -20,13 +20,18 @@ arch_doc_pages = [ ['src/include/block.h', 'src/include/block_inline.h', 'src/block/']), ArchDocPage('arch-cache', - ['WT_CACHE', 'WT_CACHE_POOL'], - ['src/include/cache.h', 'src/include/cache_inline.h']), + ['WT_CACHE', 'WT_CACHE_POOL', 'WT_COL', 'WT_COL_RLE', 'WT_INSERT', 'WT_PAGE', + 'WT_PAGE_MODIFY', 'WT_REF', 'WT_ROW', 'WT_UPDATE'], + ['src/include/btmem.h', 'src/include/cache.h', 'src/include/cache_inline.h', + 'src/conn/conn_cache.c', 'src/conn/conn_cache_pool.c']), ArchDocPage('arch-checkpoint', ['WT_CONNECTION'], ['src/block/block_ckpt.c', 'src/block/block_ckpt_scan.c', 'src/conn/conn_ckpt.c', 'src/meta/meta_ckpt.c', 'src/txn/txn_ckpt.c']), + ArchDocPage('arch-connection', + ['WT_CONNECTION'], + ['src/include/connection.h']), ArchDocPage('arch-cursor', ['WT_CURSOR', 'WT_CURSOR_BACKUP', 'WT_CURSOR_BTREE', 'WT_CURSOR_BULK', 'WT_CURSOR_DATA_SOURCE', 'WT_CURSOR_DUMP', 'WT_CURSOR_INDEX', @@ -78,6 +83,9 @@ arch_doc_pages = [ ['src/include/intpack_inline.h', 'src/include/packing_inline.h', 'src/include/schema.h', 'src/lsm/', 'src/packing/', 'src/schema/']), + ArchDocPage('arch-session', + ['WT_SESSION'], + ['src/include/session.h']), ArchDocPage('arch-snapshot', ['WT_TXN'], ['src/include/txn.h']), diff --git a/src/third_party/wiredtiger/dist/s_all b/src/third_party/wiredtiger/dist/s_all index 50d1909eea0..4c39bc8e321 100755 --- a/src/third_party/wiredtiger/dist/s_all +++ b/src/third_party/wiredtiger/dist/s_all @@ -47,6 +47,12 @@ errchk() return fi + # Return if evergreen validate runs successfully. 
+ if echo "$1" | grep -q evergreen && [ "$(cat "$2")" = "../test/evergreen.yml is valid" ] ; then + rm -f "$2" + return + fi + echo "####################### MESSAGE ############################" echo "s_all run of: \"$1\" resulted in:" sed -e 's/^/ /' $2 @@ -81,6 +87,9 @@ run "./s_clang-format" run "python prototypes.py" run "sh ./s_typedef -b" run "python test_tag.py" +if command -v evergreen > /dev/null; then + run "evergreen validate ../test/evergreen.yml" +fi COMMANDS=" 2>&1 ./s_define > ${t_pfx}s_define diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 52e281c34f8..d19319d772a 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.0", - "commit": "b385c984870fec2d693dece8e79876fd5e8bf867" + "commit": "c6ea0d18b5bcd7a6e7d91eece81ada238904c80e" } diff --git a/src/third_party/wiredtiger/src/docs/arch-block.dox b/src/third_party/wiredtiger/src/docs/arch-block.dox index 7c8fdf5d72b..1738d2fdb9b 100644 --- a/src/third_party/wiredtiger/src/docs/arch-block.dox +++ b/src/third_party/wiredtiger/src/docs/arch-block.dox @@ -1,9 +1,248 @@ /*! @arch_page arch-block Block Manager -The Block Manager manages the reading and writing of disk blocks -in WiredTiger. It does compression and encryption when these -are configured. +The WiredTiger block manager subsystem manages the reading and writing of data +from the disk. It is designed to facilitate high performance, economic use of +disk space and customizability. + +@section block What is a block? + +A block is a chunk of data that is stored on the disk and operated on as a +single unit. Each WiredTiger data file (any file in the home directory with the +\c .wt suffix) is made up of these blocks. Each block consists of a page header, +a block header and contains a single page of the btree from which it was +generated. 
WiredTiger is a no-overwrite storage engine, and when blocks are +re-written, they are written to new locations in the file. The size of a block +is a multiple of the allocation size which is set during creation of the +associated WiredTiger data file see: WT_SESSION::create. + +Once a block is written an address cookie is returned. This address cookie is +stored as the \c addr on the associated page ref. The \c WT_REF structure can +be found in \c btmem.h. The address cookie is opaque to other parts of the +system and cannot be interpreted meaningfully. + +The address cookie is made up of 4 components: + - offset: The offset in the file. In order to avoid storing large offsets this + value is divided by the allocation size. + - file_id: Optional and only relevant to the tiered storage type, the file_id + is maintained in the address. + - size: The size of the block, also divided by the allocation size. + - checksum: The checksum of the block for validation purposes. + +The block header contains the following fields: + - size: The size of the block on disk, used when salvaging data from a corrupt + file. + - checksum: The checksum of the block, again used for salvaging. + - flags: Flags set on the block itself. + - padding + +The page header is not described in this document but can be found in +\c btmem.h. + +@section block_implementation Block manager implementation details + +@subsection write_once Writing + +The block manager decides where in the file a block will be written. It has two +forms of writing modes, "first fit" and "best fit". The default behavior is best +fit. While operating in best fit mode the block manager will search a skip list +of extents sorted by size, returning either an exact match or the next largest. +This is done to avoid fragmenting the file when possible. In first fit mode the +block manager will place the newly created block in the first available extent. +First fit mode is used for all root pages. 
+ +Additionally the block manager is a no-overwrite system. As such once a block is +written it cannot be modified. This is for crash recovery reasons, because if +the system were to crash during an overwrite the block state would be unknown. +This doesn't mean that the associated page cannot be modified, once the +associated page is modified a subsequent reconciliation will result in a new +block being created. + +@subsection desc_block Descriptor blocks + +A file is divided up into blocks. The first block in a file is special as it +contains metadata about the file and is referred to as the "descriptor block". +It contains the WiredTiger major and minor version, a checksum of the block +contents as well as a "magic" number to check against. + +The descriptor block serves as a safety check to ensure that the file being +loaded into the block manager is actually a WiredTiger data file, that it +belongs to a compatible version of WiredTiger and that the entire file has not +been corrupted. WiredTiger also uses checksums to defend against file corruption +which is described in the @ref checksum section. + +@subsection block_lists Extent lists + +Internally, the block manager uses a data structure called an extent list or a +\c WT_EXTLIST to track file usage. An extent list consists of a series of +extents (or \c WT_EXT elements). Each extent uses a file offset and size to +track a portion of the file. + +There are three extent lists that are maintained per checkpoint: + +- \c alloc: The file ranges allocated for a given checkpoint. +- \c avail: The file ranges that are unused and available for allocation. +- \c discard: The file ranges freed in the current checkpoint. + +The alloc and discard extent lists are maintained as a skiplist sorted by file +offset. The avail extent list also maintains an extra skiplist sorted by the +extent size to aid with allocating new blocks. 
+ +@section configuration Configuration options + +There are a number of configuration options that affect the block manager's +behavior. This does not aim to be an exhaustive list, however, these are the +configuration options that are more commonly of interest to developers. + +All of the configuration options below are passed into the \c WT_SESSION::create +API at the time of file creation. + +@subsection alloc_size Allocation size + +The allocation_size configuration controls the file unit allocation_size. +Any blocks allocated by the block manager must be a multiple of this value. + +For example, if we specify an allocation_size of \c 4KB, blocks of size +\c 8KB and \c 12KB would be permitted but NOT \c 10KB. The allocation_size +is set to \c 4KB by default which is a good choice unless the OS or storage +device has special requirements. + +@subsection checksum Checksum + +The \c checksum "on" configuration can be provided during creation of the file. +This configuration instructs the block manager to checksum the full length of +the buffer provided to be written into the block. By default it is enabled. +When disabled the block manager still does perform a checksum operation but only +the first 64 bytes of the buffer are included. + +The checksum is used when reading blocks to validate their contents, it +is compared with the checksum extracted from the address cookie and it is +compared with a checksum generated from the buffer that was held in the block +being read. In both cases the checksum has to match. + +There are other options that can be provided for this configuration option, +they are not discussed here. + +@section block_usage How WiredTiger uses the block manager + +@subsection creation File creation and the block manager + +When a new file is created in WiredTiger via WT_SESSION::create, the file is +created on disk and the associated \c allocation_size is written out to the +metadata file. 
However the block manager itself only exists on the btree +structure and is allocated when opening a closed btree. + +@subsection read Reading files and pages + +When an existing btree is opened for the first time, the location of the root +block is contained in the metadata file \c WiredTiger.wt. The block manager will +read the block at the location specified and return the page image as a buffer +to the layer above. This will then be instantiated as a page in memory. + +From there subsequent page addresses can be read from the root page and the +process repeated as required. If a cursor traverses to a page which hasn't been +read into memory the same process will take place. + +@subsection Writing + +Two cases exist for writing out data using the block manager: checkpoint and +eviction. When a page image is written out, the block manager's \c bm->write +API is called. See \c bt_io.c for more detail. + +@subsubsection Checkpoint + +For details on checkpoint at the WiredTiger level see: @ref arch-checkpoint. + +At the block manager level, a checkpoint corresponds with a set of blocks that +are stored in the file associated with the given URI. Typically each file will +contain a minimum of two checkpoints. Upon opening an existing file the most +recent checkpoint is read. + +During a checkpoint new blocks are only written out for dirty pages. A block can +be included in multiple checkpoints. Assuming a page \c X is dirty and gets +checkpointed in checkpoint \c A, it will be created as a new block on disk. Now +the same page \c X isn't modified and another checkpoint is taken. The page is +clean and as such will not require a new block to be written for it. The address +of the original block is still valid. + +Checkpoints are created in depth first order, leaf blocks are created, then +the parent blocks. This is a requirement as the parent blocks contain the +addresses of the leaf blocks. 
+ +The block manager doesn't guarantee that calling \c bm->write will result in +the data being flushed to disk. In the checkpoint scenario WiredTiger will also +call \c bm->sync once all blocks have been written which will call the file +system dependent flush function. + +* Checkpoint deletion and merging * + +As a checkpoint progresses it takes a snapshot of the three extent lists kept +by the block manager, these extent lists are written out to disk as part of +the checkpoint in blocks. Between checkpoints these extent lists are being +updated via normal operation of WiredTiger. + +Suppose we have a checkpoint \c A, which has an alloc list which contains 3 +blocks \c I, \c J, \c K as such its extent lists are as follows: + +Alloc: \c I, \c J, \c K +Avail: \c L, \c M +Discard: Empty + +A second checkpoint \c B completes and has removed a page which corresponds with +block \c J, it also has allocated an additional block \c L. + +Checkpoint \c B's extent lists are as follows: +Alloc: \c L +Avail: \c M +Discard: \c J + +Finally we complete a 3rd checkpoint \c C which allocates an additional block +\c M. Upon completion of this checkpoint we are able to remove checkpoint \c A, +to do that, the block manager will merge checkpoint \c A's extent lists into +checkpoint \c B's. + +What's important here is that if a block appears in both the alloc list and +the discard list it can be freed which means it goes on the avail list. + +Which gives us the following lists for checkpoint \c C: +Alloc: \c M +Avail: Empty +Discard: Empty + +And the following lists for checkpoint \c B: +Alloc: \c I, \c K, \c L +Avail: \c J +Discard: Empty + +These extent lists are written out with the checkpoint \c C. Anything on the +avail list is considered free space and can be reused as of the completion +of checkpoint \c C. 
+ +We don't want to list each block individually in the extent lists, so instead of +listing each block separately in the list, we use extents, which can describe a +range in the file, that is, any number of contiguous blocks. + +@subsubsection Eviction + +For more detail on how WiredTiger eviction works see: @ref arch-eviction. + +Eviction also utilizes the block manager. When a page is evicted and contains +data that needs to be maintained, logically a block needs to be written. +Eviction calls \c bm->write however it does not instruct the block manager to +sync the data. + +@subsection Compaction + +As new blocks are written, the block manager will place them where they fit best. +Because of this it's common that removal of data will not result in the file +shrinking. The file can only be shrunk when there are available blocks at the +end of the file. + +To manage this, WiredTiger provides a compaction API call WT_SESSION::compact. +The block manager operates in first fit mode during compaction to maximize block +movement towards the beginning of the file. WiredTiger walks the btree and asks +the block manager if relocating that page will reduce the file size. If so, the +page is marked dirty, forcing the block to be rewritten. WiredTiger then +performs two checkpoints, as at least two checkpoints are required to delete the +checkpoint originally containing the block. -The state of the block manager is represented by the \c WT_BM structure. -Individual blocks being tracked are in \c WT_BLOCK structures. */ diff --git a/src/third_party/wiredtiger/src/docs/arch-cache.dox b/src/third_party/wiredtiger/src/docs/arch-cache.dox index 94888260cfe..716c4966d45 100644 --- a/src/third_party/wiredtiger/src/docs/arch-cache.dox +++ b/src/third_party/wiredtiger/src/docs/arch-cache.dox @@ -1,13 +1,118 @@ /*! @arch_page arch-cache Cache -Cache in WiredTiger is represented by the various shared data structures -that make up in-memory Btrees and subordinate data structures. 
+The WiredTiger cache is memory used to hold copies of recently accessed or modified data. +WiredTiger reads Btree pages into the cache on demand. When the cache runs low on space, Eviction +removes unneeded pages. Updates modify data in the cache and +are flushed to storage asynchronously, either by @ref arch-checkpoint "Checkpoint" or +@ref arch-eviction "Eviction". -Memory used to read in and write out the on-disk representations of Btrees -is not cached, it only exists temporarily during the I/O operation and -while the data is transferred to or from the on-disk format. +The page layout in the WiredTiger cache is optimized for fast, concurrent access by multiple +application threads. In contrast, WiredTiger organizes pages in storage to minimize storage space. +As a result, WiredTiger has to convert between the in-memory and on-storage representations of a +page whenever it reads or writes the page. -Internally, the current cache state is represented by the WT_CACHE structure, -which contains various counters that drive statistics and information -used for eviction. +@section arch_cache_basics Basic operation + +Cached Btree pages point to each other, mirroring the structure of the on-disk Btree. +When WiredTiger opens a file, it loads the root page of the Btree into memory along with the first +level of internal pages. To look up an entry in a Btree, WiredTiger starts from the root page +and searches the Btree until it finds the entry. If WiredTiger encounters a page that is not in +memory, it loads that page from storage and continues the search. + +To load a page into the cache, WiredTiger passes the page's address cookie to the +@ref arch-block "Block Manager" +and gets back a buffer containing the corresponding block from the underlying file. +If necessary, WiredTiger decrypts and decompresses the block. Then it allocates indexing +structures to facilitate quick binary search of the keys in the page. 
The first time WiredTiger +needs to modify or insert an entry on a page, it allocates additional structures to track these +changes. + +WiredTiger tracks the total amount of data in the cache. It also tracks the space used by +_clean_ (unmodified) pages and by _dirty_ (modified) pages. When the cache becomes too +full or contains too much dirty data, WiredTiger invokes @ref arch-eviction "Eviction" to +remove data from the +cache. To remove a clean page from the cache, WiredTiger simply frees the page's memory. +To remove a dirty page, WiredTiger must first _reconcile_ the page (converting it from +in-memory format to on-disk format) and then write it to storage. + +@section arch_cache_structure Cache structure + +Internally, WiredTiger's cache state is represented by the \c WT_CACHE structure, which contains +counters and parameter settings for tracking cache usage and controlling eviction policy. +The \c WT_CACHE also includes state WiredTiger uses to track the progress of eviction. There +is a single \c WT_CACHE for each connection, accessed via the \c WT_CONNECTION_IMPL structure. + +Each page in the cache is accessed via a \c WT_REF structure. When WiredTiger opens a Btree, +it places a \c WT_REF for the cached root page in the corresponding \c WT_BTREE structure. +A \c WT_REF can represent either +a page in the cache or one that has not been loaded yet. +The page itself is represented by a \c WT_PAGE structure. This includes a pointer to a buffer +that contains the on-disk page image (decrypted and uncompressed). It also holds the supplemental +structures that WiredTiger uses to access and update the page while it is cached. + +When WiredTiger loads a page into the cache, it allocates an internal table with one entry +for each entry on the page. The type and content of these entries depend on the page type. An +internal Btree page will have an array of \c WT_REF structures. 
A row-store leaf page will have +an array of \c WT_ROW structures representing the KV pairs stored on the page. A variable-length +column-store leaf page will have an array of \c WT_COL structures along with a parallel array +of \c WT_COL_RLE structures indicating run lengths for items that are repeated more than once +on the page. Both of these leaf page formats support binary search to quickly find an entry. +In a fixed-length column-store leaf page, values will be packed into a simple byte array, allowing +WiredTiger to access entries using bit operations based on the value length. + +The first time an entry on a leaf page is inserted or modified, WiredTiger adds a +\c WT_PAGE_MODIFY structure to the corresponding \c WT_PAGE in the cache. For a row-store leaf +page the \c WT_PAGE_MODIFY tracks changes using an array of \c WT_UPDATE pointers with one element +for each +KV pair on the leaf page. When WiredTiger updates an entry, it inserts a \c WT_UPDATE in +this array. If there are multiple updates to the same item, WiredTiger chains them together +in a linked list. When a record is deleted, +WiredTiger adds an update with a special tombstone value. WiredTiger stores newly inserted +elements in a similar array of skip lists represented by \c WT_INSERT structures. There is a +separate skiplist for the gap between each pair of keys on the page, as well as skiplists for +the gaps between the beginning and end of the page and the first and last keys, respectively. + +For a column-store leaf page the \c WT_PAGE_MODIFY structure tracks changes using a pair of +skip lists, one for appended items and one for updated items. + +Almost all operations on these data structures are lock-free, allowing a high level of +concurrency in the cache. + +@section arch_cache_size Cache size and content + +The amount of memory used by the WiredTiger cache is controlled by the \c cache_size configuration +parameter, which defaults to 100 MB. 
(Note that MongoDB sets the cache size, by default, to be +half the size of RAM.) WiredTiger does not explicitly manage this memory, relying instead on +the C memory allocator to acquire and free memory as needed. Since the cache is +allocated from the heap, evicting data from the cache simply returns the memory to the allocator; +it does not reduce the application's memory footprint. + +The WiredTiger cache is only used for Btree data, including associated in-memory structures such +as indexes, insert lists, and update chains. Other WiredTiger data structures, such as +dhandles, cursors, and sessions, are not considered part of the cache and do not count against +the cache size. Similarly, memory used to read in and write out the on-disk representations of +Btree pages is not cached; it is only allocated temporarily during the I/O operation and +while the data is converted to or from the on-disk format. + +@section arch_cache_shared Shared caches + +WiredTiger supports sharing a single cache among multiple databases within a process. Normally +if a process opens connections to multiple different databases, each connection would use a +separate fixed-size cache. With a shared cache, WiredTiger dynamically partitions a fixed +amount of cache space between participating connections. + +When shared caching is enabled, WiredTiger creates a cache pool server thread to manage the +shared cache. It also allocates a global \c WT_CACHE_POOL structure, which stores settings +and statistics for the shared cache. These settings include a minimum and +maximum cache size for connections participating in the shared cache. + +The cache pool server thread wakes up periodically and adjusts the sizes of the individual +per-connection caches. 
Adjustments are based on a pressure metric for each cache computed +using a weighted average of the amount of data read into the cache (i.e., cache misses) +and how often application threads have evicted data from the cache or waited while +performing eviction. If a cache has higher pressure than average and is not yet at the maximum +size, WiredTiger grows that cache. Conversely, if a cache has low pressure, WiredTiger shrinks +it, subject to the minimum cache size. To change the size of a cache, the cache pool server +simply changes the cache size parameters in the corresponding \c WT_CACHE structure. WiredTiger's +eviction code will adjust the amount of data in the cache accordingly. */ diff --git a/src/third_party/wiredtiger/src/docs/arch-connection.dox b/src/third_party/wiredtiger/src/docs/arch-connection.dox new file mode 100644 index 00000000000..cdc3eeecb9a --- /dev/null +++ b/src/third_party/wiredtiger/src/docs/arch-connection.dox @@ -0,0 +1,60 @@ +/*! @arch_page arch-connection Connection + +@section arch_conn_def Definition + +A connection is a handle to a WiredTiger database instance. The connection has exclusive access to +the database through a file lock, hence only one connection can be opened at a time. Internally, a +connection is represented by \c WT_CONNECTION. + +@section arch_conn_lifecycle Life cycle + +@subsection arch_conn_init Initialization + +A connection is initialized when WT_CONNECTION::wiredtiger_open is called by the user application. +WT_CONNECTION::wiredtiger_open accepts a list of configuration items (see @ref database_config) that +can be used to enable different WiredTiger features and tune their behavior. Those features are for +example related to @ref arch-eviction, @ref arch-logging, @ref arch-checkpoint, @ref arch-cache, +statistics, etc. All the different available configuration settings are described in the +documentation for WT_CONNECTION::wiredtiger_open. 
+ +WT_CONNECTION::wiredtiger_open also performs different sanity checks depending on the configuration +item "create". When "create" is specified and a database does not already exist, a new database is +created along with specific WiredTiger files such as the turtle file and other metadata files. If a +database already exists, whether "create" is specified or not, WiredTiger will try to open it and +check for the existence of the different required WiredTiger files. If "create" is not specified, +WiredTiger expects a previously created database where it is executed. If the existing database is +corrupted and cannot be opened, either \c WT_RUN_RECOVERY or \c WT_TRY_SALVAGE error (see @ref +error_handling) is returned to the user application and the connection is not created. In this case, +a recovery operation will be required to bring the database to a consistent state (see @ref +command_line for more details) before a connection can be successfully established with the +database. + +Once the database has been successfully opened, internal worker threads are started to provide +global services used at runtime. Those services consist of different threads to handle statistics, +logging, eviction, checkpoint and cache management. The sweeping server that manages the active and +inactive dhandles is started too, see @ref arch-dhandle for more information. + +Finally, before the connection is completely initialized, the database is set to a consistent state +by running rollback to stable, see @ref arch-rts for more details. + +@subsection arch_conn_runtime Runtime + +At runtime, database-wide operations can be executed using the connection interface. For instance, +it is possible to reconfigure WiredTiger features and behavior using WT_CONNECTION::reconfigure +instead of closing the connection and calling WT_CONNECTION::open again. 
However, almost all CRUD +operations on the database are executed in the context of a session (see @ref arch-session) which +can be created using WT_CONNECTION::open_session. See the WT_CONNECTION:: documentation to discover +other available APIs related to WiredTiger connections. + +A connection also keeps track of global information, see \c WT_CONNECTION_IMPL defined in \c +connection.h. Finally, a \c WT_CONNECTION handle may be shared between threads, see @ref threads for +more information. + +@subsection arch_conn_closure Closure + +When a connection is no longer required, it can be closed using WT_CONNECTION::close. As a result, +any resource held by the connection (e.g., sessions) is freed unless configured differently and the +database is restored to a consistent state if necessary. It is worth noting that this final step +might take some time as it may involve running the rollback to stable operation. + +*/ diff --git a/src/third_party/wiredtiger/src/docs/arch-index.dox b/src/third_party/wiredtiger/src/docs/arch-index.dox index 3d916e675aa..906e6959be8 100644 --- a/src/third_party/wiredtiger/src/docs/arch-index.dox +++ b/src/third_party/wiredtiger/src/docs/arch-index.dox @@ -139,6 +139,10 @@ make up in-memory Btrees and subordinate data structures. A checkpoint is created by WiredTiger to serve as a point from which it can recover. +@subpage arch-connection + +A connection is a handle to a WiredTiger database instance. + @subpage arch-cursor Cursors are used to get and modify data. @@ -196,6 +200,10 @@ Rollback to stable to remove the unstable updates from the database. A schema defines the format of the application data in WiredTiger. +@subpage arch-session + +A session defines the context for most operations performed in WiredTiger. 
+ @subpage arch-snapshot Snapshots are implemented by storing transaction ids committed before diff --git a/src/third_party/wiredtiger/src/docs/arch-session.dox b/src/third_party/wiredtiger/src/docs/arch-session.dox new file mode 100644 index 00000000000..e17eca10b1b --- /dev/null +++ b/src/third_party/wiredtiger/src/docs/arch-session.dox @@ -0,0 +1,53 @@ +/*! @arch_page arch-session Session + +@section arch_session_def Definition +After a @ref arch-connection has been established between the application and WiredTiger, the +application can start sending requests to WiredTiger using a session. A session is internally +represented by WT_SESSION and plays an important role since almost all operations are performed +under the context of a session. + +A session can only be created through an existing connection with the API +WT_CONNECTION::open_session and it is possible to create multiple sessions through the same +connection. In fact, one connection can have multiple sessions but one session can only be +associated with one connection. The maximum number of sessions is set through the configuration item +\c session_max as part of the configuration string in ::wiredtiger_open. + +Sessions created by the calling application are called "user sessions". WiredTiger also performs +some internal operations such as @ref arch-eviction through self-created sessions. These sessions +are called "internal sessions". The usage rules and guidelines for both internal sessions and user +sessions are the same and the only difference between them is their origin of creation. + +@section arch_session_ops Operations +The different operations that can be performed on a WiredTiger session are related to cursors, +tables and transactions. You can read the complete description of each possible operation in the +documentation related to WT_SESSION. 
+ +@section arch_session_txn Transactions +It is possible to group several operations within a session; in other words, multiple operations can +be treated as a single atomic operation. This can be done using @ref arch-transaction. Furthermore, +a session can hold only one running transaction at any given time and this transaction only belongs +to that session. + +@section arch_session_cur Cursors +A session can perform multiple data operations on one or several collections using multiple cursors +(see @ref arch-cursor for more details). All the cursors associated with a session share that +session's transaction context. It is also possible to cache those cursors if required through the +configuration string given to WT_CONNECTION::open_session or ::wiredtiger_open. The configuration +item for this purpose is \c cache_cursors. + +@section arch_session_dhandles Data Handles +During its lifetime, a session can accumulate a list of data handles (see @ref arch-dhandle). +Indeed, when a session accesses a table for the first time, the data handle of that table is +acquired and cached. Once a session no longer needs to operate on a table, it marks the associated +data handle as idle. This helps the sweep server release data handles that are inactive, see @ref +arch-dhandle-lifecycle for more details. + +@section arch_session_closure Closure +A session can be closed using WT_SESSION::close. Closing the connection will also close all open +sessions. When a session is closed, it releases all the resources associated with it including +rolling back any active transaction and closing the cursors that are still open. + +@section arch_session_thread Multithreading +A session is always executed as a single thread, see @ref threads for more details.
+ +*/ diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok index 1d302409f2e..182a9af00f3 100644 --- a/src/third_party/wiredtiger/src/docs/spell.ok +++ b/src/third_party/wiredtiger/src/docs/spell.ok @@ -48,6 +48,7 @@ ECMA EINVAL ENCRYPTOR ENOTSUP +EXTLIST EmpId Encryptors Facebook @@ -180,7 +181,9 @@ bokeh bool boolean booleans +bm br +bt btmem btree btrees @@ -236,6 +239,7 @@ curhs cursortype curtable customerABC +customizability cv cyclomatic dN @@ -257,6 +261,7 @@ decrement decrementing decrypt decrypted +decrypts del desc destructor @@ -574,6 +579,7 @@ sess sid skinparam skiplist +skiplists sortable spinlock spinlocks @@ -596,6 +602,7 @@ subdatabases subdirectory subpage substring +subsubsection sudo superset svg diff --git a/src/third_party/wiredtiger/src/include/cell_inline.h b/src/third_party/wiredtiger/src/include/cell_inline.h index 2d91b9a8ee6..97b856e8a62 100644 --- a/src/third_party/wiredtiger/src/include/cell_inline.h +++ b/src/third_party/wiredtiger/src/include/cell_inline.h @@ -1045,24 +1045,6 @@ __cell_unpack_window_cleanup(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk } /* - * __cell_pack_kv_window_cleanup -- - * Clean up cells loaded from a previous run while writing to disk. - */ -static inline void -__cell_pack_kv_window_cleanup( - WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK_KV *unpack_kv) -{ - /* - * If the page came from a previous run, reset the transaction ids to "none" and timestamps to 0 - * as appropriate when the cell information is used for packing the new cell. - */ - if (F_ISSET(S2C(session), WT_CONN_RECOVERING) && - dsk->write_gen > S2BT(session)->base_write_gen && - dsk->write_gen < S2BT(session)->run_write_gen) - __cell_kv_window_cleanup(session, unpack_kv); -} - -/* * __wt_cell_unpack_addr -- * Unpack an address WT_CELL into a structure. 
*/ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 1892edd09a3..8061ce88008 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -39,6 +39,8 @@ extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_txn_active(WT_SESSION_IMPL *session, uint64_t txnid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_txn_user_active(WT_SESSION_IMPL *session) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern char *__wt_time_aggregate_to_string(WT_TIME_AGGREGATE *ta, char *ta_string) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern char *__wt_time_point_to_string(wt_timestamp_t ts, wt_timestamp_t durable_ts, @@ -1088,6 +1090,8 @@ extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_metadata_btree_id_to_uri(WT_SESSION_IMPL *session, uint32_t btree_id, char **uri) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_metadata_correct_base_write_gen(WT_SESSION_IMPL *session) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_metadata_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_metadata_cursor_close(WT_SESSION_IMPL *session) @@ -1966,6 +1970,8 @@ static inline bool __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id, wt_ti WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline bool __wt_txn_visible_id_snapshot(uint64_t id, uint64_t snap_min, uint64_t snap_max, + uint64_t *snapshot, uint32_t snapshot_count) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline double 
__wt_eviction_dirty_target(WT_CACHE *cache) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_btree_block_free(WT_SESSION_IMPL *session, const uint8_t *addr, diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 3258cafb29f..6b84061ff82 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -244,8 +244,8 @@ struct __wt_txn { /* * Snapshot data: + * ids >= snap_max are invisible, * ids < snap_min are visible, - * ids > snap_max are invisible, * everything else is visible unless it is in the snapshot. */ uint64_t snap_min, snap_max; diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h index 6d3b4e8fa55..b57f0d8203f 100644 --- a/src/third_party/wiredtiger/src/include/txn_inline.h +++ b/src/third_party/wiredtiger/src/include/txn_inline.h @@ -449,13 +449,10 @@ err: static inline uint64_t __wt_txn_oldest_id(WT_SESSION_IMPL *session) { - WT_BTREE *btree; WT_TXN_GLOBAL *txn_global; uint64_t checkpoint_pinned, oldest_id; - bool include_checkpoint_txn; txn_global = &S2C(session)->txn_global; - btree = S2BT_SAFE(session); /* * The metadata is tracked specially because of optimizations for checkpoints. @@ -467,10 +464,6 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) * Take a local copy of these IDs in case they are updated while we are checking visibility. 
*/ oldest_id = txn_global->oldest_id; - include_checkpoint_txn = - btree == NULL || (btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT)); - if (!include_checkpoint_txn) - return (oldest_id); /* * The read of the transaction ID pinned by a checkpoint needs to be carefully ordered: if a @@ -501,14 +494,11 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) static inline void __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp) { - WT_BTREE *btree; WT_TXN_GLOBAL *txn_global; wt_timestamp_t checkpoint_ts, pinned_ts; - bool include_checkpoint_txn; *pinned_tsp = WT_TS_NONE; - btree = S2BT_SAFE(session); txn_global = &S2C(session)->txn_global; /* @@ -520,19 +510,6 @@ __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp) *pinned_tsp = pinned_ts = txn_global->pinned_timestamp; /* - * Checkpoint transactions often fall behind ordinary application threads. Take special effort - * to not keep changes pinned in cache if they are only required for the checkpoint and it has - * already seen them. - * - * If there is no active checkpoint or this handle is up to date with the active checkpoint then - * it's safe to ignore the checkpoint ID in the visibility check. - */ - include_checkpoint_txn = - btree == NULL || (btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT)); - if (!include_checkpoint_txn) - return; - - /* * The read of checkpoint timestamp needs to be carefully ordered: it needs to be after we have * read the pinned timestamp and the checkpoint generation, otherwise, we may read earlier * checkpoint timestamp before the checkpoint generation that is read resulting more data being @@ -680,6 +657,37 @@ __wt_txn_tw_stop_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw) } /* + * __wt_txn_visible_id_snapshot -- + * Is the id visible in terms of the given snapshot? 
+ */ +static inline bool +__wt_txn_visible_id_snapshot( + uint64_t id, uint64_t snap_min, uint64_t snap_max, uint64_t *snapshot, uint32_t snapshot_count) +{ + bool found; + + /* + * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a + * concurrent transaction, that is, if was committed before the snapshot was taken. + * + * The order here is important: anything newer than or equal to the maximum ID we saw when + * taking the snapshot should be invisible, even if the snapshot is empty. + * + * Snapshot data: + * ids >= snap_max not visible, + * ids < snap_min are visible, + * everything else is visible unless it is found in the snapshot. + */ + if (WT_TXNID_LE(snap_max, id)) + return (false); + if (snapshot_count == 0 || WT_TXNID_LT(id, snap_min)) + return (true); + + WT_BINARY_SEARCH(id, snapshot, snapshot_count, found); + return (!found); +} + +/* * __txn_visible_id -- * Can the current transaction see the given ID? */ @@ -687,7 +695,6 @@ static inline bool __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) { WT_TXN *txn; - bool found; txn = session->txn; @@ -710,20 +717,8 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) /* Otherwise, we should be called with a snapshot. */ WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) || session->dhandle->checkpoint != NULL); - /* - * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a - * concurrent transaction, that is, if was committed before the snapshot was taken. - * - * The order here is important: anything newer than the maximum ID we saw when taking the - * snapshot should be invisible, even if the snapshot is empty. 
- */ - if (WT_TXNID_LE(txn->snap_max, id)) - return (false); - if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min)) - return (true); - - WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found); - return (!found); + return (__wt_txn_visible_id_snapshot( + id, txn->snap_min, txn->snap_max, txn->snapshot, txn->snapshot_count)); } /* diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index b4c7c933c62..d58d0149658 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -974,6 +974,40 @@ err: } /* + * __wt_metadata_correct_base_write_gen -- + * Update the connection's base write generation from all files in metadata at the end of the + * recovery checkpoint. + */ +int +__wt_metadata_correct_base_write_gen(WT_SESSION_IMPL *session) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + char *config, *uri; + + uri = NULL; + WT_RET(__wt_metadata_cursor(session, &cursor)); + while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_key(cursor, &uri)); + + if (!WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "tiered:")) + continue; + + WT_ERR(cursor->get_value(cursor, &config)); + + /* Update base write gen to the write gen. */ + WT_ERR(__wt_metadata_update_base_write_gen(session, config)); + } + WT_ERR_NOTFOUND_OK(ret, false); + +err: + if (ret != 0 && uri != NULL) + __wt_err(session, ret, "unable to correct write gen for %s", uri); + WT_TRET(__wt_metadata_cursor_release(session, &cursor)); + return (ret); +} + +/* * __wt_meta_ckptlist_to_meta -- * Convert a checkpoint list into its metadata representation.
*/ diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index 4ea57b8acc7..8000026c58b 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -744,7 +744,6 @@ record_loop: twp = &clear_tw; goto compare; } - __cell_pack_kv_window_cleanup(session, page->dsk, vpack); twp = &vpack->tw; /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index a72bc170245..99d887da573 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -808,10 +808,9 @@ __wt_rec_row_leaf( upd = upd_select.upd; /* Take the timestamp from the update or the cell. */ - if (upd == NULL) { - __cell_pack_kv_window_cleanup(session, page->dsk, vpack); + if (upd == NULL) twp = &vpack->tw; - } else + else twp = &upd_select.tw; /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 9ad305792b9..cf9d1be3175 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -159,6 +159,9 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u WT_DECL_RET; WT_PAGE *page; WT_RECONCILE *r; +#ifdef HAVE_DIAGNOSTIC + void *addr; +#endif btree = S2BT(session); page = ref->page; @@ -215,11 +218,17 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u F_ISSET(r, WT_REC_CALL_URGENT) && !r->update_used && r->cache_write_restore) ret = __wt_set_return(session, EBUSY); +#ifdef HAVE_DIAGNOSTIC + addr = ref->addr; +#endif /* Wrap up the page reconciliation. */ if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0) __rec_write_page_status(session, r); - else + else { + /* Make sure that reconciliation doesn't free the page that has been written to disk. 
*/ + WT_ASSERT(session, addr == NULL || ref->addr != NULL); WT_TRET(__rec_write_wrapup_err(session, r, page)); + } /* Release the reconciliation lock. */ *page_lockedp = false; @@ -1516,7 +1525,7 @@ err: * Initialize the page write generation number. */ static void -__rec_set_page_write_gen(WT_PAGE_HEADER *dsk, WT_BTREE *btree) +__rec_set_page_write_gen(WT_BTREE *btree, WT_PAGE_HEADER *dsk) { /* * We increment the block's write generation so it's easy to identify newer versions of blocks @@ -1553,7 +1562,7 @@ __rec_split_write_header(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK dsk->recno = btree->type == BTREE_ROW ? WT_RECNO_OOB : multi->key.recno; - __rec_set_page_write_gen(dsk, btree); + __rec_set_page_write_gen(btree, dsk); dsk->mem_size = multi->size; dsk->u.entries = chunk->entries; dsk->type = page->type; @@ -2088,6 +2097,22 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_TIME_AGGREGATE_INIT(&ta); /* + * If using the history store table eviction path and we found updates that weren't globally + * visible when reconciling this page, copy them into the database's history store. This can + * fail, so try before clearing the page's previous reconciliation state. + */ + if (F_ISSET(r, WT_REC_HS)) + WT_RET(__rec_hs_wrapup(session, r)); + + /* + * Wrap up overflow tracking. If we are about to create a checkpoint, the system must be + * entirely consistent at that point (the underlying block manager is presumably going to do + * some action to resolve the list of allocated/free/whatever blocks that are associated with + * the checkpoint). + */ + WT_RET(__wt_ovfl_track_wrapup(session, page)); + + /* * This page may have previously been reconciled, and that information is now about to be * replaced. Make sure it's discarded at some point, and clear the underlying modification * information, we're creating a new reality. 
@@ -2137,21 +2162,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Reset the reconciliation state. */ mod->rec_result = 0; - /* - * If using the history store table eviction path and we found updates that weren't globally - * visible when reconciling this page, copy them into the database's history store. - */ - if (F_ISSET(r, WT_REC_HS)) - WT_RET(__rec_hs_wrapup(session, r)); - - /* - * Wrap up overflow tracking. If we are about to create a checkpoint, the system must be - * entirely consistent at that point (the underlying block manager is presumably going to do - * some action to resolve the list of allocated/free/whatever blocks that are associated with - * the checkpoint). - */ - WT_RET(__wt_ovfl_track_wrapup(session, page)); - __wt_verbose(session, WT_VERB_RECONCILE, "%p reconciled into %" PRIu32 " pages", (void *)ref, r->multi_next); @@ -2367,7 +2377,7 @@ __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *k dsk = tmp->mem; memset(dsk, 0, WT_PAGE_HEADER_SIZE); dsk->type = WT_PAGE_OVFL; - __rec_set_page_write_gen(dsk, btree); + __rec_set_page_write_gen(btree, dsk); dsk->u.datalen = (uint32_t)kv->buf.size; memcpy(WT_PAGE_HEADER_BYTE(btree, dsk), kv->buf.data, kv->buf.size); dsk->mem_size = WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size; diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 35eb2e08d6f..e3ab3fecc16 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -139,6 +139,47 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) } /* + * __wt_txn_user_active -- + * Check whether there are any running user transactions. Note that a new transactions may start + * on a session we have already examined and the caller needs to be aware of this limitation. + * Exclude prepared user transactions from this check. 
+ */ +bool +__wt_txn_user_active(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *session_in_list; + uint32_t i, session_cnt; + bool txn_active; + + conn = S2C(session); + txn_active = false; + + /* + * No lock is required because the session array is fixed size, but it may contain inactive + * entries. We must review any active session, so insert a read barrier after reading the active + * session count. That way, no matter what sessions come or go, we'll check the slots for all of + * the user sessions for active transactions when we started our check. + */ + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, session_in_list = conn->sessions; i < session_cnt; i++, session_in_list++) { + /* Skip inactive sessions. */ + if (!session_in_list->active) + continue; + /* Check if a user session has a running transaction. Ignore prepared transactions. */ + if (F_ISSET(session_in_list->txn, WT_TXN_RUNNING) && + !F_ISSET(session_in_list, WT_SESSION_INTERNAL) && + !F_ISSET(session_in_list->txn, WT_TXN_PREPARE)) { + + txn_active = true; + break; + } + } + + return (txn_active); +} + +/* * __wt_txn_active -- * Check if a transaction is still active. If not, it is either committed, prepared, or rolled * back. It is possible that we race with commit, prepare or rollback and a transaction is still diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 1a7090cd2c6..c32cd43b0bc 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -922,6 +922,14 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) } /* + * As part of recovery, rollback to stable may have left out clearing stale transaction ids. + * Update the connection base write generation based on the latest checkpoint write generations + * to reset these transaction ids present on the pages when reading them. 
+ */ + if (F_ISSET(conn, WT_CONN_RECOVERING)) + WT_ERR(__wt_metadata_correct_base_write_gen(session)); + + /* * Clear the dhandle so the visibility check doesn't get confused about the snap min. Don't * bother restoring the handle since it doesn't make sense to carry a handle across a * checkpoint. diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 9f32e9346f0..c76e2af3597 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -548,39 +548,6 @@ err: } /* - * __recovery_correct_write_gen -- - * Update the connection's base write generation from all files in metadata. - */ -static int -__recovery_correct_write_gen(WT_SESSION_IMPL *session) -{ - WT_CURSOR *cursor; - WT_DECL_RET; - char *config, *uri; - - uri = NULL; - WT_RET(__wt_metadata_cursor(session, &cursor)); - while ((ret = cursor->next(cursor)) == 0) { - WT_ERR(cursor->get_key(cursor, &uri)); - - if (!WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "tiered:")) - continue; - - WT_ERR(cursor->get_value(cursor, &config)); - - /* Update base write gen to the write gen. */ - WT_ERR(__wt_metadata_update_base_write_gen(session, config)); - } - WT_ERR_NOTFOUND_OK(ret, false); - -err: - if (ret != 0 && uri != NULL) - __wt_err(session, ret, "unable to correct write gen for %s", uri); - WT_TRET(__wt_metadata_cursor_release(session, &cursor)); - return (ret); -} - -/* * __recovery_setup_file -- * Set up the recovery slot for a file, track the largest file ID, and update the base write gen * based on the file's configuration. @@ -1055,16 +1022,11 @@ done: WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); /* - * Rollback to stable may have left out clearing stale transaction ids. Update the connection - * base write generation based on the latest checkpoint write generations to reset them. 
- */ - if (rts_executed) - WT_ERR(__recovery_correct_write_gen(session)); - - /* * Update the open dhandles write generations and base write generation with the connection's * base write generation because the recovery checkpoint writes the pages to disk with new write - * generation number which contains transaction ids that are needed to reset later. + * generation number which contains transaction ids that are needed to reset later. The + * connection level base write generation number is updated at the end of the recovery + * checkpoint. */ __wt_dhandle_update_write_gens(session); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 3ec6bb95934..6004ddd3db2 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -281,50 +281,30 @@ err: } /* - * __rollback_check_if_txnid_non_committed -- - * Check if the transaction id is non committed. + * __rollback_txn_visible_id -- + * Check if the transaction id is visible or not. */ static bool -__rollback_check_if_txnid_non_committed(WT_SESSION_IMPL *session, uint64_t txnid) +__rollback_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) { WT_CONNECTION_IMPL *conn; - bool found; conn = S2C(session); - /* If not recovery then assume all the data as committed. */ + /* If not recovery then assume all the data as visible. */ if (!F_ISSET(conn, WT_CONN_RECOVERING)) - return (false); + return (true); /* * Only full checkpoint writes the metadata with snapshot. If the recovered checkpoint snapshot - * details are zero then return false i.e, updates are committed. 
- */ - if (conn->recovery_ckpt_snap_min == 0 && conn->recovery_ckpt_snap_max == 0) - return (false); - - /* - * Snapshot data: - * ids < recovery_ckpt_snap_min are committed, - * ids > recovery_ckpt_snap_max are non committed, - * everything else is committed unless it is found in the recovery_ckpt_snapshot array. + * details are none then return true, i.e., updates are visible. */ - if (txnid < conn->recovery_ckpt_snap_min) - return (false); - else if (txnid > conn->recovery_ckpt_snap_max) + if (conn->recovery_ckpt_snap_min == WT_TXN_NONE && conn->recovery_ckpt_snap_max == WT_TXN_NONE) return (true); - /* - * Return false when the recovery snapshot count is 0, which means there is no uncommitted - * transaction ids. - */ - if (conn->recovery_ckpt_snapshot_count == 0) - return (false); - - WT_BINARY_SEARCH( - txnid, conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count, found); - - return (found); + return ( + __wt_txn_visible_id_snapshot(id, conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max, + conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count)); } /* @@ -484,7 +464,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page * Stop processing when we find a stable update according to the given timestamp and * transaction id. */ - if (!__rollback_check_if_txnid_non_committed(session, hs_tw->start_txn) && + if (__rollback_txn_visible_id(session, hs_tw->start_txn) && hs_durable_ts <= rollback_timestamp) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "history store update valid with start timestamp: %s, durable timestamp: %s, stop " @@ -562,7 +542,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page * We have a tombstone on the original update chain and it is stable according to the * timestamp and txnid, we need to restore that as well.
*/ - if (!__rollback_check_if_txnid_non_committed(session, hs_tw->stop_txn) && + if (__rollback_txn_visible_id(session, hs_tw->stop_txn) && hs_stop_durable_ts <= rollback_timestamp) { /* * The restoring tombstone timestamp must be zero or less than previous update start @@ -614,6 +594,9 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page /* Finally remove that update from history store. */ if (valid_update_found) { + /* Avoid freeing the updates while still in use if hs_cursor->remove fails. */ + upd = tombstone = NULL; + WT_ERR(hs_cursor->remove(hs_cursor)); WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts); @@ -692,7 +675,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_COL *cip, W } else return (0); } else if (vpack->tw.durable_start_ts > rollback_timestamp || - __rollback_check_if_txnid_non_committed(session, vpack->tw.start_txn) || + !__rollback_txn_visible_id(session, vpack->tw.start_txn) || (!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "on-disk update aborted with start durable timestamp: %s, commit timestamp: %s, " @@ -713,7 +696,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_COL *cip, W } } else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && (vpack->tw.durable_stop_ts > rollback_timestamp || - __rollback_check_if_txnid_non_committed(session, vpack->tw.stop_txn) || prepared)) { + !__rollback_txn_visible_id(session, vpack->tw.stop_txn) || prepared)) { /* * For prepared transactions, it is possible that both the on-disk key start and stop time * windows can be the same. 
To abort these updates, check for any stable update from history @@ -805,7 +788,7 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r WT_PAGE *page; uint64_t recno, rle; uint32_t i, j; - bool stable_update_found; + bool is_ondisk_stable, stable_update_found; page = ref->page; /* @@ -824,26 +807,46 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r WT_RET(__rollback_abort_insert_list( session, page, ins, rollback_timestamp, &stable_update_found)); - if (!stable_update_found && page->dsk != NULL) { + if (page->dsk != NULL) { + /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */ kcell = WT_COL_PTR(page, cip); __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack); rle = __wt_cell_rle(&unpack); - if (unpack.type != WT_CELL_DEL) { + + /* + * If we found a stable update on the insert list, this key needs no further attention. + * Any other keys in this cell with stable updates also do not require attention. But + * beyond that, the on-disk value must be older than + * the update we found. That means it too is stable(*), so any keys in the cell that + * _don't_ have stable updates on the update list don't need further attention either. + * (And any unstable updates were just handled above.) Thus we can skip iterating over + * the cell. + * + * Furthermore, if the cell is deleted it must be + * itself stable, because cells only appear as deleted if there is no older value that + * might need to be restored. We can skip iterating over the cell. + * + * (*) Either that, or the update is not timestamped, in which case the on-disk value + * might not be stable but the non-timestamp update will hide it until the next + * reconciliation and then overwrite it. 
+ */ + if (stable_update_found) + WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); + else if (unpack.type == WT_CELL_DEL) + WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped); + else { for (j = 0; j < rle; j++) { - WT_RET(__rollback_abort_ondisk_kv(session, ref, cip, NULL, rollback_timestamp, - recno + j, &stable_update_found)); - /* Skip processing all RLE if the on-disk version is stable. */ - if (stable_update_found) { + WT_RET(__rollback_abort_ondisk_kv( + session, ref, cip, NULL, rollback_timestamp, recno + j, &is_ondisk_stable)); + /* We can stop right away if the on-disk version is stable. */ + if (is_ondisk_stable) { if (rle > 1) WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); break; } } - } else - WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped); + } recno += rle; - } else { - recno++; } } @@ -1214,17 +1217,16 @@ __rollback_to_stable_check(WT_SESSION_IMPL *session) bool txn_active; /* - * Help the user comply with the requirement that there are no concurrent operations. Protect - * against spurious conflicts with the sweep server: we exclude it from running concurrent with - * rolling back the history store contents. + * Help the user comply with the requirement that there are no concurrent user operations. It is + * okay to have a transaction in prepared state. 
*/ - ret = __wt_txn_activity_check(session, &txn_active); + txn_active = __wt_txn_user_active(session); #ifdef HAVE_DIAGNOSTIC if (txn_active) WT_TRET(__wt_verbose_dump_txn(session)); #endif - if (ret == 0 && txn_active) + if (txn_active) WT_RET_MSG(session, EINVAL, "rollback_to_stable illegal with active transactions"); return (ret); @@ -1622,85 +1624,16 @@ err: static int __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt) { - WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN_GLOBAL *txn_global; wt_timestamp_t rollback_timestamp; - size_t retries; - uint32_t cache_flags; char ts_string[2][WT_TS_INT_STRING_SIZE]; conn = S2C(session); - cache = conn->cache; txn_global = &conn->txn_global; /* - * We're about to run a check for active transactions in the system to stop users from shooting - * themselves in the foot. Eviction threads may interfere with this check if they involve writes - * to the history store so we need to wait until the system is no longer evicting content. - * - * If we detect active evictions, we should wait a millisecond and check again. If we're waiting - * for evictions to quiesce for more than 2 minutes, we should give up on waiting and proceed - * with the transaction check anyway. - */ -#define WT_RTS_EVICT_MAX_RETRIES (2 * WT_MINUTE * WT_THOUSAND) - /* - * These are the types of evictions that can result in a history store operation. Since we want - * to avoid these happening concurrently with our check, we need to look for these flags. - */ -#define WT_CACHE_EVICT_HS_FLAGS \ - (WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_UPDATES | WT_CACHE_EVICT_URGENT) - for (retries = 0; retries < WT_RTS_EVICT_MAX_RETRIES; ++retries) { - /* - * If we're shutting down or running with an in-memory configuration, we aren't at risk of - * racing with history store transactions. - */ - if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP | WT_CONN_IN_MEMORY)) - break; - - /* Check whether eviction has quiesced. 
*/ - WT_ORDERED_READ(cache_flags, cache->flags); - if (!FLD_ISSET(cache_flags, WT_CACHE_EVICT_HS_FLAGS)) { - /* - * If we we find that the eviction flags are unset, interrupt the eviction server and - * acquire the pass lock to stop the server from setting the eviction flags AFTER this - * point and racing with our check. - */ - (void)__wt_atomic_addv32(&cache->pass_intr, 1); - __wt_spin_lock(session, &cache->evict_pass_lock); - (void)__wt_atomic_subv32(&cache->pass_intr, 1); - FLD_SET(session->lock_flags, WT_SESSION_LOCKED_PASS); - - /* - * Check that the flags didn't get set in between when we checked and when we acquired - * the server lock. If it did get set, release the locks and keep trying. If they're - * still unset, break out of this loop and commence our check. - */ - WT_ORDERED_READ(cache_flags, cache->flags); - if (!FLD_ISSET(cache_flags, WT_CACHE_EVICT_HS_FLAGS)) - break; - else { - __wt_spin_unlock(session, &cache->evict_pass_lock); - FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS); - } - } - /* If we're retrying, pause for a millisecond and let eviction make some progress. */ - __wt_sleep(0, WT_THOUSAND); - } - if (retries == WT_RTS_EVICT_MAX_RETRIES) { - WT_ERR(__wt_msg( - session, "timed out waiting for eviction to quiesce, running rollback to stable")); - /* - * FIXME: WT-7877 RTS fails when there are active transactions running in parallel to it. - * Waiting in a loop for eviction to quiesce is not efficient in some scenarios where the - * cache is not cleared in 2 minutes. Enable the following assert and - * test_rollback_to_stable22.py when the cache issue is addressed. - */ - /* WT_ASSERT(session, false && "Timed out waiting for eviction to quiesce prior to rts"); */ - } - - /* * Rollback to stable should ignore tombstones in the history store since it needs to scan the * entire table sequentially. 
*/ @@ -1708,11 +1641,6 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt) WT_ERR(__rollback_to_stable_check(session)); - if (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_PASS)) { - __wt_spin_unlock(session, &cache->evict_pass_lock); - FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS); - } - /* * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even * though the stable timestamp isn't supposed to be updated while rolling back, accessing it @@ -1746,10 +1674,6 @@ __rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt) WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); err: - if (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_PASS)) { - __wt_spin_unlock(session, &cache->evict_pass_lock); - FLD_CLR(session->lock_flags, WT_SESSION_LOCKED_PASS); - } F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE); return (ret); } diff --git a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c index b7e7a9a952b..a2445225e2e 100644 --- a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c +++ b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c @@ -36,12 +36,26 @@ static int real_checkpointer(void); static int verify_consistency(WT_SESSION *, char *); /* + * set_stable -- + * Set the stable timestamp from g.ts_stable. + */ +static void +set_stable(void) +{ + char buf[128]; + + testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%x", g.ts_stable)); + testutil_check(g.conn->set_timestamp(g.conn, buf)); +} + +/* * start_checkpoints -- * Responsible for creating the checkpoint thread. 
*/ void start_checkpoints(void) { + set_stable(); testutil_check(__wt_thread_create(NULL, &g.checkpoint_thread, checkpointer, NULL)); if (g.use_timestamps) { testutil_check(__wt_rwlock_init(NULL, &g.clock_lock)); @@ -74,7 +88,6 @@ clock_thread(void *arg) WT_SESSION *wt_session; WT_SESSION_IMPL *session; uint64_t delay; - char buf[128]; WT_UNUSED(arg); @@ -85,8 +98,7 @@ clock_thread(void *arg) while (g.running) { __wt_writelock(session, &g.clock_lock); ++g.ts_stable; - testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%x", g.ts_stable)); - testutil_check(g.conn->set_timestamp(g.conn, buf)); + set_stable(); if (g.ts_stable % 997 == 0) { /* * Random value between 6 and 10 seconds. diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c index 90378cc0de9..1d4c99a2b03 100644 --- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c +++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c @@ -140,7 +140,8 @@ main(int argc, char *argv[]) testutil_work_dir_from_path(g.home, 512, working_dir); - g.ts_stable = 0; + /* Start time at 1 since 0 is not a valid timestamp. 
*/ + g.ts_stable = 1; printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid()); for (cnt = 1; (runs == 0 || cnt <= runs) && g.status == 0; ++cnt) { diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am index f5c3eaed361..f5da81e75ed 100644 --- a/src/third_party/wiredtiger/test/csuite/Makefile.am +++ b/src/third_party/wiredtiger/test/csuite/Makefile.am @@ -37,7 +37,7 @@ all_TESTS += timestamp_abort/smoke.sh test_truncated_log_SOURCES = truncated_log/main.c noinst_PROGRAMS += test_truncated_log -all_TESTS += test_truncated_log +all_TESTS += truncated_log/smoke.sh test_wt1965_col_efficiency_SOURCES = wt1965_col_efficiency/main.c noinst_PROGRAMS += test_wt1965_col_efficiency @@ -49,8 +49,7 @@ all_TESTS += test_wt2403_lsm_workload test_wt2246_col_append_SOURCES = wt2246_col_append/main.c noinst_PROGRAMS += test_wt2246_col_append -# Temporarily disabled (WT-5790) -# all_TESTS += test_wt2246_col_append +all_TESTS += test_wt2246_col_append test_wt2323_join_visibility_SOURCES = wt2323_join_visibility/main.c noinst_PROGRAMS += test_wt2323_join_visibility @@ -146,11 +145,11 @@ all_TESTS += test_wt4891_meta_ckptlist_get_alloc test_wt6185_modify_ts_SOURCES = wt6185_modify_ts/main.c noinst_PROGRAMS += test_wt6185_modify_ts -all_TESTS += test_wt6185_modify_ts +all_TESTS += wt6185_modify_ts/smoke.sh test_wt6616_checkpoint_oldest_ts_SOURCES = wt6616_checkpoint_oldest_ts/main.c noinst_PROGRAMS += test_wt6616_checkpoint_oldest_ts -all_TESTS += test_wt6616_checkpoint_oldest_ts +all_TESTS += wt6616_checkpoint_oldest_ts/smoke.sh # Run this during a "make check" smoke test. 
TESTS = $(all_TESTS) diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c index e16c7d7e79e..acf14348c0a 100644 --- a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c @@ -75,7 +75,7 @@ static const char *const uri_collection = "table:collection"; static const char *const ckpt_file = "checkpoint_done"; -static bool compat, inmem, stable_set, use_ts, use_txn; +static bool compat, inmem, stable_set, use_columns, use_ts, use_txn; static volatile uint64_t global_ts = 1; static volatile uint64_t uid = 1; typedef struct { @@ -96,9 +96,10 @@ static volatile THREAD_TS th_ts[MAX_TH]; /* * A minimum width of 10, along with zero filling, means that all the keys sort according to their - * integer value, making each thread's key space distinct. + * integer value, making each thread's key space distinct. For column-store we just use the integer + * values and that has the same effect. 
*/ -#define KEY_FORMAT ("%010" PRIu64) +#define ROW_KEY_FORMAT ("%010" PRIu64) typedef struct { uint64_t absent_key; /* Last absent key */ @@ -670,14 +671,20 @@ thread_run(void *arg) } if (use_ts) stable_ts = __wt_atomic_addv64(&global_ts, 1); - testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, i)); testutil_check(session->begin_transaction(session, NULL)); if (use_prep) testutil_check(oplog_session->begin_transaction(oplog_session, NULL)); - cur_coll->set_key(cur_coll, kname); - cur_local->set_key(cur_local, kname); - cur_oplog->set_key(cur_oplog, kname); + if (use_columns) { + cur_coll->set_key(cur_coll, i + 1); + cur_local->set_key(cur_local, i + 1); + cur_oplog->set_key(cur_oplog, i + 1); + } else { + testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, i)); + cur_coll->set_key(cur_coll, kname); + cur_local->set_key(cur_local, kname); + cur_oplog->set_key(cur_oplog, kname); + } /* * Put an informative string into the value so that it can be viewed well in a binary dump. */ @@ -764,7 +771,7 @@ run_workload(uint32_t nth) THREAD_DATA *td; wt_thread_t *thr; uint32_t ckpt_id, i, ts_id; - char envconf[512]; + char envconf[512], tableconf[128]; thr = dcalloc(nth + 2, sizeof(*thr)); td = dcalloc(nth + 2, sizeof(THREAD_DATA)); @@ -783,10 +790,13 @@ run_workload(uint32_t nth) /* * Create all the tables. */ - testutil_check( - session->create(session, uri_collection, "key_format=S,value_format=u,log=(enabled=false)")); - testutil_check(session->create(session, uri_local, "key_format=S,value_format=u")); - testutil_check(session->create(session, uri_oplog, "key_format=S,value_format=u")); + testutil_check(__wt_snprintf(tableconf, sizeof(tableconf), + "key_format=%s,value_format=u,log=(enabled=false)", use_columns ? "r" : "S")); + testutil_check(session->create(session, uri_collection, tableconf)); + testutil_check(__wt_snprintf( + tableconf, sizeof(tableconf), "key_format=%s,value_format=u", use_columns ? 
"r" : "S")); + testutil_check(session->create(session, uri_local, tableconf)); + testutil_check(session->create(session, uri_oplog, tableconf)); /* * Don't log the stable timestamp table so that we know what timestamp was stored at the * checkpoint. @@ -909,11 +919,15 @@ main(int argc, char *argv[]) verify_only = false; working_dir = "WT_TEST.schema-abort"; - while ((ch = __wt_getopt(progname, argc, argv, "Ch:mT:t:vxz")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "Cch:mT:t:vxz")) != EOF) switch (ch) { case 'C': compat = true; break; + case 'c': + /* Variable-length columns only; fixed would require considerable changes */ + use_columns = true; + break; case 'h': working_dir = __wt_optarg; break; @@ -1087,10 +1101,16 @@ main(int argc, char *argv[]) key, last_key); break; } - testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, key)); - cur_coll->set_key(cur_coll, kname); - cur_local->set_key(cur_local, kname); - cur_oplog->set_key(cur_oplog, kname); + if (use_columns) { + cur_coll->set_key(cur_coll, key + 1); + cur_local->set_key(cur_local, key + 1); + cur_oplog->set_key(cur_oplog, key + 1); + } else { + testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, key)); + cur_coll->set_key(cur_coll, kname); + cur_local->set_key(cur_local, kname); + cur_oplog->set_key(cur_oplog, kname); + } /* * The collection table should always only have the data as of the checkpoint. 
*/ diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh index 5e82ae180bc..e7d21ec30e6 100755 --- a/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh +++ b/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh @@ -21,6 +21,14 @@ $TEST_WRAPPER $test_bin -t 10 -T 5 $TEST_WRAPPER $test_bin -m -t 10 -T 5 $TEST_WRAPPER $test_bin -C -t 10 -T 5 $TEST_WRAPPER $test_bin -C -m -t 10 -T 5 + +$TEST_WRAPPER $test_bin -c -t 10 -T 5 +$TEST_WRAPPER $test_bin -c -m -t 10 -T 5 +$TEST_WRAPPER $test_bin -c -C -t 10 -T 5 +$TEST_WRAPPER $test_bin -c -C -m -t 10 -T 5 + # FIXME: In WT-6116 the test is failing if timestamps are turned off. #$TEST_WRAPPER $test_bin -m -t 10 -T 5 -z +#$TEST_WRAPPER $test_bin -c -m -t 10 -T 5 -z $TEST_WRAPPER $test_bin -m -t 10 -T 5 -x +$TEST_WRAPPER $test_bin -c -m -t 10 -T 5 -x diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c index 6fa41f0d82c..1d59222104b 100644 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c @@ -79,7 +79,7 @@ static const char *const uri_shadow = "shadow"; static const char *const ckpt_file = "checkpoint_done"; -static bool compat, inmem, stress, use_ts; +static bool columns, compat, inmem, stress, use_ts; static volatile uint64_t global_ts = 1; /* @@ -107,9 +107,10 @@ static volatile uint64_t global_ts = 1; /* * A minimum width of 10, along with zero filling, means that all the keys sort according to their - * integer value, making each thread's key space distinct. + * integer value, making each thread's key space distinct. For column-store we just use the integer + * values and that has the same effect. 
*/ -#define KEY_FORMAT ("%010" PRIu64) +#define KEY_STRINGFORMAT ("%010" PRIu64) typedef struct { uint64_t absent_key; /* Last absent key */ @@ -334,8 +335,6 @@ thread_run(void *arg) printf("Thread %" PRIu32 " starts at %" PRIu64 "\n", td->info, td->start); active_ts = 0; for (i = td->start;; ++i) { - testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, i)); - testutil_check(session->begin_transaction(session, NULL)); if (use_prep) testutil_check(prepared_session->begin_transaction(prepared_session, NULL)); @@ -354,10 +353,18 @@ thread_run(void *arg) testutil_check(pthread_rwlock_unlock(&ts_lock)); } - cur_coll->set_key(cur_coll, kname); - cur_local->set_key(cur_local, kname); - cur_oplog->set_key(cur_oplog, kname); - cur_shadow->set_key(cur_shadow, kname); + if (columns) { + cur_coll->set_key(cur_coll, i + 1); + cur_local->set_key(cur_local, i + 1); + cur_oplog->set_key(cur_oplog, i + 1); + cur_shadow->set_key(cur_shadow, i + 1); + } else { + testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_STRINGFORMAT, i)); + cur_coll->set_key(cur_coll, kname); + cur_local->set_key(cur_local, kname); + cur_oplog->set_key(cur_oplog, kname); + cur_shadow->set_key(cur_shadow, kname); + } /* * Put an informative string into the value so that it can be viewed well in a binary dump. */ @@ -459,6 +466,7 @@ run_workload(uint32_t nth) wt_thread_t *thr; uint32_t cache_mb, ckpt_id, i, ts_id; char envconf[512], uri[128]; + const char *table_config, *table_config_nolog; thr = dcalloc(nth + 2, sizeof(*thr)); td = dcalloc(nth + 2, sizeof(THREAD_DATA)); @@ -495,19 +503,25 @@ run_workload(uint32_t nth) printf("wiredtiger_open configuration: %s\n", envconf); testutil_check(wiredtiger_open(NULL, NULL, envconf, &conn)); testutil_check(conn->open_session(conn, NULL, NULL, &session)); + /* * Create all the tables. 
*/ + if (columns) { + table_config_nolog = "key_format=r,value_format=u,log=(enabled=false)"; + table_config = "key_format=r,value_format=u"; + } else { + table_config_nolog = "key_format=S,value_format=u,log=(enabled=false)"; + table_config = "key_format=S,value_format=u"; + } testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_collection)); - testutil_check( - session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)")); + testutil_check(session->create(session, uri, table_config_nolog)); testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow)); - testutil_check( - session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)")); + testutil_check(session->create(session, uri, table_config_nolog)); testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_local)); - testutil_check(session->create(session, uri, "key_format=S,value_format=u")); + testutil_check(session->create(session, uri, table_config)); testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_oplog)); - testutil_check(session->create(session, uri, "key_format=S,value_format=u")); + testutil_check(session->create(session, uri, table_config)); /* * Don't log the stable timestamp table so that we know what timestamp was stored at the * checkpoint. 
@@ -616,7 +630,7 @@ main(int argc, char *argv[]) (void)testutil_set_progname(argv); - compat = inmem = stress = false; + columns = compat = inmem = stress = false; use_ts = true; nth = MIN_TH; rand_th = rand_time = true; @@ -624,11 +638,15 @@ main(int argc, char *argv[]) verify_only = false; working_dir = "WT_TEST.timestamp-abort"; - while ((ch = __wt_getopt(progname, argc, argv, "Ch:LmsT:t:vz")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "Cch:LmsT:t:vz")) != EOF) switch (ch) { case 'C': compat = true; break; + case 'c': + /* Variable-length columns only (for now) */ + columns = true; + break; case 'h': working_dir = __wt_optarg; break; @@ -699,9 +717,9 @@ main(int argc, char *argv[]) compat ? "true" : "false", inmem ? "true" : "false", stress ? "true" : "false", use_ts ? "true" : "false"); printf("Parent: Create %" PRIu32 " threads; sleep %" PRIu32 " seconds\n", nth, timeout); - printf("CONFIG: %s%s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname, - compat ? " -C" : "", inmem ? " -m" : "", stress ? " -s" : "", !use_ts ? " -z" : "", - working_dir, nth, timeout); + printf("CONFIG: %s%s%s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname, + compat ? " -C" : "", columns ? " -c" : "", inmem ? " -m" : "", stress ? " -s" : "", + !use_ts ? " -z" : "", working_dir, nth, timeout); /* * Fork a child to insert as many items. We will then randomly kill the child, run recovery * and make sure all items we wrote exist after recovery runs. 
@@ -823,11 +841,20 @@ main(int argc, char *argv[]) key, last_key); break; } - testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, key)); - cur_coll->set_key(cur_coll, kname); - cur_local->set_key(cur_local, kname); - cur_oplog->set_key(cur_oplog, kname); - cur_shadow->set_key(cur_shadow, kname); + + if (columns) { + cur_coll->set_key(cur_coll, key + 1); + cur_local->set_key(cur_local, key + 1); + cur_oplog->set_key(cur_oplog, key + 1); + cur_shadow->set_key(cur_shadow, key + 1); + } else { + testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_STRINGFORMAT, key)); + cur_coll->set_key(cur_coll, kname); + cur_local->set_key(cur_local, kname); + cur_oplog->set_key(cur_oplog, kname); + cur_shadow->set_key(cur_shadow, kname); + } + /* * The collection table should always only have the data as of the checkpoint. The * shadow table should always have the exact same data (or not) as the collection table, diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh index 18d7f9b8dae..b2c70340f4c 100755 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh @@ -23,8 +23,12 @@ then fi $TEST_WRAPPER $test_bin $default_test_args +$TEST_WRAPPER $test_bin $default_test_args -c #$TEST_WRAPPER $test_bin $default_test_args -L $TEST_WRAPPER $test_bin -m $default_test_args +$TEST_WRAPPER $test_bin -m $default_test_args -c #$TEST_WRAPPER $test_bin -m $default_test_args -L $TEST_WRAPPER $test_bin -C $default_test_args +$TEST_WRAPPER $test_bin -C $default_test_args -c $TEST_WRAPPER $test_bin -C -m $default_test_args +$TEST_WRAPPER $test_bin -C -m $default_test_args -c diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c index d46b75d48c9..fa45e573781 100644 --- a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c +++ 
b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c @@ -32,6 +32,7 @@ static char home[1024]; /* Program working dir */ static const char *const uri = "table:main"; +static bool use_columns = false; #define RECORDS_FILE "records" @@ -128,8 +129,14 @@ fill_db(void) WT_SESSION *session; uint32_t i, max_key, min_key, units, unused; char k[K_SIZE], v[V_SIZE]; + const char *table_config; bool first; + if (use_columns) + table_config = "key_format=r,value_format=S"; + else + table_config = "key_format=S,value_format=S"; + /* * Run in the home directory so that the records file is in there too. */ @@ -137,7 +144,7 @@ fill_db(void) testutil_die(errno, "chdir: %s", home); testutil_check(wiredtiger_open(NULL, NULL, ENV_CONFIG, &conn)); testutil_check(conn->open_session(conn, NULL, NULL, &session)); - testutil_check(session->create(session, uri, "key_format=S,value_format=S")); + testutil_check(session->create(session, uri, table_config)); testutil_check(session->open_cursor(session, uri, NULL, NULL, &cursor)); /* @@ -164,10 +171,14 @@ fill_db(void) max_key = min_key * 2; first = true; for (i = 0; i < max_key; ++i) { - testutil_check(__wt_snprintf(k, sizeof(k), "key%03d", (int)i)); + if (use_columns) + cursor->set_key(cursor, i + 1); + else { + testutil_check(__wt_snprintf(k, sizeof(k), "key%03" PRIu32, i)); + cursor->set_key(cursor, k); + } testutil_check( - __wt_snprintf(v, sizeof(v), "value%0*d", (int)(V_SIZE - (strlen("value") + 1)), (int)i)); - cursor->set_key(cursor, k); + __wt_snprintf(v, sizeof(v), "value%0*" PRIu32, (int)(V_SIZE - (strlen("value") + 1)), i)); cursor->set_value(cursor, v); testutil_check(cursor->insert(cursor)); @@ -230,8 +241,12 @@ main(int argc, char *argv[]) (void)testutil_set_progname(argv); working_dir = "WT_TEST.truncated-log"; - while ((ch = __wt_getopt(progname, argc, argv, "h:")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "ch:")) != EOF) switch (ch) { + case 'c': + /* Variable-length columns only (for now) */ + 
use_columns = true; + break; case 'h': working_dir = __wt_optarg; break; diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh b/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh new file mode 100755 index 00000000000..0079adf0340 --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/truncated_log/smoke.sh @@ -0,0 +1,20 @@ +#! /bin/sh + +set -e + +# Smoke-test truncated_log as part of running "make check". + +if [ -n "$1" ] +then + # If the test binary is passed in manually. + test_bin=$1 +else + # If $top_builddir/$top_srcdir aren't set, default to building in build_posix + # and running in test/csuite. + top_builddir=${top_builddir:-../../build_posix} + top_srcdir=${top_srcdir:-../..} + test_bin=$top_builddir/test/csuite/test_truncated_log +fi + +$TEST_WRAPPER $test_bin +$TEST_WRAPPER $test_bin -c diff --git a/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c b/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c index 6b54402fe66..307f47578e3 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c @@ -89,8 +89,8 @@ int main(int argc, char *argv[]) { WT_SESSION *session; + wt_thread_t idlist[100]; clock_t ce, cs; - pthread_t idlist[100]; uint64_t i, id; char buf[100]; @@ -125,15 +125,16 @@ main(int argc, char *argv[]) (void)signal(SIGINT, onsig); + memset(idlist, 0, sizeof(idlist)); cs = clock(); id = 0; for (i = 0; i < opts->n_append_threads; ++i, ++id) { printf("append: %" PRIu64 "\n", id); - testutil_check(pthread_create(&idlist[id], NULL, thread_append, opts)); + testutil_check(__wt_thread_create(NULL, &idlist[id], thread_append, opts)); } for (i = 0; i < id; ++i) - testutil_check(pthread_join(idlist[i], NULL)); + testutil_check(__wt_thread_join(NULL, &idlist[i])); ce = clock(); printf("%" PRIu64 "M records: %.2lf processor seconds\n", opts->max_inserted_id / MILLION, diff --git 
a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c index 1f2e824047b..02205c88429 100644 --- a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/main.c @@ -50,7 +50,10 @@ static u_int tnext; static uint64_t ts; /* Current timestamp. */ -static char key[100], modify_repl[256], tmp[4 * 1024]; +static char keystr[100], modify_repl[256], tmp[4 * 1024]; +static uint64_t keyrecno; + +static bool use_columns = false; /* * trace -- @@ -117,6 +120,32 @@ mmrand(u_int min, u_int max) } /* + * change_key -- + * Switch to a different key. + */ +static void +change_key(u_int n) +{ + if (use_columns) + keyrecno = n + 1; + else + testutil_check(__wt_snprintf(keystr, sizeof(keystr), "%010u.key", n)); +} + +/* + * set_key -- + * Set the current key in the cursor. + */ +static void +set_key(WT_CURSOR *c) +{ + if (use_columns) + c->set_key(c, keyrecno); + else + c->set_key(c, keystr); +} + +/* * modify_repl_init -- * Initialize the replacement information. */ @@ -181,13 +210,13 @@ modify(WT_SESSION *session, WT_CURSOR *c) for (cnt = loop = 1; loop < 5; ++cnt, ++loop) if (mmrand(1, 10) <= 8) { modify_build(entries, &nentries, cnt); - c->set_key(c, key); + set_key(c); testutil_check(c->modify(c, entries, nentries)); } /* Commit 90% of the time, else rollback. 
*/ if (mmrand(1, 10) != 1) { - c->set_key(c, key); + set_key(c); testutil_check(c->search(c)); testutil_check(c->get_value(c, &v)); free(list[lnext].v); @@ -223,7 +252,7 @@ repeat(WT_SESSION *session, WT_CURSOR *c) testutil_check(__wt_snprintf(tmp, sizeof(tmp), "read_timestamp=%" PRIx64, list[i].ts)); testutil_check(session->timestamp_transaction(session, tmp)); - c->set_key(c, key); + set_key(c); testutil_check(c->search(c)); testutil_check(c->get_value(c, &v)); @@ -246,7 +275,7 @@ evict(WT_CURSOR *c) { trace("%s", "eviction"); - c->set_key(c, key); + set_key(c); testutil_check(c->search(c)); F_SET(c, WT_CURSTD_DEBUG_RESET_EVICT); testutil_check(c->reset(c)); @@ -286,7 +315,7 @@ main(int argc, char *argv[]) WT_SESSION *session; u_int i, j; int ch; - char path[1024], value[VALUE_SIZE]; + char path[1024], table_config[128], value[VALUE_SIZE]; const char *home, *v; bool no_checkpoint, no_eviction; @@ -298,8 +327,12 @@ main(int argc, char *argv[]) no_checkpoint = no_eviction = false; home = "WT_TEST.wt6185_modify_ts"; - while ((ch = __wt_getopt(progname, argc, argv, "ceh:S:")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "Cceh:S:")) != EOF) switch (ch) { + case 'C': + /* Variable-length columns only (for now anyway) */ + use_columns = true; + break; case 'c': no_checkpoint = true; break; @@ -322,14 +355,17 @@ main(int argc, char *argv[]) testutil_work_dir_from_path(path, sizeof(path), home); testutil_make_work_dir(path); + testutil_check(__wt_snprintf( + table_config, sizeof(table_config), "key_format=%s,value_format=S", use_columns ? "r" : "S")); + /* Load 100 records. 
*/ testutil_check(wiredtiger_open(path, NULL, "create", &conn)); testutil_check(conn->open_session(conn, NULL, NULL, &session)); - testutil_check(session->create(session, "file:xxx", "key_format=S,value_format=S")); + testutil_check(session->create(session, "file:xxx", table_config)); testutil_check(session->open_cursor(session, "file:xxx", NULL, NULL, &c)); for (i = 0; i <= 100; ++i) { - testutil_check(__wt_snprintf(key, sizeof(key), "%010u.key", i)); - c->set_key(c, key); + change_key(i); + set_key(c); SET_VALUE(i, value); c->set_value(c, value); testutil_check(c->insert(c)); @@ -341,8 +377,8 @@ main(int argc, char *argv[]) testutil_check(conn->open_session(conn, NULL, NULL, &session)); testutil_check(session->create(session, "file:xxx", NULL)); testutil_check(session->open_cursor(session, "file:xxx", NULL, NULL, &c)); - testutil_check(__wt_snprintf(key, sizeof(key), "%010d.key", KEYNO)); - c->set_key(c, key); + change_key(KEYNO); + set_key(c); testutil_check(c->search(c)); testutil_check(c->get_value(c, &v)); SET_VALUE(KEYNO, value); diff --git a/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh new file mode 100755 index 00000000000..b317eeeb2ed --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/wt6185_modify_ts/smoke.sh @@ -0,0 +1,21 @@ +#! /bin/sh + +set -e + +# Smoke-test wt6185_modify_ts as part of running "make check". + +if [ -n "$1" ] +then + # If the test binary is passed in manually. + test_bin=$1 +else + # If $top_builddir/$top_srcdir aren't set, default to building in build_posix + # and running in test/csuite. 
+ top_builddir=${top_builddir:-../../build_posix} + top_srcdir=${top_srcdir:-../..} + test_bin=$top_builddir/test/csuite/test_wt6185_modify_ts +fi + +$TEST_WRAPPER $test_bin +$TEST_WRAPPER $test_bin -C + diff --git a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c index 5a75777aa78..2e9648efea0 100644 --- a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/main.c @@ -32,6 +32,7 @@ #include <signal.h> static char home[1024]; /* Program working dir */ +static bool use_columns = false; /* * Spin up a child process to do operations and checkpoint. For each set of operations on a key, @@ -48,7 +49,7 @@ static char home[1024]; /* Program working dir */ * recovery by reading without a timestamp. Whether it is possible to read historical versions based * on timestamps from a logged table after recovery is not defined and implemented yet. */ -#define KEY_FORMAT ("%010" PRIu64) +#define ROW_KEY_FORMAT ("%010" PRIu64) #define MAX_CKPT_INVL 5 /* Maximum interval between checkpoints */ #define MAX_DATA 1000 @@ -147,11 +148,14 @@ thread_run(void *arg) /* Insert and then delete the keys until we're killed. */ printf("Worker thread started.\n"); for (oldest_ts = 0, ts = 1;; ++ts) { - testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, ts)); + testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, ts)); /* Insert the same value for key and value. 
*/ testutil_check(session->begin_transaction(session, NULL)); - cursor->set_key(cursor, kname); + if (use_columns) + cursor->set_key(cursor, ts); + else + cursor->set_key(cursor, kname); data.data = kname; data.size = sizeof(kname); cursor->set_value(cursor, &data); @@ -193,7 +197,7 @@ run_workload(void) WT_SESSION *session; wt_thread_t *thr; uint32_t i; - char envconf[512]; + char envconf[512], tableconf[512]; thr = dcalloc(2, sizeof(*thr)); @@ -206,8 +210,9 @@ run_workload(void) testutil_check(conn->open_session(conn, NULL, NULL, &session)); /* Create the table. */ - testutil_check( - session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)")); + testutil_check(__wt_snprintf(tableconf, sizeof(tableconf), + "key_format=%s,value_format=u,log=(enabled=false)", use_columns ? "r" : "S")); + testutil_check(session->create(session, uri, tableconf)); testutil_check(session->close(session, NULL)); /* The checkpoint thread is added at the end. */ @@ -268,8 +273,12 @@ main(int argc, char *argv[]) timeout = MIN_TIME; working_dir = "WT_TEST.wt6616-checkpoint-oldest-ts"; - while ((ch = __wt_getopt(progname, argc, argv, "h:t:")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "ch:t:")) != EOF) switch (ch) { + case 'c': + /* Variable-length columns only (for now) */ + use_columns = true; + break; case 'h': working_dir = __wt_optarg; break; @@ -363,8 +372,11 @@ main(int argc, char *argv[]) for (ts = oldest_ts; ts <= stable_ts; ++ts) { testutil_check(__wt_snprintf(tscfg, sizeof(tscfg), "read_timestamp=%" PRIx64, ts)); testutil_check(session->begin_transaction(session, tscfg)); - testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, ts)); - cursor->set_key(cursor, kname); + testutil_check(__wt_snprintf(kname, sizeof(kname), ROW_KEY_FORMAT, ts)); + if (use_columns) + cursor->set_key(cursor, ts); + else + cursor->set_key(cursor, kname); ret = cursor->search(cursor); if (ret == WT_NOTFOUND) { fatal = true; diff --git 
a/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh new file mode 100755 index 00000000000..9b9cc997026 --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/wt6616_checkpoint_oldest_ts/smoke.sh @@ -0,0 +1,21 @@ +#! /bin/sh + +set -e + +# Smoke-test wt6616_checkpoint_oldest_ts as part of running "make check". + +if [ -n "$1" ] +then + # If the test binary is passed in manually. + test_bin=$1 +else + # If $top_builddir/$top_srcdir aren't set, default to building in build_posix + # and running in test/csuite. + top_builddir=${top_builddir:-../../build_posix} + top_srcdir=${top_srcdir:-../..} + test_bin=$top_builddir/test/csuite/test_wt6616_checkpoint_oldest_ts +fi + +$TEST_WRAPPER $test_bin +$TEST_WRAPPER $test_bin -c + diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 5bc5fa6580b..41362d91097 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -2580,6 +2580,17 @@ tasks: name: recovery-stress-test-3 tags: ["stress-test-3", "stress-test-zseries-3"] + - name: format-abort-recovery-stress-test + commands: + - command: timeout.update + params: + exec_timeout_secs: 2500 + - func: "get project" + - func: "compile wiredtiger with builtins" + - func: "format test script" + vars: + format_test_script_args: -a -t 30 + - name: many-dhandle-stress-test commands: - func: "get project" @@ -2830,6 +2841,7 @@ buildvariants: - name: ".stress-test-2" - name: ".stress-test-3" - name: ".stress-test-4" + - name: format-abort-recovery-stress-test - name: large-scale-tests display_name: "Large scale tests" @@ -2856,13 +2868,11 @@ buildvariants: run_on: - ubuntu1804-test expansions: - test_env_vars: LD_LIBRARY_PATH=$(pwd)/../../.libs + test_env_vars: LD_LIBRARY_PATH=$(pwd)/../../.libs PATH=/opt/mongodbtoolchain/v3/bin:$PATH make_command: 
PATH=/opt/mongodbtoolchain/v3/bin:$PATH make posix_configure_flags: --enable-silent-rules --enable-python --enable-zlib --enable-snappy --enable-strict --enable-static - test_env_vars: - PATH=/opt/mongodbtoolchain/v3/bin:$PATH tasks: - name: compile - name: cppsuite-hs-cleanup-stress diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index 88964902d98..33b8d7a9112 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -715,6 +715,8 @@ config_in_memory(void) return; if (config_is_perm("checkpoint")) return; + if (config_is_perm("format.abort")) + return; if (config_is_perm("import")) return; if (config_is_perm("logging")) diff --git a/src/third_party/wiredtiger/test/suite/test_hs18.py b/src/third_party/wiredtiger/test/suite/test_hs18.py index bcef53e4d17..5ed21e3c90a 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs18.py +++ b/src/third_party/wiredtiger/test/suite/test_hs18.py @@ -438,8 +438,6 @@ class test_hs18(wttest.WiredTigerTestCase): session_ts_reader = self.setUpSessionOpen(self.conn) cursor_ts_reader = session_ts_reader.open_cursor(uri) - self.skipTest('Skip this part of test_hs18 until WT-7931 is resolved') - # The ID of the session corresponds the value it should see. sessions = [] cursors = [] @@ -448,8 +446,6 @@ class test_hs18(wttest.WiredTigerTestCase): sessions.append(self.setUpSessionOpen(self.conn)) cursors.append(sessions[i].open_cursor(uri)) - value_junk = 'aaaaa' * 100 - values.append('f' * 10) values.append('a' + values[0]) values.append('b' + values[1]) @@ -485,11 +481,13 @@ class test_hs18(wttest.WiredTigerTestCase): # Start a long running transaction which could see modify 1. 
self.start_txn(sessions, cursors, values, 2) - # Insert a bunch of contents to fill the cache - for i in range(2000, 10000): - self.session.begin_transaction() - cursor[self.create_key(i)] = value_junk - self.session.commit_transaction() + # Evict the update using a debug cursor + cursor.reset() + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + evict_cursor.set_key(self.create_key(1)) + self.assertEqual(evict_cursor.search(), 0) + evict_cursor.reset() + evict_cursor.close() # Commit a modify without a timestamp on our original key self.session.begin_transaction() @@ -511,11 +509,13 @@ class test_hs18(wttest.WiredTigerTestCase): for i in range(0, 5): self.check_value(cursors[i], values[i]) - # Insert a bunch of other contents to trigger eviction - for i in range(10001, 11000): - self.session.begin_transaction() - cursor[self.create_key(i)] = value_junk - self.session.commit_transaction() + # Evict the update using a debug cursor + cursor.reset() + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + evict_cursor.set_key(self.create_key(1)) + self.assertEqual(evict_cursor.search(), 0) + evict_cursor.reset() + evict_cursor.close() # Check our values are still correct. for i in range(0, 5): diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py index f6ef52cc388..7c85800b070 100644 --- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable22.py @@ -43,8 +43,6 @@ class test_rollback_to_stable22(test_rollback_to_stable_base): nrows = 1000 nds = 10 - self.skipTest('Skip it until the fix is provided to handle concurrent internal transactions running in parallel.') - # Create a few tables and populate them with some initial data. 
# # Our way of preventing history store operations from interfering with rollback to stable's diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py new file mode 100644 index 00000000000..ea690506dc9 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable24.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import make_scenarios + +# test_rollback_to_stable24.py +# Exercise a recno-counting bug in column store. 
+# +# Prior to August 2021 a cell for which there's a pending stable update was counted (in the +# column-store RTS code) as having RLE count 1 regardless of what the actual count was. +# +# In order to exploit this we have to do janky things with timestamps, but I think they're +# allowable. +# +# Construct a cell with RLE count of 3 by writing 3 copies of aaaaaa at timestamp 10. +# Then at the next key write bbbbbb at timestamp 10 and cccccc at timestamp 50. +# Evict the page to reconcile it and produce the RLE cell. +# +# Then post an update to the first key of the RLE cell at timestamp 30 (to dddddd), and roll +# back to 40. +# +# Reading at 40, we should at that point see dddddd and two aaaaaa's followed by bbbbbb, but +# with the bad counting we get a key error on the second key. +# +# This happens because it goes to process key 4 but thinks it's on key 2; it finds that it +# needs to roll back the value it's looking at (the cccccc from timestamp 50) but because it +# thinks it's on key 2, it asks the history store for key 2 and finds nothing. (The bbbbbb +# from timestamp 10 is in the history store, but under key 4; there's nothing in the history +# store for key 2.) So it issues a tombstone, and issues it for key 2, so key 2 improperly +# disappears. +# +# Run this test on rows as well as columns to help make sure the test itself is valid (and +# stays so over time...) +class test_rollback_to_stable24(wttest.WiredTigerTestCase): + session_config = 'isolation=snapshot' + conn_config = 'in_memory=false' + + key_format_values = [ + ('column', dict(key_format='r')), + ('integer_row', dict(key_format='i')), + ] + + scenarios = make_scenarios(key_format_values) + + def test_rollback_to_stable24(self): + # Create a table without logging. + uri = "table:rollback_to_stable24" + format = 'key_format={},value_format=S'.format(self.key_format) + self.session.create(uri, format + ', log=(enabled=false)') + + # Pin oldest timestamp to 10. 
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10)) + + # Start stable timestamp at 10. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10)) + + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + value_c = "ccccc" * 100 + value_d = "ddddd" * 100 + + s = self.conn.open_session() + cursor = s.open_cursor(uri) + + # Write some keys at time 10. + s.begin_transaction() + cursor[1] = value_a + cursor[2] = value_a + cursor[3] = value_a + cursor[4] = value_b + s.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + + # Update key 4 at time 50. + s.begin_transaction() + cursor[4] = value_c + s.commit_transaction('commit_timestamp=' + self.timestamp_str(50)) + + cursor.close() + + # Evict the page to force reconciliation. + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + s.begin_transaction() + # Search the key to evict it. + v = evict_cursor[1] + self.assertEqual(v, value_a) + self.assertEqual(evict_cursor.reset(), 0) + s.rollback_transaction() + evict_cursor.close() + + # Now update key 1 at time 30. + cursor = s.open_cursor(uri) + s.begin_transaction() + cursor[1] = value_d + s.commit_transaction('commit_timestamp=' + self.timestamp_str(30)) + cursor.close() + + # Roll back to 40. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40)) + self.conn.rollback_to_stable() + + # Now read at 40. 
+ cursor = s.open_cursor(uri) + s.begin_transaction('read_timestamp=' + self.timestamp_str(40)) + self.assertEqual(cursor[1], value_d) + self.assertEqual(cursor[2], value_a) + self.assertEqual(cursor[3], value_a) + self.assertEqual(cursor[4], value_b) + s.rollback_transaction() + cursor.close() diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py new file mode 100644 index 00000000000..2d800a17d32 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable25.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
import wiredtiger, wttest
from wtscenario import make_scenarios, filter_scenarios

# test_rollback_to_stable25.py
# Check various scenarios relating to RLE cells in column-store.
#
# We write at three different timestamps:
#    10 - aaaaaa or none
#    20 - bbbbbb or delete or none
#    30 - cccccc or delete or none
#
# and we evict to push things to disk after any of these,
# and we roll back to either 15 or 25.
#
# The writes can be either uniform, heterogeneous, first key, middle key, or last key.
#
# We do this with a group of 5 keys 2..6. Keys 1 and 7 are written with zzzzzz at
# timestamp 5 and evicted to ensure that the group of keys we're using is isolated
# from other unused keys.
#
# This generates a lot of cases, but we filter pointless combinations and they run fast.

# Put these bits outside the class definition so they can be referred to both in class
# instances and in the scenario setup logic, which doesn't have a class instance yet.

my_rle_size = 5

def keys_of_write(write):
    """Return the keys touched by a write of the given shape.

    'u' (uniform) and 'h' (heterogeneous) touch every key in the group
    2 .. 2 + my_rle_size - 1; 'f' touches only the first key, 'm' only the
    middle key, and anything else (in practice 'l') only the last key.
    """
    if write == 'u' or write == 'h':
        return range(2, 2 + my_rle_size)
    elif write == 'f':
        return [2]
    elif write == 'm':
        return [2 + my_rle_size // 2]
    else:
        return [2 + my_rle_size - 1]

class test_rollback_to_stable25(wttest.WiredTigerTestCase):
    session_config = 'isolation=snapshot'
    conn_config = 'in_memory=false'

    write_10_values = [
        ('10u', dict(write_10='u')),
        ('10h', dict(write_10='h')),
        ('10f', dict(write_10='f')),
        ('10m', dict(write_10='m')),
        ('10l', dict(write_10='l')),
    ]
    type_10_values = [
        ('nil', dict(type_10=None)),
        ('upd', dict(type_10='upd')),
    ]

    write_20_values = [
        ('20u', dict(write_20='u')),
        ('20h', dict(write_20='h')),
        ('20f', dict(write_20='f')),
        ('20m', dict(write_20='m')),
        ('20l', dict(write_20='l')),
    ]
    type_20_values = [
        ('nil', dict(type_20=None)),
        ('upd', dict(type_20='upd')),
        ('del', dict(type_20='del')),
    ]

    write_30_values = [
        ('30u', dict(write_30='u')),
        ('30h', dict(write_30='h')),
        ('30f', dict(write_30='f')),
        ('30m', dict(write_30='m')),
        ('30l', dict(write_30='l')),
    ]
    type_30_values = [
        ('nil', dict(type_30=None)),
        ('upd', dict(type_30='upd')),
        ('del', dict(type_30='del')),
    ]

    evict_time_values = [
        ('chk10', dict(evict_time=10)),
        ('chk20', dict(evict_time=20)),
        ('chk30', dict(evict_time=30)),
    ]

    rollback_time_values = [
        ('roll15', dict(rollback_time=15)),
        ('roll25', dict(rollback_time=25)),
    ]

    def is_meaningful(name, vals):
        """Scenario filter: return True if this combination is worth running.

        Called as a plain function (no instance) while the class body is being
        evaluated; 'name' is the scenario name and 'vals' its value dictionary.
        """
        # Apply the per-timestamp rules in one loop so that every rule always
        # pairs type_N with write_N for the same N.
        for ts in (10, 20, 30):
            ty = vals['type_{}'.format(ts)]
            write = vals['write_{}'.format(ts)]
            # The last write at evict time should be uniform, to get an RLE cell.
            if vals['evict_time'] == ts and write != 'u':
                return False
            # If the type is nil, the value must be uniform.
            if ty is None and write != 'u':
                return False
            # Similarly, delete and heterogeneous doesn't make sense.
            if ty == 'del' and write == 'h':
                return False

        # Both 10 and 20 shouldn't be nil. That's equivalent to 10 and 30 being nil.
        if vals['type_10'] is None and vals['type_20'] is None:
            return False

        # Avoid cases that delete nonexistent values: replay the three writes
        # against the set of keys that currently exist.
        present = set()
        for ts in (10, 20, 30):
            ty = vals['type_{}'.format(ts)]
            if ty is None:
                continue
            for k in keys_of_write(vals['write_{}'.format(ts)]):
                if ty == 'upd':
                    present.add(k)
                elif ty == 'del':
                    if k not in present:
                        return False
                    present.discard(k)
        return True

    scenarios = filter_scenarios(make_scenarios(write_10_values, type_10_values,
                                                write_20_values, type_20_values,
                                                write_30_values, type_30_values,
                                                evict_time_values,
                                                rollback_time_values),
                                 is_meaningful)

    value_z = "zzzzz" * 10

    def writes(self, uri, s, expected, ty, write, value, ts):
        """Apply one round of writes at commit timestamp 'ts'.

        'ty' is None (do nothing), 'upd' (store 'value') or anything else
        (remove the key). 'write' selects the keys via keys_of_write(); for a
        heterogeneous write the key number is appended to the value so every
        key gets a distinct value. 'expected' is updated in place to mirror
        the table contents after the write.
        """
        if ty is None:
            # do nothing at all
            return
        cursor = s.open_cursor(uri)
        s.begin_transaction()
        for k in keys_of_write(write):
            if ty == 'upd':
                myval = value + str(k) if write == 'h' else value
                cursor[k] = myval
                expected[k] = myval
            else:
                cursor.set_key(k)
                cursor.remove()
                del expected[k]
        s.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
        cursor.close()

    def evict(self, uri, s):
        """Evict the page to force reconciliation."""
        evict_cursor = s.open_cursor(uri, None, "debug=(release_evict)")
        s.begin_transaction()
        # Search the key to evict it. Use both bookends.
        v = evict_cursor[1]
        self.assertEqual(v, self.value_z)
        v = evict_cursor[2 + my_rle_size]
        self.assertEqual(v, self.value_z)
        self.assertEqual(evict_cursor.reset(), 0)
        s.rollback_transaction()
        evict_cursor.close()

    def check(self, uri, s, ts, expected):
        """Read at timestamp 'ts' and verify the table matches 'expected'.

        Keys of the group that are absent from 'expected' must not be found.
        """
        cursor = s.open_cursor(uri)
        s.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
        # endpoints should still be in place
        self.assertEqual(cursor[1], self.value_z)
        self.assertEqual(cursor[2 + my_rle_size], self.value_z)

        for k in range(2, 2 + my_rle_size):
            if k in expected:
                self.assertEqual(cursor[k], expected[k])
            else:
                cursor.set_key(k)
                r = cursor.search()
                self.assertEqual(r, wiredtiger.WT_NOTFOUND)
        s.rollback_transaction()
        cursor.close()

    def test_rollback_to_stable25(self):
        # Create a table without logging.
        uri = "table:rollback_to_stable25"
        create_config = 'key_format=r,value_format=S'
        self.session.create(uri, create_config + ', log=(enabled=false)')

        # Pin oldest timestamp to 5.
        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5))

        # Start stable timestamp at 5.
        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(5))

        value_a = "aaaaa" * 10
        value_b = "bbbbb" * 10
        value_c = "ccccc" * 10

        s = self.conn.open_session()

        # Write the endpoints at time 5.
        cursor = s.open_cursor(uri)
        s.begin_transaction()
        cursor[1] = self.value_z
        cursor[2 + my_rle_size] = self.value_z
        s.commit_transaction('commit_timestamp=' + self.timestamp_str(5))
        self.evict(uri, s)
        cursor.close()

        # Do writes at time 10.
        expected = {}
        self.writes(uri, s, expected, self.type_10, self.write_10, value_a, 10)
        expected10 = expected.copy()

        # Evict at time 10 if requested.
        if self.evict_time == 10:
            self.evict(uri, s)

        # Do more writes at time 20.
        self.writes(uri, s, expected, self.type_20, self.write_20, value_b, 20)
        expected20 = expected.copy()

        # Evict at time 20 if requested.
        if self.evict_time == 20:
            self.evict(uri, s)

        # Do still more writes at time 30.
        self.writes(uri, s, expected, self.type_30, self.write_30, value_c, 30)
        expected30 = expected.copy()

        # Evict at time 30 if requested.
        if self.evict_time == 30:
            self.evict(uri, s)

        # Now roll back.
        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(self.rollback_time))
        self.conn.rollback_to_stable()

        # Writes committed after the rollback point are gone, so later reads
        # see the newest snapshot at or before the rollback time.
        if self.rollback_time < 20:
            expected20 = expected10
            expected30 = expected10
        elif self.rollback_time < 30:
            expected30 = expected20

        # Now make sure we see what we expect.
        self.check(uri, s, 10, expected10)
        self.check(uri, s, 20, expected20)
        self.check(uri, s, 30, expected30)