summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2018-11-01 11:13:55 +1100
committerLuke Chen <luke.chen@mongodb.com>2018-11-01 11:13:55 +1100
commit3d60283fad14b4c62274bbd4bc61a9b606929cdb (patch)
treee7268c77b1e35d2ad5c4008f2425253e3c49faec /src
parent8c1de7e08de30a38f3d878118248735e6e2ea72a (diff)
downloadmongo-3d60283fad14b4c62274bbd4bc61a9b606929cdb.tar.gz
Import wiredtiger: 45b751a54fb181d1995684f7c807bbbc142d3c90 from branch mongodb-3.6
ref: 0d4bc746f2..45b751a54f for: 3.6.9 WT-3276 Add recover=salvage to recover from a corrupted log file WT-3735 Add a workgen workload that generates a lot of page splits WT-3736 Add statistics to measure contention on lookaside cursor WT-3839 Document the undefined behavior when a range truncate overlaps with inserts WT-3856 Create a test that runs recovery to different points of time with schema operations WT-3879 Disallow checkpoint from evicting metadata pages WT-3894 Timestamp queue implementation and statistics improvements WT-3917 Enhance WT_CURSOR::reserve documentation around commit visibility WT-3943 Include full error message when a python test asserts WT-3955 Add verbose option to log more messages on error returns WT-3963 Add a schema intensive abort testing WT-3968 Use compression ratio to tune page sizes WT-4010 Simplify test/format timestamp handling. WT-4024 Fix a race between split and next/prev WT-4026 Add implementation for existing file extension configuration API WT-4048 Generalize timing_stress_for_test split functionality WT-4067 Enhance LSM to not pin as much history in cache WT-4090 Low priority reads WT-4101 Don't abort the eviction server during session verify when oldest_timestamp is held back WT-4104 Fix test/format failure during comparing data content with berkeley db WT-4111 Improve checkpoint scrubbing algorithm WT-4119 Avoid restarts updating / removing during a column store scan WT-4125 Ensure that subsequent checkpoints with stable timestamp don't read too much WT-4131 Rename lookaside to cache overflow WT-4133 Coverity 1393445, 1393446 Dereference before null check WT-4134 Rework assertion that we don't discard required history WT-4136 Add a new timing stress flag that yields during tree search WT-4138 Add an option to timeout waiting for space in the cache WT-4139 rename the cursor restart statistic to match implementation WT-4140 Cursor walk limits quick eviction page selection unnecessarily. WT-4141 Enhance checkpoint with timestamps to unblock eviction sooner WT-4143 Use WiredTiger.turtle.set if it exists but WiredTiger.turtle does not WT-4144 Fix rollback_to_stable with lookaside history WT-4145 Only include the checkpoint timestamp during checkpoints WT-4146 Coverity 1393639, unused variable WT-4147 Log recovery should not ignore corruption outside of log records in a log file WT-4152 Save return value for later comparison in transaction code WT-4154 Surface the oldest read timestamp WT-4156 Add new wiredtiger_salvage top level API WT-4160 Restore performance when timestamps are not in use WT-4163 Lint WT-4168 Update upgrading documentation for 3.1.0 release WT-4169 Fix wt verify dump-pages failure WT-4171 Enabling tree walk timing stress causes excessive slowdown WT-4172 Add diagnostic hazard pointer checks in more places before freeing refs WT-4174 Do not access the lookaside file in rollback_to_stable when running with in_memory=true WT-4176 Expose a WT_SESSION.query_timestamp method WT-4177 Backup cursor open should force a log file switch WT-4178 Fixes for wt_btree_immediately_durable needed for in-memory WT-4179 Expose WiredTiger crc32c functions WT-4182 Use conservative approach for log checksum errors WT-4183 Extend verbose option to log more messages on error returns WT-4185 Don't remove all lookaside entries when reading a page WT-4186 Log recovery should detect and report corruption within log records WT-4187 Coverity: unused value complaints WT-4188 Coverity: unchecked return value complaints WT-4189 Potential infinite loop in __async_flush_wait(). WT-4191 Fix Coverity static analysis errors WT-4193 test/format snapshot-isolation search mismatch WT-4194 Improve fairness of eviction with multiple tables WT-4195 When encountering an illegal value, log the value that failed WT-4196 Make log corruption checking work regardless of the machine byte order WT-4198 Some supported MongoDB architectures don't support crc32 hardware WT-4199 Fix an incorrect report of log corruption WT-4201 Fix Coverity static analysis issues WT-4206 Fix error handling in cursor close routines WT-4207 Coverity #1394567: null pointer dereference WT-4208 tree walks can be interrupted by locked internal pages WT-4210 schema abort child process failing prematurely WT-4211 Add automated test for long running prepared transactions WT-4212 Update lookaside schema to handle prepared transactions WT-4213 Rename lock statistics that have redundant or misleading text WT-4215 Allow recovery of backup without salvage WT-4216 Use separate counters for page_swap yield and sleep WT-4218 Change eviction to evict prepared updates WT-4225 Automate a backup test that simulates volume snapshot via dd WT-4226 test/format LSM configurations can misconfigure prepare and timestamps WT-4229 Lint WT-4231 Fix ctags index of functions with attributes WT-4233 Change log corruption errors to warnings and truncate log WT-4234 Remove documentation mention of legacy tool statlog.py WT-4235 Fix workgen tracking of table state across workloads WT-4239 Don't allow checkpoints to perform insert-splits in the tree WT-4241 GNU-stack section should never be conditionally compiled out WT-4242 New log file extension Python test failure WT-4246 Change transaction update list to support indirect references WT-4248 Fix checkpoints in schema_abort for slow machines WT-4249 Attempt to discard dirty page during verify operation WT-4251 Prepared updates cannot be discarded WT-4252 Btree debug functions can leak scratch buffers on error. WT-4253 Btree debug function to do blind reads doesn't handle row-store internal pages WT-4256 Loosen check during rollback_to_stable WT-4257 Don't assume timestamps from lookaside are aligned in memory WT-4259 Restore ref to the previous state rather than MEM when eviction fails WT-4261 Test salvage of out-of-sync metadata/turtle files WT-4262 Lock deleted children in eviction of internal pages WT-4263 Use the right tree when copying a key for a lookaside write WT-4264 Compaction can race with page modifications WT-4267 fixed-length column store operations can corrupt data WT-4268 Random abort should wait until record files exist before starting timer WT-4270 Add an operation field to know where threads hang WT-4272 Increase startup timeout to 30 seconds for slow I/O systems WT-4274 Fix memory leak in wt4156_metadata_salvage test WT-4277 Make truncate in column stores more efficient WT-4281 Shorten runtime of Python test suite WT-4282 Don't transition pages from limbo to mem unless required WT-4283 Restore WT_ERROR and use a corrupt flag WT-4284 Print a verbose message in recovery on error too WT-4285 Fix wt4156_metadata_salvage Coverity/lint complaints WT-4286 Column store should skip end-of-table checks if there's an exact match WT-4288 Don't let return value of closing conn overwrite WT_TRY_SALVAGE WT-4289 Update WT_DATA_CORRUPTION to WT_TRY_SALVAGE in test_txn19.py WT-4291 Fix test_txn19.py error detection by looking for WT_ERROR WT-4292 Add call to testutil_cleanup to avoid memory leak WT-4300 Setting the update timestamp can overwrite the WT_REF.addr field WT-4301 WT_CURSOR.reserve operations can leak memory when committed WT-4305 Add a gating variable for long running prepare support WT-4306 Fix mode if metadata pages need eviction WT-4308 Insert split during sync should not free blocks WT-4321 Disable the random direct I/O test WT-4325 Add a WiredTiger-local version of the qsort(3) call
Diffstat (limited to 'src')
-rw-r--r--src/third_party/wiredtiger/.gitignore4
-rw-r--r--src/third_party/wiredtiger/NEWS8
-rw-r--r--src/third_party/wiredtiger/README6
-rw-r--r--src/third_party/wiredtiger/RELEASE_INFO2
-rw-r--r--src/third_party/wiredtiger/bench/workgen/runner/compress_ratio.py126
-rw-r--r--src/third_party/wiredtiger/bench/workgen/runner/insert_stress.py66
-rw-r--r--src/third_party/wiredtiger/bench/workgen/runner/split_stress.py83
-rw-r--r--src/third_party/wiredtiger/bench/workgen/workgen.cxx106
-rw-r--r--src/third_party/wiredtiger/bench/workgen/workgen.h6
-rw-r--r--src/third_party/wiredtiger/bench/workgen/workgen_int.h4
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c4
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf.c60
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf.h2
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c30
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/options.m413
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/version-set.m44
-rw-r--r--src/third_party/wiredtiger/build_win/wiredtiger.def2
-rw-r--r--src/third_party/wiredtiger/build_win/wiredtiger_config.h3
-rw-r--r--src/third_party/wiredtiger/dist/api_data.py63
-rw-r--r--src/third_party/wiredtiger/dist/api_err.py5
-rw-r--r--src/third_party/wiredtiger/dist/log.py2
-rw-r--r--src/third_party/wiredtiger/dist/s_export.list2
-rw-r--r--src/third_party/wiredtiger/dist/s_funcs.list1
-rw-r--r--src/third_party/wiredtiger/dist/s_string.ok25
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_style6
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_tags2
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_void4
-rw-r--r--src/third_party/wiredtiger/dist/stat_data.py67
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_all.c5
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_thread.c2
-rw-r--r--src/third_party/wiredtiger/ext/collators/revint/revint_collator.c21
-rw-r--r--src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c9
-rw-r--r--src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c9
-rw-r--r--src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c9
-rw-r--r--src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c9
-rw-r--r--src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c8
-rw-r--r--src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c9
-rw-r--r--src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c8
-rw-r--r--src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c59
-rw-r--r--src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c12
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/lang/java/java_doc.i1
-rw-r--r--src/third_party/wiredtiger/src/async/async_api.c6
-rw-r--r--src/third_party/wiredtiger/src/async/async_worker.c4
-rw-r--r--src/third_party/wiredtiger/src/block/block_ckpt.c3
-rw-r--r--src/third_party/wiredtiger/src/block/block_ext.c6
-rw-r--r--src/third_party/wiredtiger/src/block/block_read.c54
-rw-r--r--src/third_party/wiredtiger/src/bloom/bloom.c11
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_compact.c44
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curnext.c13
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c6
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c76
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c99
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_delete.c15
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_discard.c22
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_handle.c100
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_io.c16
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ovfl.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_page.c14
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_random.c26
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_read.c103
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_rebalance.c6
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_slvg.c26
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c127
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_stat.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c33
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c358
-rw-r--r--src/third_party/wiredtiger/src/btree/col_modify.c85
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c10
-rw-r--r--src/third_party/wiredtiger/src/btree/row_modify.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/row_srch.c12
-rw-r--r--src/third_party/wiredtiger/src/cache/cache_las.c305
-rw-r--r--src/third_party/wiredtiger/src/checksum/arm64/crc32-arm64.c37
-rw-r--r--src/third_party/wiredtiger/src/checksum/power8/crc32.sx17
-rw-r--r--src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c35
-rw-r--r--src/third_party/wiredtiger/src/checksum/software/checksum.c6
-rw-r--r--src/third_party/wiredtiger/src/checksum/x86/crc32-x86.c50
-rw-r--r--src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c35
-rw-r--r--src/third_party/wiredtiger/src/checksum/zseries/crc32le-vx.sx19
-rw-r--r--src/third_party/wiredtiger/src/config/config_collapse.c2
-rw-r--r--src/third_party/wiredtiger/src/config/config_def.c305
-rw-r--r--src/third_party/wiredtiger/src/conn/api_strerror.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c118
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache.c4
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache_pool.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_ckpt.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c9
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_log.c16
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_reconfig.c3
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_stat.c10
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_sweep.c2
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_backup.c34
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_config.c11
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_ds.c13
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_dump.c9
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_file.c43
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_index.c10
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_join.c26
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_json.c4
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_log.c23
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_metadata.c18
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_stat.c11
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c26
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_table.c12
-rw-r--r--src/third_party/wiredtiger/src/docs/config-strings.dox2
-rw-r--r--src/third_party/wiredtiger/src/docs/error-handling.dox3
-rw-r--r--src/third_party/wiredtiger/src/docs/statistics.dox5
-rw-r--r--src/third_party/wiredtiger/src/docs/top/main.dox8
-rw-r--r--src/third_party/wiredtiger/src/docs/upgrading.dox82
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_file.c7
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c111
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_page.c120
-rw-r--r--src/third_party/wiredtiger/src/include/api.h3
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h25
-rw-r--r--src/third_party/wiredtiger/src/include/btree.h49
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i138
-rw-r--r--src/third_party/wiredtiger/src/include/cache.h7
-rw-r--r--src/third_party/wiredtiger/src/include/cache.i27
-rw-r--r--src/third_party/wiredtiger/src/include/cell.i6
-rw-r--r--src/third_party/wiredtiger/src/include/config.h48
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h108
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.h9
-rw-r--r--src/third_party/wiredtiger/src/include/error.h29
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h49
-rw-r--r--src/third_party/wiredtiger/src/include/extern_posix.h3
-rw-r--r--src/third_party/wiredtiger/src/include/extern_win.h3
-rw-r--r--src/third_party/wiredtiger/src/include/log.h12
-rw-r--r--src/third_party/wiredtiger/src/include/lsm.h19
-rw-r--r--src/third_party/wiredtiger/src/include/meta.h1
-rw-r--r--src/third_party/wiredtiger/src/include/misc.h86
-rw-r--r--src/third_party/wiredtiger/src/include/misc.i34
-rw-r--r--src/third_party/wiredtiger/src/include/os_fhandle.i4
-rw-r--r--src/third_party/wiredtiger/src/include/packing.i4
-rw-r--r--src/third_party/wiredtiger/src/include/session.h6
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h21
-rw-r--r--src/third_party/wiredtiger/src/include/txn.h35
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i270
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in912
-rw-r--r--src/third_party/wiredtiger/src/log/log.c486
-rw-r--r--src/third_party/wiredtiger/src/log/log_auto.c2
-rw-r--r--src/third_party/wiredtiger/src/log/log_slot.c2
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_cursor.c33
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_manager.c15
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_tree.c10
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_work_unit.c94
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_worker.c14
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_ckpt.c2
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_table.c24
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_track.c15
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_turtle.c14
-rw-r--r--src/third_party/wiredtiger/src/optrack/optrack.c12
-rw-r--r--src/third_party/wiredtiger/src/os_common/filename.c2
-rw-r--r--src/third_party/wiredtiger/src/os_common/os_abort.c1
-rw-r--r--src/third_party/wiredtiger/src/os_common/os_errno.c15
-rw-r--r--src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c5
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_dir.c7
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_fallocate.c12
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_fs.c10
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_map.c2
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_thread.c11
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_time.c14
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_fs.c4
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_thread.c10
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_time.c17
-rw-r--r--src/third_party/wiredtiger/src/packing/pack_stream.c16
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c470
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_alter.c2
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_list.c2
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_plan.c2
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_util.c2
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c59
-rw-r--r--src/third_party/wiredtiger/src/session/session_dhandle.c4
-rw-r--r--src/third_party/wiredtiger/src/support/err.c131
-rw-r--r--src/third_party/wiredtiger/src/support/global.c2
-rw-r--r--src/third_party/wiredtiger/src/support/hazard.c29
-rw-r--r--src/third_party/wiredtiger/src/support/huffman.c4
-rw-r--r--src/third_party/wiredtiger/src/support/mtx_rw.c6
-rw-r--r--src/third_party/wiredtiger/src/support/scratch.c72
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c116
-rw-r--r--src/third_party/wiredtiger/src/support/thread_group.c2
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c206
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c149
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_log.c56
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c12
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c142
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_timestamp.c385
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_alter.c3
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_load.c55
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_load_json.c18
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_loadtext.c10
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_main.c18
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_read.c8
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_write.c8
-rw-r--r--src/third_party/wiredtiger/test/bloom/test_bloom.c3
-rw-r--r--src/third_party/wiredtiger/test/checkpoint/checkpointer.c2
-rw-r--r--src/third_party/wiredtiger/test/checkpoint/workers.c2
-rw-r--r--src/third_party/wiredtiger/test/csuite/Makefile.am12
-rw-r--r--src/third_party/wiredtiger/test/csuite/random_abort/main.c30
-rw-r--r--src/third_party/wiredtiger/test/csuite/random_directio/main.c1291
-rwxr-xr-xsrc/third_party/wiredtiger/test/csuite/random_directio/smoke.sh34
-rw-r--r--src/third_party/wiredtiger/test/csuite/schema_abort/main.c1355
-rwxr-xr-xsrc/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh12
-rw-r--r--src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c99
-rwxr-xr-xsrc/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh2
-rw-r--r--src/third_party/wiredtiger/test/csuite/truncated_log/main.c1
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c6
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c7
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt4117_checksum/main.c36
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c755
-rw-r--r--src/third_party/wiredtiger/test/cursor_order/cursor_order.c1
-rw-r--r--src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c2
-rw-r--r--src/third_party/wiredtiger/test/fops/fops.c2
-rw-r--r--src/third_party/wiredtiger/test/fops/t.c1
-rw-r--r--src/third_party/wiredtiger/test/format/config.c10
-rw-r--r--src/third_party/wiredtiger/test/format/config.h22
-rw-r--r--src/third_party/wiredtiger/test/format/format.h39
-rw-r--r--src/third_party/wiredtiger/test/format/lrt.c17
-rw-r--r--src/third_party/wiredtiger/test/format/ops.c171
-rw-r--r--src/third_party/wiredtiger/test/format/t.c5
-rw-r--r--src/third_party/wiredtiger/test/format/util.c81
-rw-r--r--src/third_party/wiredtiger/test/format/wts.c14
-rw-r--r--src/third_party/wiredtiger/test/huge/huge.c1
-rw-r--r--src/third_party/wiredtiger/test/manydbs/manydbs.c1
-rw-r--r--src/third_party/wiredtiger/test/readonly/readonly.c1
-rw-r--r--src/third_party/wiredtiger/test/salvage/salvage.c3
-rw-r--r--src/third_party/wiredtiger/test/suite/run.py7
-rwxr-xr-x[-rw-r--r--]src/third_party/wiredtiger/test/suite/suite_subprocess.py51
-rw-r--r--src/third_party/wiredtiger/test/suite/test_backup02.py10
-rw-r--r--src/third_party/wiredtiger/test/suite/test_backup03.py15
-rw-r--r--src/third_party/wiredtiger/test/suite/test_backup09.py145
-rw-r--r--src/third_party/wiredtiger/test/suite/test_bug010.py2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_bug019.py8
-rw-r--r--src/third_party/wiredtiger/test/suite/test_bug021.py212
-rw-r--r--src/third_party/wiredtiger/test/suite/test_config07.py105
-rw-r--r--src/third_party/wiredtiger/test/suite/test_cursor12.py10
-rw-r--r--src/third_party/wiredtiger/test/suite/test_cursor15.py70
-rw-r--r--src/third_party/wiredtiger/test/suite/test_intpack.py5
-rw-r--r--src/third_party/wiredtiger/test/suite/test_las01.py2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_las03.py105
-rw-r--r--src/third_party/wiredtiger/test/suite/test_prepare_lookaside01.py137
-rw-r--r--src/third_party/wiredtiger/test/suite/test_schema08.py189
-rw-r--r--src/third_party/wiredtiger/test/suite/test_sweep01.py6
-rw-r--r--src/third_party/wiredtiger/test/suite/test_sweep03.py43
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp04.py10
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp06.py10
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp07.py4
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp09.py4
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp13.py151
-rw-r--r--src/third_party/wiredtiger/test/suite/test_txn02.py15
-rwxr-xr-xsrc/third_party/wiredtiger/test/suite/test_txn19.py352
-rw-r--r--src/third_party/wiredtiger/test/suite/wtscenario.py8
-rw-r--r--src/third_party/wiredtiger/test/suite/wttest.py16
-rw-r--r--src/third_party/wiredtiger/test/thread/rw.c2
-rw-r--r--src/third_party/wiredtiger/test/thread/t.c1
-rw-r--r--src/third_party/wiredtiger/test/utility/misc.c36
-rw-r--r--src/third_party/wiredtiger/test/utility/test_util.h3
-rw-r--r--src/third_party/wiredtiger/test/windows/windows_shim.c24
-rw-r--r--src/third_party/wiredtiger/test/windows/windows_shim.h3
259 files changed, 10699 insertions, 3523 deletions
diff --git a/src/third_party/wiredtiger/.gitignore b/src/third_party/wiredtiger/.gitignore
index 49e737fe301..a27c74ff155 100644
--- a/src/third_party/wiredtiger/.gitignore
+++ b/src/third_party/wiredtiger/.gitignore
@@ -104,6 +104,7 @@ _wiredtiger.pyd
**/test/checkpoint/t
**/test/csuite/test_random_abort
**/test/csuite/test_rwlock
+**/test/csuite/test_schema_abort
**/test/csuite/test_scope
**/test/csuite/test_timestamp_abort
**/test/csuite/test_truncated_log
@@ -126,6 +127,9 @@ _wiredtiger.pyd
**/test/csuite/test_wt3338_partial_update
**/test/csuite/test_wt3363_checkpoint_op_races
**/test/csuite/test_wt3874_pad_byte_collator
+**/test/csuite/test_wt4105_large_doc_small_upd
+**/test/csuite/test_wt4117_checksum
+**/test/csuite/test_wt4156_metadata_salvage
**/test/cursor_order/cursor_order
**/test/fops/t
**/test/format/s_dumpcmp
diff --git a/src/third_party/wiredtiger/NEWS b/src/third_party/wiredtiger/NEWS
index 7bf3b0e7edb..1e821835386 100644
--- a/src/third_party/wiredtiger/NEWS
+++ b/src/third_party/wiredtiger/NEWS
@@ -1,6 +1,14 @@
Ticket reference tags refer to tickets in the MongoDB JIRA tracking system:
https://jira.mongodb.org
+WiredTiger release 3.1.0, 2018-07-12
+------------------------------------
+
+See the upgrading documentation for details of API and behavior changes.
+
+See JIRA changelog for a full listing:
+https://jira.mongodb.org/projects/WT/versions/19708
+
WiredTiger release 3.0.0, 2018-01-08
------------------------------------
diff --git a/src/third_party/wiredtiger/README b/src/third_party/wiredtiger/README
index f21ff213a7c..fd8757621bf 100644
--- a/src/third_party/wiredtiger/README
+++ b/src/third_party/wiredtiger/README
@@ -1,6 +1,6 @@
-WiredTiger 3.1.0: (April 23, 2018)
+WiredTiger 3.1.1: (July 12, 2018)
-This is version 3.1.0 of WiredTiger.
+This is version 3.1.1 of WiredTiger.
WiredTiger release packages and documentation can be found at:
@@ -8,7 +8,7 @@ WiredTiger release packages and documentation can be found at:
The documentation for this specific release can be found at:
- http://source.wiredtiger.com/3.1.0/index.html
+ http://source.wiredtiger.com/3.1.1/index.html
The WiredTiger source code can be found at:
diff --git a/src/third_party/wiredtiger/RELEASE_INFO b/src/third_party/wiredtiger/RELEASE_INFO
index ee25ecd6c56..2014ba3ee74 100644
--- a/src/third_party/wiredtiger/RELEASE_INFO
+++ b/src/third_party/wiredtiger/RELEASE_INFO
@@ -1,6 +1,6 @@
WIREDTIGER_VERSION_MAJOR=3
WIREDTIGER_VERSION_MINOR=1
-WIREDTIGER_VERSION_PATCH=0
+WIREDTIGER_VERSION_PATCH=1
WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH"
WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"`
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/compress_ratio.py b/src/third_party/wiredtiger/bench/workgen/runner/compress_ratio.py
new file mode 100644
index 00000000000..2c5552bfa5d
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/workgen/runner/compress_ratio.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+# Drive a constant high workload through, even if WiredTiger isn't keeping
+# up by dividing the workload across a lot of threads. This needs to be
+# tuned to the particular machine so the workload is close to capacity in the
+# steady state, but not overwhelming.
+#
+################
+# Note: This looks similar to multi_btree_heavy_stress.py with values altered
+# for run time, #ops, #threads, #throttle to maintain dirty cache around the
+# eviction target of 5% on the AWS perf machines. These values being machine
+# dependant might need to be altered as per the machine this workload gets
+# run on.
+#
+from runner import *
+from wiredtiger import *
+from workgen import *
+
+def op_append(ops, op):
+ if ops == None:
+ ops = op
+ else:
+ ops += op
+ return ops
+
+def make_op(optype, table, key, value = None):
+ if value == None:
+ return Operation(optype, table, key)
+ else:
+ return Operation(optype, table, key, value)
+
+logkey = Key(Key.KEYGEN_APPEND, 8) ## should be 8 bytes format 'Q'
+def operations(optype, tables, key, value = None, ops_per_txn = 0, logtable = None):
+ txn_list = []
+ ops = None
+ nops = 0
+ for table in tables:
+ ops = op_append(ops, make_op(optype, table, key, value))
+ if logtable != None:
+ ops = op_append(ops, make_op(optype, logtable, logkey, value))
+ nops += 1
+ if ops_per_txn > 0 and nops % ops_per_txn == 0:
+ txn_list.append(txn(ops))
+ ops = None
+ if ops_per_txn > 0:
+ if ops != None:
+ txn_list.append(txn(ops))
+ ops = None
+ for t in txn_list:
+ ops = op_append(ops, t)
+ return ops
+
+context = Context()
+conn_config="create,cache_size=2GB,session_max=1000,eviction=(threads_min=4,threads_max=4),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=false,checkpoint=(wait=20),statistics=(fast),statistics_log=(json,wait=1)"
+table_config="allocation_size=4k,memory_page_max=10MB,prefix_compression=false,split_pct=90,leaf_page_max=32k,internal_page_max=16k,type=file"
+compression_opts = {
+ "none" : "block_compressor=none",
+ "zlib_noraw" : "block_compressor=zlib-noraw",
+ "zlib_noraw_onepage" : "block_compressor=zlib-noraw,memory_page_image_max=32k",
+ "zlib_noraw_tenpage" : "block_compressor=zlib-noraw,memory_page_image_max=320k",
+ "zlib_raw" : "block_compressor=zlib",
+ "snappy" : "block_compressor=snappy"
+}
+#conn_config += extensions_config(['compressors/snappy'])
+conn = wiredtiger_open("WT_TEST", conn_config)
+s = conn.open_session()
+
+tables = []
+for name_ext, compress_config in compression_opts.iteritems():
+ tname = "table:test_" + name_ext
+ s.create(tname, 'key_format=S,value_format=S,' + table_config + "," + compress_config)
+ table = Table(tname)
+ table.options.value_compressibility = 70
+ tables.append(table)
+
+icount=500000
+ins_ops = operations(Operation.OP_INSERT, tables, Key(Key.KEYGEN_APPEND, 20), Value(500))
+thread = Thread(ins_ops * icount)
+pop_workload = Workload(context, thread)
+print('populate:')
+pop_workload.run(conn)
+
+ins_ops = operations(Operation.OP_INSERT, tables, Key(Key.KEYGEN_APPEND, 20), Value(500), 0)
+upd_ops = operations(Operation.OP_UPDATE, tables, Key(Key.KEYGEN_UNIFORM, 20), Value(500), 0)
+
+ins_thread = Thread(ins_ops)
+upd_thread = Thread(upd_ops)
+ins_thread.options.throttle = 1000
+ins_thread.options.name = "Insert"
+upd_thread.options.throttle = 1000
+upd_thread.options.name = "Update"
+threads = ins_thread * 2 + upd_thread * 10
+workload = Workload(context, threads)
+workload.options.run_time = 60
+workload.options.report_interval = 1
+workload.options.sample_interval = 1
+workload.options.sample_rate = 1
+print('Update heavy workload:')
+workload.run(conn)
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/insert_stress.py b/src/third_party/wiredtiger/bench/workgen/runner/insert_stress.py
new file mode 100644
index 00000000000..be33396db70
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/workgen/runner/insert_stress.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+from runner import *
+from wiredtiger import *
+from workgen import *
+
+context = Context()
+conn_config="create,cache_size=4GB,session_max=1000,eviction=(threads_min=4,threads_max=8),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=10),statistics=(fast),statistics_log=(json,wait=1)"
+table_config="allocation_size=4k,memory_page_max=10MB,prefix_compression=false,split_pct=90,leaf_page_max=32k,internal_page_max=16k,type=file,block_compressor=snappy"
+conn = wiredtiger_open("WT_TEST", conn_config)
+s = conn.open_session()
+tname = "file:test.wt"
+table_config="key_format=S,value_format=S,allocation_size=4k,memory_page_max=10MB,prefix_compression=false,split_pct=90,leaf_page_max=32k,leaf_value_max=64MB,internal_page_max=16k,type=file,block_compressor=snappy"
+s.create(tname, table_config)
+table = Table(tname)
+table.options.key_size = 20
+table.options.value_size = 130 * 1024
+table.options.range = 100000000 # 100 million
+
+op = Operation(Operation.OP_INSERT, table)
+thread = Thread(op * 500)
+pop_workload = Workload(context, thread)
+print('populate:')
+pop_workload.run(conn)
+
+op = Operation(Operation.OP_INSERT, table, Key(Key.KEYGEN_UNIFORM, 10), Value(130 * 1024))
+op2 = Operation(Operation.OP_INSERT, table, Key(Key.KEYGEN_UNIFORM, 10), Value(100))
+op3 = Operation(Operation.OP_INSERT, table, Key(Key.KEYGEN_APPEND, 10), Value(130 * 1024))
+t = Thread(op + 10 * op2 + op3)
+
+read_op = Operation(Operation.OP_SEARCH, table, Key(Key.KEYGEN_UNIFORM, 10))
+read_txn_ops = op_group_transaction(read_op, 100, "")
+read_thread = Thread(read_txn_ops)
+
+workload = Workload(context, t * 8 + read_thread)
+workload.options.run_time = 240
+workload.options.report_interval = 5
+print('workload:')
+workload.run(conn)
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/split_stress.py b/src/third_party/wiredtiger/bench/workgen/runner/split_stress.py
new file mode 100644
index 00000000000..57aa8040d5d
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/workgen/runner/split_stress.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+# A workload with small cache, small internal and leaf page sizes, faster splits
+# and multiple threads inserting keys in random order. It stresses the page
+# splits in order to catch split races.
+#
+from runner import *
+from wiredtiger import *
+from workgen import *
+
+context = Context()
+# Connection configuration.
+conn_config = "cache_size=100MB,log=(enabled=false),statistics=[fast],statistics_log=(wait=1,json=false)"
+conn = wiredtiger_open("WT_TEST", "create," + conn_config)
+s = conn.open_session("")
+
+# Table configuration.
+table_config = "leaf_page_max=8k,internal_page_max=8k,leaf_item_max=1433,internal_item_max=3100,type=file,memory_page_max=1MB,split_deepen_min_child=100"
+tables = []
+table_count = 3
+for i in range(0, table_count):
+ tname = "file:test" + str(i)
+ table = Table(tname)
+ s.create(tname, 'key_format=S,value_format=S,' + table_config)
+ table.options.key_size = 64
+ table.options.value_size = 200
+ table.options.range = 100000000 # 100 million
+ tables.append(table)
+
+# Populate phase.
+populate_threads = 1
+icount = 50000
+# There are multiple tables to be filled during populate,
+# the icount is split between them all.
+pop_ops = Operation(Operation.OP_INSERT, tables[0])
+pop_ops = op_multi_table(pop_ops, tables)
+nops_per_thread = icount / (populate_threads * table_count)
+pop_thread = Thread(pop_ops * nops_per_thread)
+pop_workload = Workload(context, populate_threads * pop_thread)
+print('populate:')
+pop_workload.run(conn)
+
+# Run phase.
+ops = Operation(Operation.OP_INSERT, tables[0])
+ops = op_multi_table(ops, tables, False)
+thread0 = Thread(ops)
+
+workload = Workload(context, 20 * thread0)
+workload.options.report_interval=5
+workload.options.run_time=300
+print('Split stress workload running...')
+workload.run(conn)
+
+latency_filename = "WT_TEST/latency.out"
+latency.workload_latency(workload, latency_filename)
+conn.close()
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.cxx b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
index f89356b836a..9ae63682f9c 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.cxx
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
@@ -55,11 +55,11 @@ extern "C" {
#define THROTTLE_PER_SEC 20 // times per sec we will throttle
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) < (b) ? (b) : (a))
-#define TIMESPEC_DOUBLE(ts) ((double)(ts).tv_sec + ts.tv_nsec * 0.000000001)
-#define PCT(n, total) ((total) == 0 ? 0 : ((n) * 100) / (total))
-#define OPS_PER_SEC(ops, ts) (int) ((ts) == 0 ? 0.0 : \
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+#define TIMESPEC_DOUBLE(ts) ((double)(ts).tv_sec + ts.tv_nsec * 0.000000001)
+#define PCT(n, total) ((total) == 0 ? 0 : ((n) * 100) / (total))
+#define OPS_PER_SEC(ops, ts) (int) ((ts) == 0 ? 0.0 : \
(ops) / TIMESPEC_DOUBLE(ts))
// Get the value of a STL container, even if it is not present
@@ -254,12 +254,11 @@ ContextInternal::~ContextInternal() {
}
int ContextInternal::create_all() {
- if (_runtime_alloced != _tint_last) {
+ if (_runtime_alloced < _tint_last) {
// The array references are 1-based, we'll waste one entry.
TableRuntime *new_table_runtime = new TableRuntime[_tint_last + 1];
- memcpy(new_table_runtime, _table_runtime, sizeof(uint64_t) * _runtime_alloced);
- memset(&new_table_runtime[_runtime_alloced], 0,
- sizeof(uint64_t) * (_tint_last - _runtime_alloced + 1));
+ for (int i = 0; i < _runtime_alloced; i++)
+ new_table_runtime[i + 1] = _table_runtime[i + 1];
delete _table_runtime;
_table_runtime = new_table_runtime;
_runtime_alloced = _tint_last;
@@ -345,7 +344,7 @@ int Monitor::run() {
<< std::endl;
if (_json != NULL) {
-#define WORKGEN_TIMESTAMP_JSON "%Y-%m-%dT%H:%M:%S.000Z"
+#define WORKGEN_TIMESTAMP_JSON "%Y-%m-%dT%H:%M:%S.000Z"
(void)strftime(time_buf, sizeof(time_buf),
WORKGEN_TIMESTAMP_JSON, tm);
@@ -598,7 +597,7 @@ void ThreadRunner::op_create_all(Operation *op, size_t &keysize,
}
-#define PARETO_SHAPE 1.5
+#define PARETO_SHAPE 1.5
// Return a value within the interval [ 0, recno_max )
// that is weighted toward lower numbers with pareto_param at 0 (the minimum),
@@ -653,7 +652,7 @@ uint64_t ThreadRunner::op_get_key_recno(Operation *op, uint64_t range,
if (recno_count == 0)
// The file has no entries, returning 0 forces a WT_NOTFOUND return.
return (0);
- rval = workgen_random(_rand_state);
+ rval = random_value();
if (op->_key._keytype == Key::KEYGEN_PARETO)
rval = pareto_calculation(rval, recno_count, op->_key._pareto);
return (rval % recno_count + 1); // recnos are one-based.
@@ -742,13 +741,12 @@ int ThreadRunner::op_run(Operation *op) {
_in_transaction = true;
}
if (op->_optype != Operation::OP_NONE) {
- op->kv_gen(true, 0, recno, _keybuf);
+ op->kv_gen(this, true, 100, recno, _keybuf);
cursor->set_key(cursor, _keybuf);
if (OP_HAS_VALUE(op)) {
- uint32_t r = 0;
- if (op->_table.options.random_value)
- r = workgen_random(_rand_state);
- op->kv_gen(false, r, recno, _valuebuf);
+ uint64_t compressibility = op->_table.options.random_value ?
+ 0 : op->_table.options.value_compressibility;
+ op->kv_gen(this, false, compressibility, recno, _valuebuf);
cursor->set_value(cursor, _valuebuf);
}
switch (op->_optype) {
@@ -806,6 +804,18 @@ std::string ThreadRunner::get_debug() {
}
#endif
+uint32_t ThreadRunner::random_value() {
+ return (workgen_random(_rand_state));
+}
+
+// Generate a random 32-bit value then return a float value equally distributed
+// between -1.0 and 1.0.
+float ThreadRunner::random_signed() {
+ uint32_t r = random_value();
+ int sign = ((r & 0x1) == 0 ? 1 : -1);
+ return (((float)r * sign) / UINT32_MAX);
+}
+
Throttle::Throttle(ThreadRunner &runner, double throttle,
double throttle_burst) : _runner(runner), _throttle(throttle),
_burst(throttle_burst), _next_div(), _ops_delta(0), _ops_prev(0),
@@ -817,13 +827,6 @@ Throttle::Throttle(ThreadRunner &runner, double throttle,
Throttle::~Throttle() {}
-// Given a random 32-bit value, return a float value equally distributed
-// between -1.0 and 1.0.
-static float rand_signed(uint32_t r) {
- int sign = ((r & 0x1) == 0 ? 1 : -1);
- return (((float)r * sign) / UINT32_MAX);
-}
-
// Each time throttle is called, we sleep and return a number of operations to
// perform next. To implement this we keep a time calculation in _next_div set
// initially to the current time + 1/THROTTLE_PER_SEC. Each call to throttle
@@ -851,8 +854,7 @@ int Throttle::throttle(uint64_t op_count, uint64_t *op_limit) {
_ops_delta += (op_count - _ops_prev);
if (now < _next_div) {
sleep_ms = ts_ms(_next_div - now);
- sleep_ms += (_ms_per_div * _burst *
- rand_signed(workgen_random(_runner._rand_state)));
+ sleep_ms += (_ms_per_div * _burst * _runner.random_signed());
if (sleep_ms > 0) {
DEBUG_CAPTURE(_runner, ", sleep=" << sleep_ms);
usleep((useconds_t)ms_to_us(sleep_ms));
@@ -1066,10 +1068,6 @@ void Operation::kv_compute_max(bool iskey, bool has_random) {
if (has_random) {
if (iskey)
THROW("Random keys not allowed");
- size -= RANDOMIZER_SIZE;
- if (size < 1)
- THROW("Value.size with random values too small for table '"
- << _table._uri << "'");
}
if (size > 1)
@@ -1096,8 +1094,8 @@ void Operation::kv_size_buffer(bool iskey, size_t &maxsize) const {
}
}
-void Operation::kv_gen(bool iskey, uint32_t randomizer, uint64_t n,
- char *result) const {
+void Operation::kv_gen(ThreadRunner *runner, bool iskey,
+ uint64_t compressibility, uint64_t n, char *result) const {
uint64_t max;
int size;
@@ -1106,13 +1104,40 @@ void Operation::kv_gen(bool iskey, uint32_t randomizer, uint64_t n,
if (n > max)
THROW((iskey ? "Key" : "Value") << " (" << n
<< ") too large for size (" << size << ")");
- if (randomizer != 0) {
- randomizer %= 1000;
- snprintf(result, 6, ":%3.3d:", randomizer);
- n -= RANDOMIZER_SIZE;
- result += RANDOMIZER_SIZE;
- }
+ /* Setup the buffer, defaulting to zero filled. */
workgen_u64_to_string_zf(n, result, size);
+
+ /*
+ * Compressibility is a percentage, 100 is all zeroes, it applies to the
+ * proportion of the value that can't be used for the identifier.
+ */
+ if (size > 20 && compressibility < 100) {
+ static const char alphanum[] =
+ "0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz";
+ /*
+ * The random length is the proportion of the string that should not
+ * be compressible. As an example a compressibility of 25 in a value
+ * of length 100 should be:
+ * 100 - ((100 * 25) / 100) = 75
+ * That means that 75% of the string will be random numbers, and 25
+ * will be easily compressible zero-fill.
+ */
+ uint64_t random_len = size - ((size * compressibility) / 100);
+
+ /* Never overwrite the record number identifier */
+ if (random_len > size - 20)
+ random_len = size - 20;
+
+ for (int i = 0; i < random_len; ++i)
+ /*
+ * TODO: It'd be nice to use workgen_rand here, but this class
+ * is without the context of a runner thread, so it's not easy
+ * to get access to a state.
+ */
+ result[i] = alphanum[runner->random_value() % (sizeof(alphanum) - 1)];
+ }
}
void Operation::size_check() const {
@@ -1436,18 +1461,21 @@ void Stats::track_latency(bool latency) {
}
TableOptions::TableOptions() : key_size(0), value_size(0),
- random_value(false), range(0), _options() {
+ value_compressibility(100), random_value(false), range(0), _options() {
_options.add_int("key_size", key_size,
"default size of the key, unless overridden by Key.size");
_options.add_int("value_size", value_size,
"default size of the value, unless overridden by Value.size");
_options.add_bool("random_value", random_value,
"generate random content for the value");
+ _options.add_bool("value_compressibility", value_compressibility,
+ "How compressible the generated value should be");
_options.add_int("range", range,
"if zero, keys are inserted at the end and reads/updates are in the current range, if non-zero, inserts/reads/updates are at a random key between 0 and the given range");
}
TableOptions::TableOptions(const TableOptions &other) :
key_size(other.key_size), value_size(other.value_size),
+ value_compressibility(other.value_compressibility),
random_value(other.random_value), range(other.range),
_options(other._options) {}
TableOptions::~TableOptions() {}
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.h b/src/third_party/wiredtiger/bench/workgen/workgen.h
index dc15ab06bf8..7de03a90f17 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.h
@@ -35,6 +35,7 @@ namespace workgen {
struct ContextInternal;
struct OperationInternal;
struct TableInternal;
+struct ThreadRunner;
struct Thread;
struct Transaction;
@@ -171,6 +172,7 @@ struct Context {
struct TableOptions {
int key_size;
int value_size;
+ uint64_t value_compressibility;
bool random_value;
int range;
@@ -308,8 +310,8 @@ struct Operation {
void create_all();
void get_static_counts(Stats &stats, int multiplier);
void kv_compute_max(bool iskey, bool has_random);
- void kv_gen(bool iskey, uint32_t randomizer, uint64_t n,
- char *result) const;
+ void kv_gen(ThreadRunner *runner, bool iskey, uint64_t compressibility,
+ uint64_t n, char *result) const;
void kv_size_buffer(bool iskey, size_t &size) const;
void size_check() const;
#endif
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen_int.h b/src/third_party/wiredtiger/bench/workgen/workgen_int.h
index 33a8a8f492f..c38f709efa1 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen_int.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen_int.h
@@ -36,8 +36,6 @@ extern "C" {
}
#endif
-#define RANDOMIZER_SIZE 5 /* ":000:" prefix */
-
namespace workgen {
// A 'tint' or ('table integer') is a unique small value integer
@@ -131,6 +129,8 @@ struct ThreadRunner {
uint64_t op_get_key_recno(Operation *, uint64_t range, tint_t tint);
void op_get_static_counts(Operation *, Stats &, int);
int op_run(Operation *);
+ float random_signed();
+ uint32_t random_value();
#ifdef _DEBUG
std::stringstream _debug_messages;
diff --git a/src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c b/src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c
index f84e9ddaed5..baab2177507 100644
--- a/src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c
+++ b/src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c
@@ -129,7 +129,7 @@ cycle_idle_tables(void *arg)
session, uri, "force,checkpoint_wait=false")) == EBUSY)
__wt_sleep(1, 0);
- if (ret != 0 && ret != EBUSY) {
+ if (ret != 0) {
lprintf(wtperf, ret, 0,
"Table drop failed in cycle_idle_tables.");
wtperf->error = true;
@@ -178,5 +178,5 @@ stop_idle_table_cycle(WTPERF *wtperf, wt_thread_t idle_table_cycle_thread)
return;
wtperf->idle_cycle_run = false;
- testutil_check(__wt_thread_join(NULL, idle_table_cycle_thread));
+ testutil_check(__wt_thread_join(NULL, &idle_table_cycle_thread));
}
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
index 4adb3db3c6c..047ce549746 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -473,44 +473,31 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor, int64_t read_range)
/* pre_load_data --
* Pull everything into cache before starting the workload phase.
*/
-static int
+static void
pre_load_data(WTPERF *wtperf)
{
CONFIG_OPTS *opts;
WT_CONNECTION *conn;
WT_CURSOR *cursor;
WT_SESSION *session;
- char *key;
- int ret;
size_t i;
+ int ret;
+ char *key;
opts = wtperf->opts;
conn = wtperf->conn;
- if ((ret = conn->open_session(
- conn, NULL, opts->sess_config, &session)) != 0) {
- lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session");
- goto err;
- }
+ testutil_check(conn->open_session(
+ conn, NULL, opts->sess_config, &session));
for (i = 0; i < opts->table_count; i++) {
- if ((ret = session->open_cursor(session,
- wtperf->uris[i], NULL, NULL, &cursor)) != 0) {
- lprintf(wtperf, ret, 0,
- "worker: WT_SESSION.open_cursor: %s",
- wtperf->uris[i]);
- goto err;
- }
- while (cursor->next(cursor) == 0)
- if ((ret = cursor->get_key(cursor, &key)) != 0)
- goto err;
- if ((ret = cursor->close(cursor)) != 0)
- goto err;
- }
- if ((ret = session->close(session, NULL)) != 0)
- goto err;
- if (ret != 0)
-err: lprintf(wtperf, ret, 0, "Pre-workload traverse error");
- return (ret);
+ testutil_check(session->open_cursor(
+ session, wtperf->uris[i], NULL, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0)
+ testutil_check(cursor->get_key(cursor, &key));
+ testutil_assert(ret == WT_NOTFOUND);
+ testutil_check(cursor->close(cursor));
+ }
+ testutil_check(session->close(session, NULL));
}
static WT_THREAD_RET
@@ -608,8 +595,7 @@ worker(void *arg)
/* Setup for truncate */
if (workload->truncate != 0)
- if ((ret = setup_truncate(wtperf, thread, session)) != 0)
- goto err;
+ setup_truncate(wtperf, thread, session);
key_buf = thread->key_buf;
value_buf = thread->value_buf;
@@ -1268,7 +1254,7 @@ static WT_THREAD_RET
monitor(void *arg)
{
struct timespec t;
- struct tm *tm, _tm;
+ struct tm localt;
CONFIG_OPTS *opts;
FILE *fp;
WTPERF *wtperf;
@@ -1337,8 +1323,9 @@ monitor(void *arg)
continue;
__wt_epoch(NULL, &t);
- tm = localtime_r(&t.tv_sec, &_tm);
- (void)strftime(buf, sizeof(buf), "%b %d %H:%M:%S", tm);
+ testutil_check(__wt_localtime(NULL, &t.tv_sec, &localt));
+ testutil_assert(
+ strftime(buf, sizeof(buf), "%b %d %H:%M:%S", &localt) != 0);
reads = sum_read_ops(wtperf);
inserts = sum_insert_ops(wtperf);
@@ -2197,7 +2184,7 @@ start_all_runs(WTPERF *wtperf)
/* Wait for threads to finish. */
for (i = 0; i < opts->database_count; i++)
- testutil_check(__wt_thread_join(NULL, threads[i]));
+ testutil_check(__wt_thread_join(NULL, &threads[i]));
for (i = 0; i < opts->database_count && wtperfs[i] != NULL; i++) {
wtperf_free(wtperfs[i]);
@@ -2286,8 +2273,9 @@ start_run(WTPERF *wtperf)
start_threads(wtperf, NULL, wtperf->ckptthreads,
opts->checkpoint_threads, checkpoint_worker);
}
- if (opts->pre_load_data && (ret = pre_load_data(wtperf)) != 0)
- goto err;
+ if (opts->pre_load_data)
+ pre_load_data(wtperf);
+
/* Execute the workload. */
if ((ret = execute_workload(wtperf)) != 0)
goto err;
@@ -2341,7 +2329,7 @@ err: if (ret == 0)
stop_threads(1, wtperf->ckptthreads);
if (monitor_created != 0)
- testutil_check(__wt_thread_join(NULL, monitor_thread));
+ testutil_check(__wt_thread_join(NULL, &monitor_thread));
if (wtperf->conn != NULL && opts->close_conn &&
(t_ret = wtperf->conn->close(wtperf->conn, NULL)) != 0) {
@@ -2761,7 +2749,7 @@ stop_threads(u_int num, WTPERF_THREAD *threads)
return;
for (i = 0; i < num; ++i, ++threads) {
- testutil_check(__wt_thread_join(NULL, threads->handle));
+ testutil_check(__wt_thread_join(NULL, &threads->handle));
free(threads->key_buf);
threads->key_buf = NULL;
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
index 7fb370e0b5c..5efbe5f6e13 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
@@ -268,7 +268,7 @@ int run_truncate(
WTPERF *, WTPERF_THREAD *, WT_CURSOR *, WT_SESSION *, int *);
int setup_log_file(WTPERF *);
void setup_throttle(WTPERF_THREAD *);
-int setup_truncate(WTPERF *, WTPERF_THREAD *, WT_SESSION *);
+void setup_truncate(WTPERF *, WTPERF_THREAD *, WT_SESSION *);
void start_idle_table_cycle(WTPERF *, wt_thread_t *);
void stop_idle_table_cycle(WTPERF *, wt_thread_t);
void worker_throttle(WTPERF_THREAD *);
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c b/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c
index 1f910b9a3a4..83cfe3ffae0 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c
@@ -34,7 +34,7 @@ decode_key(char *key_buf)
return (strtoull(key_buf, NULL, 10));
}
-int
+void
setup_truncate(WTPERF *wtperf, WTPERF_THREAD *thread, WT_SESSION *session)
{
CONFIG_OPTS *opts;
@@ -42,9 +42,8 @@ setup_truncate(WTPERF *wtperf, WTPERF_THREAD *thread, WT_SESSION *session)
TRUNCATE_QUEUE_ENTRY *truncate_item;
WORKLOAD *workload;
WT_CURSOR *cursor;
- char *key;
- int ret;
uint64_t end_point, final_stone_gap, i, start_point;
+ char *key;
opts = wtperf->opts;
end_point = final_stone_gap = start_point = 0;
@@ -52,9 +51,8 @@ setup_truncate(WTPERF *wtperf, WTPERF_THREAD *thread, WT_SESSION *session)
workload = thread->workload;
/* We are limited to only one table when running truncate. */
- if ((ret = session->open_cursor(
- session, wtperf->uris[0], NULL, NULL, &cursor)) != 0)
- goto err;
+ testutil_check(session->open_cursor(
+ session, wtperf->uris[0], NULL, NULL, &cursor));
/*
* If we find the workload getting behind we multiply the number of
@@ -79,18 +77,13 @@ setup_truncate(WTPERF *wtperf, WTPERF_THREAD *thread, WT_SESSION *session)
* data available, then we need to setup some initial truncation
* stones.
*/
- if ((ret = cursor->next(cursor)) != 0 ||
- (ret = cursor->get_key(cursor, &key)) != 0) {
- lprintf(wtperf, ret, 0, "truncate setup start: failed");
- goto err;
- }
+ testutil_check(cursor->next(cursor));
+ testutil_check(cursor->get_key(cursor, &key));
start_point = decode_key(key);
- if ((cursor->reset(cursor)) != 0 || (ret = cursor->prev(cursor)) != 0 ||
- (ret = cursor->get_key(cursor, &key)) != 0) {
- lprintf(wtperf, ret, 0, "truncate setup end: failed");
- goto err;
- }
+ testutil_check(cursor->reset(cursor));
+ testutil_check(cursor->prev(cursor));
+ testutil_check(cursor->get_key(cursor, &key));
end_point = decode_key(key);
/* Assign stones if there are enough documents. */
@@ -119,10 +112,7 @@ setup_truncate(WTPERF *wtperf, WTPERF_THREAD *thread, WT_SESSION *session)
}
trunc_cfg->stone_gap = final_stone_gap;
-err: if ((ret = cursor->close(cursor)) != 0) {
- lprintf(wtperf, ret, 0, "truncate setup: cursor close failed");
- }
- return (ret);
+ testutil_check(cursor->close(cursor));
}
int
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/options.m4 b/src/third_party/wiredtiger/build_posix/aclocal/options.m4
index 7d0df5d65ac..9d07958bad9 100644
--- a/src/third_party/wiredtiger/build_posix/aclocal/options.m4
+++ b/src/third_party/wiredtiger/build_posix/aclocal/options.m4
@@ -53,19 +53,6 @@ AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_ZSTD],
[test "$wt_cv_with_builtin_extension_zstd" = "yes"])
AC_MSG_RESULT($with_builtins)
-AH_TEMPLATE(
- HAVE_CRC32_HARDWARE, [Define to 1 to configure CRC32 hardware support.])
-AC_MSG_CHECKING(if --enable-crc32-hardware option specified)
-AC_ARG_ENABLE(crc32-hardware,
- AS_HELP_STRING([--enable-crc32-hardware],
- [Enable CRC32 hardware support.]), r=$enableval, r=yes)
-case "$r" in
-no) wt_cv_enable_crc32_hardware=no;;
-*) AC_DEFINE(HAVE_CRC32_HARDWARE)
- wt_cv_enable_crc32_hardware=yes;;
-esac
-AC_MSG_RESULT($wt_cv_enable_crc32_hardware)
-
AH_TEMPLATE(HAVE_DIAGNOSTIC, [Define to 1 for diagnostic tests.])
AC_MSG_CHECKING(if --enable-diagnostic option specified)
AC_ARG_ENABLE(diagnostic,
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4
index 3f87f7f6507..8b39a5d09d6 100644
--- a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4
+++ b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4
@@ -2,8 +2,8 @@ dnl build by dist/s_version
VERSION_MAJOR=3
VERSION_MINOR=1
-VERSION_PATCH=0
-VERSION_STRING='"WiredTiger 3.1.0: (April 23, 2018)"'
+VERSION_PATCH=1
+VERSION_STRING='"WiredTiger 3.1.1: (July 12, 2018)"'
AC_SUBST(VERSION_MAJOR)
AC_SUBST(VERSION_MINOR)
diff --git a/src/third_party/wiredtiger/build_win/wiredtiger.def b/src/third_party/wiredtiger/build_win/wiredtiger.def
index 3ee9f6b6a9d..79fa84a11e0 100644
--- a/src/third_party/wiredtiger/build_win/wiredtiger.def
+++ b/src/third_party/wiredtiger/build_win/wiredtiger.def
@@ -1,8 +1,8 @@
LIBRARY WIREDTIGER
EXPORTS
- wiredtiger_checksum_crc32c
wiredtiger_config_parser_open
wiredtiger_config_validate
+ wiredtiger_crc32c_func
wiredtiger_open
wiredtiger_pack_close
wiredtiger_pack_int
diff --git a/src/third_party/wiredtiger/build_win/wiredtiger_config.h b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
index 55431f59fae..bb4cc7848f8 100644
--- a/src/third_party/wiredtiger/build_win/wiredtiger_config.h
+++ b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
@@ -25,9 +25,6 @@
/* Define to 1 if you have the `clock_gettime' function. */
/* #undef HAVE_CLOCK_GETTIME */
-/* Define to 1 to enable CRC32 hardware support. */
-/* #undef HAVE_CRC32_HARDWARE */
-
/* Define to 1 for diagnostic tests. */
/* #undef HAVE_DIAGNOSTIC */
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 7d8a58c83bb..08a322a66e7 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -302,6 +302,14 @@ file_config = format_meta + file_runtime_config + [
Config('leaf_item_max', '0', r'''
historic term for leaf_key_max and leaf_value_max''',
min=0, undoc=True),
+ Config('memory_page_image_max', '0', r'''
+ the maximum in-memory page image represented by a single storage block.
+ Depending on compression efficiency, compression can create storage
+ blocks which require significant resources to re-instantiate in the
+ cache, penalizing the performance of future point updates. The value
+ limits the maximum in-memory page image a storage block will need. If
+ set to 0, a default of 4 times \c leaf_page_max is used''',
+ min='0'),
Config('memory_page_max', '5MB', r'''
the maximum size a page can grow to in memory before being
reconciled to disk. The specified size will be adjusted to a lower
@@ -417,6 +425,11 @@ connection_runtime_config = [
maximum heap memory to allocate for the cache. A database should
configure either \c cache_size or \c shared_cache but not both''',
min='1MB', max='10TB'),
+ Config('cache_max_wait_ms', '0', r'''
+ the maximum number of milliseconds an application thread will wait
+ for space to be available in cache before giving up. Default will
+ wait forever''',
+ min=0),
Config('cache_overhead', '8', r'''
assume the heap allocator overhead is the specified percentage, and
adjust the cache usage by that amount (for example, if there is 10GB
@@ -460,7 +473,7 @@ connection_runtime_config = [
vary depending on the current eviction load''',
min=1, max=20),
]),
- Config('eviction_checkpoint_target', '5', r'''
+ Config('eviction_checkpoint_target', '1', r'''
perform eviction at the beginning of checkpoints to bring the dirty
content in cache to this level. It is a percentage of the cache size if
the value is within the range of 0 to 100 or an absolute size when
@@ -585,7 +598,7 @@ connection_runtime_config = [
type='list', undoc=True,
choices=[
'checkpoint_slow', 'lookaside_sweep_race', 'split_1', 'split_2',
- 'split_3', 'split_4', 'split_5', 'split_6', 'split_7']),
+ 'split_3', 'split_4', 'split_5', 'split_6', 'split_7', 'split_8']),
Config('verbose', '', r'''
enable messages for various events. Options are given as a
list, such as <code>"verbose=[evictserver,read]"</code>''',
@@ -595,6 +608,7 @@ connection_runtime_config = [
'checkpoint',
'checkpoint_progress',
'compact',
+ 'error_returns',
'evict',
'evict_stuck',
'evictserver',
@@ -694,7 +708,8 @@ wiredtiger_open_log_configuration = [
information'''),
Config('file_max', '100MB', r'''
the maximum size of log files''',
- min='100KB', max='2GB'),
+ min='100KB', # !!! Must match WT_LOG_FILE_MIN
+ max='2GB'), # !!! Must match WT_LOG_FILE_MAX
Config('path', '"."', r'''
the name of a directory into which log files are written. The
directory must already exist. If the value is not an absolute path,
@@ -703,7 +718,7 @@ wiredtiger_open_log_configuration = [
Config('recover', 'on', r'''
run recovery or error if recovery needs to run after an
unclean shutdown''',
- choices=['error','on'])
+ choices=['error', 'on'])
]),
]
@@ -848,7 +863,11 @@ wiredtiger_open_common =\
file extension configuration. If set, extend files of the set
type in allocations of the set size, instead of a block at a
time as each new block is written. For example,
- <code>file_extend=(data=16MB)</code>''',
+ <code>file_extend=(data=16MB)</code>. If set to 0, disable the file
+ extension for the set type. For log files, the allowed range is
+ between 100KB and 2GB; values larger than the configured maximum log
+ size and the default config would extend log files in allocations of
+ the maximum log file size.''',
type='list', choices=['data', 'log']),
Config('hazard_max', '1000', r'''
maximum number of simultaneous hazard pointers per session
@@ -867,6 +886,14 @@ wiredtiger_open_common =\
methods that may modify a database are disabled. See @ref readonly
for more information''',
type='boolean'),
+ Config('salvage', 'false', r'''
+ open connection and salvage any WiredTiger-owned database and log
+ files that it detects as corrupted. This API should only be used
+ after getting an error return of WT_TRY_SALVAGE.
+ Salvage rebuilds files in place, overwriting existing files.
+ We recommend making a backup copy of all files with the
+ WiredTiger prefix prior to passing this flag.''',
+ type='boolean'),
Config('session_max', '100', r'''
maximum expected number of sessions (including server
threads)''',
@@ -1114,6 +1141,10 @@ methods = {
ignore the encodings for the key and value, manage data as if
the formats were \c "u". See @ref cursor_raw for details''',
type='boolean'),
+ Config('read_once', 'false', r'''
+ results that are brought into cache from disk by this cursor will be
+ given less priority in the cache.''',
+ type='boolean'),
Config('readonly', 'false', r'''
only query operations are supported by this cursor. An error is
returned if a modification is attempted using the cursor. The
@@ -1148,6 +1179,16 @@ methods = {
type='list'),
]),
+'WT_SESSION.query_timestamp' : Method([
+ Config('get', 'read', r'''
+ specify which timestamp to query: \c commit returns the most recently
+ set commit_timestamp. \c first_commit returns the first set
+ commit_timestamp. \c prepare returns the timestamp used in preparing a
+ transaction. \c read returns the timestamp at which the transaction is
+ reading at. See @ref transaction_timestamps''',
+ choices=['commit', 'first_commit', 'prepare', 'read']),
+]),
+
'WT_SESSION.rebalance' : Method([]),
'WT_SESSION.rename' : Method([]),
'WT_SESSION.reset' : Method([]),
@@ -1412,12 +1453,14 @@ methods = {
specify which timestamp to query: \c all_committed returns the largest
timestamp such that all timestamps up to that value have committed,
\c oldest returns the most recent \c oldest_timestamp set with
- WT_CONNECTION::set_timestamp, \c pinned returns the minimum of the
- \c oldest_timestamp and the read timestamps of all active readers, and
- \c stable returns the most recent \c stable_timestamp set with
- WT_CONNECTION::set_timestamp. See @ref transaction_timestamps''',
+ WT_CONNECTION::set_timestamp, \c oldest_reader returns the
+ minimum of the read timestamps of all active readers \c pinned returns
+ the minimum of the\c oldest_timestamp and the read timestamps of all
+ active readers, and \c stable returns the most recent
+ \c stable_timestamp set with WT_CONNECTION::set_timestamp. See
+ @ref transaction_timestamps''',
choices=['all_committed','last_checkpoint',
- 'oldest','pinned','recovery','stable']),
+ 'oldest','oldest_reader','pinned','recovery','stable']),
]),
'WT_CONNECTION.set_timestamp' : Method([
diff --git a/src/third_party/wiredtiger/dist/api_err.py b/src/third_party/wiredtiger/dist/api_err.py
index 2404755a49d..cc6b9a6610a 100644
--- a/src/third_party/wiredtiger/dist/api_err.py
+++ b/src/third_party/wiredtiger/dist/api_err.py
@@ -64,6 +64,11 @@ errors = [
an already updated record which is in prepared state. An updated
record will be in prepared state, when the transaction that performed
the update is in prepared state.'''),
+ Error('WT_TRY_SALVAGE', -31809,
+ 'database corruption detected', '''
+ This error is generated when corruption is detected in an on-disk file.
+ The application may choose to salvage the file or retry wiredtiger_open
+ with the 'salvage=true' configuration setting.'''),
]
# Update the #defines in the wiredtiger.in file.
diff --git a/src/third_party/wiredtiger/dist/log.py b/src/third_party/wiredtiger/dist/log.py
index 2da8e5eae66..2503edea49d 100644
--- a/src/third_party/wiredtiger/dist/log.py
+++ b/src/third_party/wiredtiger/dist/log.py
@@ -331,7 +331,7 @@ for optype in log_data.optypes:
})
tfile.write('''
-\tWT_ILLEGAL_VALUE(session);
+\tWT_ILLEGAL_VALUE(session, optype);
\t}
\treturn (0);
diff --git a/src/third_party/wiredtiger/dist/s_export.list b/src/third_party/wiredtiger/dist/s_export.list
index 72ce553ac9b..e49fa113d96 100644
--- a/src/third_party/wiredtiger/dist/s_export.list
+++ b/src/third_party/wiredtiger/dist/s_export.list
@@ -1,7 +1,7 @@
# List of OK external symbols.
-wiredtiger_checksum_crc32c
wiredtiger_config_parser_open
wiredtiger_config_validate
+wiredtiger_crc32c_func
wiredtiger_open
wiredtiger_pack_close
wiredtiger_pack_int
diff --git a/src/third_party/wiredtiger/dist/s_funcs.list b/src/third_party/wiredtiger/dist/s_funcs.list
index eed29e91fc1..95c568a19ff 100644
--- a/src/third_party/wiredtiger/dist/s_funcs.list
+++ b/src/third_party/wiredtiger/dist/s_funcs.list
@@ -33,7 +33,6 @@ __wt_stat_join_aggregate
__wt_stat_join_clear_all
__wt_stream_set_no_buffer
__wt_try_readlock
-wiredtiger_checksum_crc32c
wiredtiger_config_parser_open
wiredtiger_config_validate
wiredtiger_pack_int
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 64b9758877e..ed5f27cdf11 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -24,6 +24,7 @@ Async
AsyncOp
Athanassoulis
Athlon
+Axxx
BBBBB
BBBBBB
BBBBBBBBBB
@@ -46,6 +47,7 @@ Brueckner
Bsearch
Btree
Buf
+Bxxx
Bzip
CAS
CCCCCCCCCCCCCCCCCC
@@ -75,6 +77,7 @@ Checksum
Checksums
CityHash
CloseHandle
+Cmvxz
Cmvz
Collet
Comparator
@@ -207,8 +210,8 @@ LLLLLLL
LOGREC
LOGSCAN
LOOKASIDE
+LRSVv
LRU
-LRVv
LSB
LSM
LSN
@@ -222,6 +225,7 @@ LevelDB
Levyx
Llqr
Llqrt
+LmT
LoadLoad
LockFile
Lookaside
@@ -443,6 +447,7 @@ Zlib's
Zstandard
Zstd
Zstd's
+aaa
abcdef
abcdefghijklmnopqrstuvwxyz
addl
@@ -478,6 +483,7 @@ backoff
bal
basecfg
basho
+bbb
bcr
bdb
beginthreadex
@@ -496,6 +502,7 @@ bool
boolean
br
breakpoint
+bs
bswap
bt
btcur
@@ -525,6 +532,7 @@ calloc
cas
catfmt
cb
+ccc
ccr
cd
centric
@@ -622,6 +630,7 @@ datastore
dbc
dbs
dcalloc
+ddd
decile
deciles
decl
@@ -652,6 +661,7 @@ dhandle
dhandles
difftime
dir
+directio
dirlist
disjunction
disjunctions
@@ -677,6 +687,7 @@ dsync
dumpcmp
dumpfile
dup
+eee
eg
egrep
emp
@@ -718,6 +729,7 @@ fdatasync
fdopen
fextend
ffc
+fff
fflush
ffs
fgetc
@@ -746,9 +758,11 @@ fopen
formatmessage
fp
fprintf
+fread
free'd
fs
fscanf
+fseek
fstat
fstream
fsync
@@ -758,6 +772,7 @@ ftruncate
func
funcid
fvisibility
+fwrite
gcc
gdb
ge
@@ -772,6 +787,7 @@ getraw
gettime
gettimeofday
getv
+ggg
github
gitignore
gobare
@@ -785,6 +801,7 @@ handlep
hashval
havesize
hdr
+hhh
highjack
hilq
hotbackup
@@ -801,6 +818,7 @@ idlems
idx
ifdef
ifdef's
+iflag
iiSii
iiiS
iiii
@@ -859,6 +877,7 @@ iszero
iter
iteratively
iters
+jjj
jnr
jrx
json
@@ -1030,6 +1049,7 @@ other's
ovfl
ownp
pR
+pS
packv
pagesize
parens
@@ -1137,6 +1157,7 @@ rwlock
sH
sHQ
sT
+sanitizers
scalability
sched
scr
@@ -1318,6 +1339,7 @@ vtype
vunpack
vw
vxr
+vxz
vz
waitpid
waker
@@ -1361,3 +1383,4 @@ zstd
zstd's
zu
zyxwvutsrqponmlkjihgfedcba
+zzz
diff --git a/src/third_party/wiredtiger/dist/s_style b/src/third_party/wiredtiger/dist/s_style
index 3f4346173e6..9c1dd6fa506 100755
--- a/src/third_party/wiredtiger/dist/s_style
+++ b/src/third_party/wiredtiger/dist/s_style
@@ -101,6 +101,12 @@ else
cat $t
}
+ if ! expr "$f" : 'src/include/misc.h' > /dev/null &&
+ grep '[[:space:]]qsort(' $f > $t; then
+ echo "$f: qsort call, use WiredTiger __wt_qsort instead"
+ cat $t
+ fi
+
if ! expr "$f" : 'src/.*/os_setvbuf.c' > /dev/null &&
egrep -w 'setvbuf' $f > $t; then
echo "$f: setvbuf call, use WiredTiger library replacements"
diff --git a/src/third_party/wiredtiger/dist/s_tags b/src/third_party/wiredtiger/dist/s_tags
index edb1567992c..48aa98cb411 100755
--- a/src/third_party/wiredtiger/dist/s_tags
+++ b/src/third_party/wiredtiger/dist/s_tags
@@ -20,7 +20,7 @@ type ctags > /dev/null 2>&1 || {
# Test to see what flags this ctags binary supports.
flags=""
-for i in -d -t -w --language-force=C; do
+for i in -d -t -w --language-force=C '-I WT_GCC_FUNC_ATTRIBUTE+'; do
if ctags $i ../src/conn/api_strerror.c 2>/dev/null; then
flags="$i $flags"
fi
diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void
index 9c5f6711da0..8798893597d 100755
--- a/src/third_party/wiredtiger/dist/s_void
+++ b/src/third_party/wiredtiger/dist/s_void
@@ -55,8 +55,9 @@ func_ok()
-e '/int __wt_block_compact_start$/d' \
-e '/int __wt_block_manager_size$/d' \
-e '/int __wt_block_write_size$/d' \
+ -e '/int __wt_buf_catfmt$/d' \
+ -e '/int __wt_buf_fmt$/d' \
-e '/int __wt_curjoin_joined$/d' \
- -e '/int __wt_cursor_close$/d' \
-e '/int __wt_cursor_noop$/d' \
-e '/int __wt_epoch$/d' \
-e '/int __wt_errno$/d' \
@@ -65,6 +66,7 @@ func_ok()
-e '/int __wt_once$/d' \
-e '/int __wt_posix_directory_list_free$/d' \
-e '/int __wt_session_breakpoint$/d' \
+ -e '/int __wt_set_return_func$/d' \
-e '/int __wt_spin_init$/d' \
-e '/int __wt_spin_trylock$/d' \
-e '/int __wt_stat_connection_desc$/d' \
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index c1b43ac6b2d..c452efa9cc6 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -195,7 +195,7 @@ connection_stats = [
CacheStat('cache_bytes_internal', 'tracked bytes belonging to internal pages in the cache', 'no_clear,no_scale,size'),
CacheStat('cache_bytes_inuse', 'bytes currently in the cache', 'no_clear,no_scale,size'),
CacheStat('cache_bytes_leaf', 'tracked bytes belonging to leaf pages in the cache', 'no_clear,no_scale,size'),
- CacheStat('cache_bytes_lookaside', 'bytes belonging to the lookaside table in the cache', 'no_clear,no_scale,size'),
+ CacheStat('cache_bytes_lookaside', 'bytes belonging to the cache overflow table in the cache', 'no_clear,no_scale,size'),
CacheStat('cache_bytes_max', 'maximum bytes configured', 'no_clear,no_scale,size'),
CacheStat('cache_bytes_other', 'bytes not belonging to page images in the cache', 'no_clear,no_scale,size'),
CacheStat('cache_bytes_read', 'bytes read into cache', 'size'),
@@ -238,9 +238,9 @@ connection_stats = [
CacheStat('cache_eviction_state', 'eviction state', 'no_clear,no_scale'),
CacheStat('cache_eviction_target_page_ge128', 'eviction walk target pages histogram - 128 and higher'),
CacheStat('cache_eviction_target_page_lt10', 'eviction walk target pages histogram - 0-9'),
+ CacheStat('cache_eviction_target_page_lt128', 'eviction walk target pages histogram - 64-128'),
CacheStat('cache_eviction_target_page_lt32', 'eviction walk target pages histogram - 10-31'),
CacheStat('cache_eviction_target_page_lt64', 'eviction walk target pages histogram - 32-63'),
- CacheStat('cache_eviction_target_page_lt128', 'eviction walk target pages histogram - 64-128'),
CacheStat('cache_eviction_walk', 'pages walked for eviction'),
CacheStat('cache_eviction_walk_from_root', 'eviction walks started from root of tree'),
CacheStat('cache_eviction_walk_passes', 'eviction passes of a file'),
@@ -260,10 +260,12 @@ connection_stats = [
CacheStat('cache_hazard_walks', 'hazard pointer check entries walked'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'),
- CacheStat('cache_lookaside_entries', 'lookaside table entries', 'no_clear,no_scale'),
- CacheStat('cache_lookaside_insert', 'lookaside table insert calls'),
- CacheStat('cache_lookaside_remove', 'lookaside table remove calls'),
- CacheStat('cache_lookaside_score', 'lookaside score', 'no_clear,no_scale'),
+ CacheStat('cache_lookaside_cursor_wait_application', 'cache overflow cursor application thread wait time (usecs)'),
+ CacheStat('cache_lookaside_cursor_wait_internal', 'cache overflow cursor internal thread wait time (usecs)'),
+ CacheStat('cache_lookaside_entries', 'cache overflow table entries', 'no_clear,no_scale'),
+ CacheStat('cache_lookaside_insert', 'cache overflow table insert calls'),
+ CacheStat('cache_lookaside_remove', 'cache overflow table remove calls'),
+ CacheStat('cache_lookaside_score', 'cache overflow score', 'no_clear,no_scale'),
CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'),
CacheStat('cache_pages_dirty', 'tracked dirty pages in the cache', 'no_clear,no_scale'),
CacheStat('cache_pages_inuse', 'pages currently held in the cache', 'no_clear,no_scale'),
@@ -273,16 +275,17 @@ connection_stats = [
CacheStat('cache_read_app_time', 'application threads page read from disk to cache time (usecs)'),
CacheStat('cache_read_deleted', 'pages read into cache after truncate'),
CacheStat('cache_read_deleted_prepared', 'pages read into cache after truncate in prepare state'),
- CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'),
- CacheStat('cache_read_lookaside_checkpoint', 'pages read into cache requiring lookaside for checkpoint'),
- CacheStat('cache_read_lookaside_delay', 'pages read into cache with skipped lookaside entries needed later'),
- CacheStat('cache_read_lookaside_delay_checkpoint', 'pages read into cache with skipped lookaside entries needed later by checkpoint'),
- CacheStat('cache_read_lookaside_skipped', 'pages read into cache skipping older lookaside entries'),
+ CacheStat('cache_read_lookaside', 'pages read into cache requiring cache overflow entries'),
+ CacheStat('cache_read_lookaside_checkpoint', 'pages read into cache requiring cache overflow for checkpoint'),
+ CacheStat('cache_read_lookaside_delay', 'pages read into cache with skipped cache overflow entries needed later'),
+ CacheStat('cache_read_lookaside_delay_checkpoint', 'pages read into cache with skipped cache overflow entries needed later by checkpoint'),
+ CacheStat('cache_read_lookaside_skipped', 'pages read into cache skipping older cache overflow entries'),
CacheStat('cache_read_overflow', 'overflow pages read into cache'),
+ CacheStat('cache_timed_out_ops', 'operations timed out waiting for space in cache'),
CacheStat('cache_write', 'pages written from cache'),
CacheStat('cache_write_app_count', 'application threads page write from cache to disk count'),
CacheStat('cache_write_app_time', 'application threads page write from cache to disk time (usecs)'),
- CacheStat('cache_write_lookaside', 'page written requiring lookaside records'),
+ CacheStat('cache_write_lookaside', 'page written requiring cache overflow records'),
CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'),
##########################################
@@ -294,11 +297,11 @@ connection_stats = [
CursorStat('cursor_modify', 'cursor modify calls'),
CursorStat('cursor_next', 'cursor next calls'),
CursorStat('cursor_prev', 'cursor prev calls'),
- CursorStat('cursor_reopen', 'cursors reused from cache'),
CursorStat('cursor_remove', 'cursor remove calls'),
+ CursorStat('cursor_reopen', 'cursors reused from cache'),
CursorStat('cursor_reserve', 'cursor reserve calls'),
CursorStat('cursor_reset', 'cursor reset calls'),
- CursorStat('cursor_restart', 'cursor restarted searches'),
+ CursorStat('cursor_restart', 'cursor operation restarted'),
CursorStat('cursor_search', 'cursor search calls'),
CursorStat('cursor_search_near', 'cursor search near calls'),
CursorStat('cursor_truncate', 'truncate calls'),
@@ -309,8 +312,8 @@ connection_stats = [
##########################################
CursorStat('cursor_sweep', 'cursor sweeps'),
CursorStat('cursor_sweep_buckets', 'cursor sweep buckets'),
- CursorStat('cursor_sweep_examined', 'cursor sweep cursors examined'),
CursorStat('cursor_sweep_closed', 'cursor sweep cursors closed'),
+ CursorStat('cursor_sweep_examined', 'cursor sweep cursors examined'),
##########################################
# Dhandle statistics
@@ -331,19 +334,19 @@ connection_stats = [
LockStat('lock_checkpoint_wait_application', 'checkpoint lock application thread wait time (usecs)'),
LockStat('lock_checkpoint_wait_internal', 'checkpoint lock internal thread wait time (usecs)'),
LockStat('lock_commit_timestamp_read_count', 'commit timestamp queue read lock acquisitions'),
- LockStat('lock_commit_timestamp_wait_application', 'commit timestamp queue lock application thread time waiting for the dhandle lock (usecs)'),
- LockStat('lock_commit_timestamp_wait_internal', 'commit timestamp queue lock internal thread time waiting for the dhandle lock (usecs)'),
+ LockStat('lock_commit_timestamp_wait_application', 'commit timestamp queue lock application thread time waiting (usecs)'),
+ LockStat('lock_commit_timestamp_wait_internal', 'commit timestamp queue lock internal thread time waiting (usecs)'),
LockStat('lock_commit_timestamp_write_count', 'commit timestamp queue write lock acquisitions'),
LockStat('lock_dhandle_read_count', 'dhandle read lock acquisitions'),
- LockStat('lock_dhandle_wait_application', 'dhandle lock application thread time waiting for the dhandle lock (usecs)'),
- LockStat('lock_dhandle_wait_internal', 'dhandle lock internal thread time waiting for the dhandle lock (usecs)'),
+ LockStat('lock_dhandle_wait_application', 'dhandle lock application thread time waiting (usecs)'),
+ LockStat('lock_dhandle_wait_internal', 'dhandle lock internal thread time waiting (usecs)'),
LockStat('lock_dhandle_write_count', 'dhandle write lock acquisitions'),
LockStat('lock_metadata_count', 'metadata lock acquisitions'),
LockStat('lock_metadata_wait_application', 'metadata lock application thread wait time (usecs)'),
LockStat('lock_metadata_wait_internal', 'metadata lock internal thread wait time (usecs)'),
LockStat('lock_read_timestamp_read_count', 'read timestamp queue read lock acquisitions'),
- LockStat('lock_read_timestamp_wait_application', 'read timestamp queue lock application thread time waiting for the dhandle lock (usecs)'),
- LockStat('lock_read_timestamp_wait_internal', 'read timestamp queue lock internal thread time waiting for the dhandle lock (usecs)'),
+ LockStat('lock_read_timestamp_wait_application', 'read timestamp queue lock application thread time waiting (usecs)'),
+ LockStat('lock_read_timestamp_wait_internal', 'read timestamp queue lock internal thread time waiting (usecs)'),
LockStat('lock_read_timestamp_write_count', 'read timestamp queue write lock acquisitions'),
LockStat('lock_schema_count', 'schema lock acquisitions'),
LockStat('lock_schema_wait_application', 'schema lock application thread wait time (usecs)'),
@@ -353,8 +356,8 @@ connection_stats = [
LockStat('lock_table_wait_internal', 'table lock internal thread time waiting for the table lock (usecs)'),
LockStat('lock_table_write_count', 'table write lock acquisitions'),
LockStat('lock_txn_global_read_count', 'txn global read lock acquisitions'),
- LockStat('lock_txn_global_wait_application', 'txn global lock application thread time waiting for the dhandle lock (usecs)'),
- LockStat('lock_txn_global_wait_internal', 'txn global lock internal thread time waiting for the dhandle lock (usecs)'),
+ LockStat('lock_txn_global_wait_application', 'txn global lock application thread time waiting (usecs)'),
+ LockStat('lock_txn_global_wait_internal', 'txn global lock internal thread time waiting (usecs)'),
LockStat('lock_txn_global_write_count', 'txn global write lock acquisitions'),
##########################################
@@ -462,6 +465,7 @@ connection_stats = [
##########################################
SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'),
SessionStat('session_open', 'open session count', 'no_clear,no_scale'),
+ SessionStat('session_query_ts', 'session query timestamp calls'),
SessionStat('session_table_alter_fail', 'table alter failed calls', 'no_clear,no_scale'),
SessionStat('session_table_alter_skip', 'table alter unchanged and skipped', 'no_clear,no_scale'),
SessionStat('session_table_alter_success', 'table alter successful calls', 'no_clear,no_scale'),
@@ -507,14 +511,16 @@ connection_stats = [
TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'),
TxnStat('txn_commit', 'transactions committed'),
TxnStat('txn_commit_queue_empty', 'commit timestamp queue insert to empty'),
- TxnStat('txn_commit_queue_tail', 'commit timestamp queue inserts to tail'),
+ TxnStat('txn_commit_queue_head', 'commit timestamp queue inserts to head'),
TxnStat('txn_commit_queue_inserts', 'commit timestamp queue inserts total'),
TxnStat('txn_commit_queue_len', 'commit timestamp queue length'),
+ TxnStat('txn_commit_queue_walked', 'commit timestamp queue entries walked'),
TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'),
TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', 'no_clear,no_scale'),
TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'),
TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'),
TxnStat('txn_pinned_timestamp', 'transaction range of timestamps currently pinned', 'no_clear,no_scale'),
+ TxnStat('txn_pinned_timestamp_checkpoint', 'transaction range of timestamps pinned by a checkpoint', 'no_clear,no_scale'),
TxnStat('txn_pinned_timestamp_oldest', 'transaction range of timestamps pinned by the oldest timestamp', 'no_clear,no_scale'),
TxnStat('txn_prepare', 'prepared transactions'),
TxnStat('txn_prepare_active', 'prepared transactions currently active'),
@@ -525,8 +531,9 @@ connection_stats = [
TxnStat('txn_read_queue_head', 'read timestamp queue inserts to head'),
TxnStat('txn_read_queue_inserts', 'read timestamp queue inserts total'),
TxnStat('txn_read_queue_len', 'read timestamp queue length'),
+ TxnStat('txn_read_queue_walked', 'read timestamp queue entries walked'),
TxnStat('txn_rollback', 'transactions rolled back'),
- TxnStat('txn_rollback_las_removed', 'rollback to stable updates removed from lookaside'),
+ TxnStat('txn_rollback_las_removed', 'rollback to stable updates removed from cache overflow'),
TxnStat('txn_rollback_to_stable', 'rollback to stable calls'),
TxnStat('txn_rollback_upd_aborted', 'rollback to stable updates aborted'),
TxnStat('txn_set_ts', 'set timestamp calls'),
@@ -622,9 +629,9 @@ dsrc_stats = [
CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'),
CacheStat('cache_eviction_target_page_ge128', 'eviction walk target pages histogram - 128 and higher'),
CacheStat('cache_eviction_target_page_lt10', 'eviction walk target pages histogram - 0-9'),
+ CacheStat('cache_eviction_target_page_lt128', 'eviction walk target pages histogram - 64-128'),
CacheStat('cache_eviction_target_page_lt32', 'eviction walk target pages histogram - 10-31'),
CacheStat('cache_eviction_target_page_lt64', 'eviction walk target pages histogram - 32-63'),
- CacheStat('cache_eviction_target_page_lt128', 'eviction walk target pages histogram - 64-128'),
CacheStat('cache_eviction_walk_from_root', 'eviction walks started from root of tree'),
CacheStat('cache_eviction_walk_passes', 'eviction walk passes of a file'),
CacheStat('cache_eviction_walk_saved_pos', 'eviction walks started from saved location in tree'),
@@ -639,10 +646,10 @@ dsrc_stats = [
CacheStat('cache_read', 'pages read into cache'),
CacheStat('cache_read_deleted', 'pages read into cache after truncate'),
CacheStat('cache_read_deleted_prepared', 'pages read into cache after truncate in prepare state'),
- CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'),
+ CacheStat('cache_read_lookaside', 'pages read into cache requiring cache overflow entries'),
CacheStat('cache_read_overflow', 'overflow pages read into cache'),
CacheStat('cache_write', 'pages written from cache'),
- CacheStat('cache_write_lookaside', 'page written requiring lookaside records'),
+ CacheStat('cache_write_lookaside', 'page written requiring cache overflow records'),
CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'),
##########################################
@@ -697,7 +704,7 @@ dsrc_stats = [
CursorStat('cursor_reopen', 'cursors reused from cache'),
CursorStat('cursor_reserve', 'reserve calls'),
CursorStat('cursor_reset', 'reset calls'),
- CursorStat('cursor_restart', 'restarted searches'),
+ CursorStat('cursor_restart', 'cursor operation restarted'),
CursorStat('cursor_search', 'search calls'),
CursorStat('cursor_search_near', 'search near calls'),
CursorStat('cursor_truncate', 'truncate calls'),
@@ -741,8 +748,8 @@ dsrc_stats = [
##########################################
# Session operations
##########################################
- SessionStat('session_cursor_cached', 'cached cursor count', 'no_clear,no_scale'),
SessionStat('session_compact', 'object compaction'),
+ SessionStat('session_cursor_cached', 'cached cursor count', 'no_clear,no_scale'),
SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'),
##########################################
diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c
index 139f39fe673..190c2c421d3 100644
--- a/src/third_party/wiredtiger/examples/c/ex_all.c
+++ b/src/third_party/wiredtiger/examples/c/ex_all.c
@@ -1356,8 +1356,9 @@ main(int argc, char *argv[])
const char *buffer = "some string";
size_t len = strlen(buffer);
/*! [Checksum a buffer] */
- uint32_t crc32c;
- crc32c = wiredtiger_checksum_crc32c(buffer, len);
+ uint32_t crc32c, (*func)(const void *, size_t);
+ func = wiredtiger_crc32c_func();
+ crc32c = func(buffer, len);
/*! [Checksum a buffer] */
(void)crc32c;
}
diff --git a/src/third_party/wiredtiger/examples/c/ex_thread.c b/src/third_party/wiredtiger/examples/c/ex_thread.c
index f709707bffc..79aef70d644 100644
--- a/src/third_party/wiredtiger/examples/c/ex_thread.c
+++ b/src/third_party/wiredtiger/examples/c/ex_thread.c
@@ -95,7 +95,7 @@ main(int argc, char *argv[])
__wt_thread_create(NULL, &threads[i], scan_thread, conn));
for (i = 0; i < NUM_THREADS; i++)
- error_check(__wt_thread_join(NULL, threads[i]));
+ error_check(__wt_thread_join(NULL, &threads[i]));
error_check(conn->close(conn, NULL));
diff --git a/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c b/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
index 00c8bf93acb..b080a5e4d6a 100644
--- a/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
+++ b/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
@@ -80,9 +80,10 @@ revint_compare(WT_COLLATOR *collator,
* when comparing primary keys.
*/
if ((ret = wt_api->unpack_start(
- wt_api, session, "ii", k1->data, k1->size, &pstream)) != 0 ||
- (ret = wt_api->unpack_int(wt_api, pstream, &i1)) != 0)
+ wt_api, session, "ii", k1->data, k1->size, &pstream)) != 0)
return (ret);
+ if ((ret = wt_api->unpack_int(wt_api, pstream, &i1)) != 0)
+ goto err;
if ((ret = wt_api->unpack_int(wt_api, pstream, &p1)) != 0)
/* A missing primary key is OK and sorts first. */
p1 = INT64_MIN;
@@ -91,9 +92,12 @@ revint_compare(WT_COLLATOR *collator,
/* Unpack the second pair of numbers. */
if ((ret = wt_api->unpack_start(
- wt_api, session, "ii", k2->data, k2->size, &pstream)) != 0 ||
- (ret = wt_api->unpack_int(wt_api, pstream, &i2)) != 0)
+ wt_api, session, "ii", k2->data, k2->size, &pstream)) != 0)
return (ret);
+ if ((ret = wt_api->unpack_int(wt_api, pstream, &i2)) != 0) {
+err: (void)wt_api->pack_close(wt_api, pstream, NULL);
+ return (ret);
+ }
if ((ret = wt_api->unpack_int(wt_api, pstream, &p2)) != 0)
/* A missing primary key is OK and sorts first. */
p2 = INT64_MIN;
@@ -138,6 +142,7 @@ int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
REVINT_COLLATOR *revint_collator;
+ int ret;
(void)config; /* Unused parameters */
@@ -148,6 +153,10 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
revint_collator->collator.terminate = revint_terminate;
revint_collator->wt_api = connection->get_extension_api(connection);
- return (connection->add_collator(
- connection, "revint", &revint_collator->collator, NULL));
+ if ((ret = connection->add_collator(
+ connection, "revint", (WT_COLLATOR *)revint_collator, NULL)) == 0)
+ return (0);
+
+ free(revint_collator);
+ return (ret);
}
diff --git a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
index 1f32ff910d6..dc90500dcdb 100644
--- a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
@@ -375,6 +375,7 @@ static int
lz_add_compressor(WT_CONNECTION *connection, bool raw, const char *name)
{
LZ4_COMPRESSOR *lz4_compressor;
+ int ret;
/*
* There are two almost identical LZ4 compressors: one using raw
@@ -392,8 +393,12 @@ lz_add_compressor(WT_CONNECTION *connection, bool raw, const char *name)
lz4_compressor->wt_api = connection->get_extension_api(connection);
/* Load the compressor */
- return (connection->add_compressor(
- connection, name, (WT_COMPRESSOR *)lz4_compressor, NULL));
+ if ((ret = connection->add_compressor(
+ connection, name, (WT_COMPRESSOR *)lz4_compressor, NULL)) == 0)
+ return (0);
+
+ free(lz4_compressor);
+ return (ret);
}
int lz4_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
diff --git a/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c b/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
index f739ffa4777..586f6c8831b 100644
--- a/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
@@ -155,6 +155,7 @@ int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
NOP_COMPRESSOR *nop_compressor;
+ int ret;
(void)config; /* Unused parameters */
@@ -177,7 +178,11 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
nop_compressor->wt_api = connection->get_extension_api(connection);
/* Load the compressor */
- return (connection->add_compressor(
- connection, "nop", (WT_COMPRESSOR *)nop_compressor, NULL));
+ if ((ret = connection->add_compressor(
+ connection, "nop", (WT_COMPRESSOR *)nop_compressor, NULL)) == 0)
+ return (0);
+
+ free(nop_compressor);
+ return (ret);
}
/*! [WT_COMPRESSOR initialization function] */
diff --git a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
index 26aa3082bc0..03a17d28a1b 100644
--- a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
@@ -252,6 +252,7 @@ int
snappy_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
SNAPPY_COMPRESSOR *snappy_compressor;
+ int ret;
(void)config; /* Unused parameters */
@@ -266,8 +267,12 @@ snappy_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
snappy_compressor->wt_api = connection->get_extension_api(connection);
- return (connection->add_compressor(
- connection, "snappy", (WT_COMPRESSOR *)snappy_compressor, NULL));
+ if ((ret = connection->add_compressor(connection,
+ "snappy", (WT_COMPRESSOR *)snappy_compressor, NULL)) == 0)
+ return (0);
+
+ free(snappy_compressor);
+ return (ret);
}
/*
diff --git a/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c b/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c
index 5ae54a25163..d5c0d0fb318 100644
--- a/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c
@@ -452,6 +452,7 @@ zlib_add_compressor(
WT_CONNECTION *connection, bool raw, const char *name, int zlib_level)
{
ZLIB_COMPRESSOR *zlib_compressor;
+ int ret;
/*
* There are two almost identical zlib compressors: one using raw
@@ -471,8 +472,12 @@ zlib_add_compressor(
zlib_compressor->zlib_level = zlib_level;
/* Load the compressor. */
- return (connection->add_compressor(
- connection, name, (WT_COMPRESSOR *)zlib_compressor, NULL));
+ if ((ret = connection->add_compressor(
+ connection, name, (WT_COMPRESSOR *)zlib_compressor, NULL)) == 0)
+ return (0);
+
+ free(zlib_compressor);
+ return (ret);
}
/*
diff --git a/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c b/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
index 4f80694b0d1..40a872f92e2 100644
--- a/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
@@ -315,8 +315,12 @@ zstd_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
zstd_compressor->compression_level = compression_level;
/* Load the compressor */
- return (connection->add_compressor(
- connection, "zstd", (WT_COMPRESSOR *)zstd_compressor, NULL));
+ if ((ret = connection->add_compressor(
+ connection, "zstd", (WT_COMPRESSOR *)zstd_compressor, NULL)) == 0)
+ return (0);
+
+ free(zstd_compressor);
+ return (ret);
}
/*
diff --git a/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c b/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
index 0d04e51e395..edefab450a0 100644
--- a/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
+++ b/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
@@ -168,6 +168,7 @@ int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
NOP_ENCRYPTOR *nop_encryptor;
+ int ret;
(void)config; /* Unused parameters */
@@ -189,7 +190,11 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
nop_encryptor->wt_api = connection->get_extension_api(connection);
/* Load the encryptor */
- return (connection->add_encryptor(
- connection, "nop", (WT_ENCRYPTOR *)nop_encryptor, NULL));
+ if ((ret = connection->add_encryptor(
+ connection, "nop", (WT_ENCRYPTOR *)nop_encryptor, NULL)) == 0)
+ return (0);
+
+ free(nop_encryptor);
+ return (ret);
}
/*! [WT_ENCRYPTOR initialization function] */
diff --git a/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c b/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
index f1e26eeaa23..6f46a950bfc 100644
--- a/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
+++ b/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
@@ -479,7 +479,11 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
return (ret);
}
/* Load the encryptor */
- return (connection->add_encryptor(
- connection, "rotn", (WT_ENCRYPTOR *)rotn_encryptor, NULL));
+ if ((ret = connection->add_encryptor(
+ connection, "rotn", (WT_ENCRYPTOR *)rotn_encryptor, NULL)) == 0)
+ return (0);
+
+ free(rotn_encryptor);
+ return (ret);
}
/*! [WT_ENCRYPTOR initialization function] */
diff --git a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c
index 6ce64e240fe..f3c01bc3f41 100644
--- a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c
+++ b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c
@@ -147,7 +147,6 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session,
long field_num;
int ret;
- (void)session; /* Unused parameters */
(void)uri; /* Unused parameters */
orig = (const CSV_EXTRACTOR *)extractor;
@@ -155,34 +154,61 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session,
if ((ret = wt_api->config_parser_open(wt_api, session, appcfg->str,
appcfg->len, &parser)) != 0)
return (ret);
- if ((ret = parser->get(parser, "field", &field)) != 0 ||
- (ret = parser->get(parser, "format", &format)) != 0) {
- if (ret == WT_NOTFOUND) {
+ if ((ret = parser->get(parser, "field", &field)) != 0) {
+ if (ret == WT_NOTFOUND)
(void)wt_api->err_printf(
- wt_api, session, "field or format not found");
- return (WT_NOTFOUND);
- }
- return (ret);
+ wt_api, session, "field not found");
+ else
+ (void)wt_api->err_printf(
+ wt_api, session, "WT_CONFIG_PARSER.get: field: %s",
+ wt_api->strerror(wt_api, session, ret));
+ goto err;
+ }
+ if ((ret = parser->get(parser, "format", &format)) != 0) {
+ if (ret == WT_NOTFOUND)
+ (void)wt_api->err_printf(
+ wt_api, session, "format not found");
+ else
+ (void)wt_api->err_printf(
+ wt_api, session, "WT_CONFIG_PARSER.get: format: %s",
+ wt_api->strerror(wt_api, session, ret));
+ goto err;
+ }
+ ret = parser->close(parser);
+ parser = NULL;
+ if (ret != 0) {
+ (void)wt_api->err_printf(
+ wt_api, session, "WT_CONFIG_PARSER.close: %s",
+ wt_api->strerror(wt_api, session, ret));
}
+
field_num = strtol(field.str, NULL, 10);
if (field_num < 0 || field_num > INT_MAX) {
(void)wt_api->err_printf(
wt_api, session, "field: invalid format");
- return (EINVAL);
+ ret = EINVAL;
+ goto err;
}
if (format.len != 1 || (format.str[0] != 'S' && format.str[0] != 'i')) {
(void)wt_api->err_printf(
wt_api, session, "format: invalid format");
- return (EINVAL);
+ ret = EINVAL;
+ goto err;
+ }
+ if ((csv_extractor = calloc(1, sizeof(CSV_EXTRACTOR))) == NULL) {
+ ret = errno;
+ goto err;
}
- if ((csv_extractor = calloc(1, sizeof(CSV_EXTRACTOR))) == NULL)
- return (errno);
*csv_extractor = *orig;
csv_extractor->field = (int)field_num;
csv_extractor->format_isnum = (format.str[0] == 'i');
*customp = (WT_EXTRACTOR *)csv_extractor;
return (0);
+
+err: if (parser != NULL)
+ (void)parser->close(parser);
+ return (ret);
}
/*
@@ -207,6 +233,7 @@ int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
CSV_EXTRACTOR *csv_extractor;
+ int ret;
(void)config; /* Unused parameters */
@@ -218,6 +245,10 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
csv_extractor->extractor.terminate = csv_terminate;
csv_extractor->wt_api = connection->get_extension_api(connection);
- return (connection->add_extractor(
- connection, "csv", (WT_EXTRACTOR *)csv_extractor, NULL));
+ if ((ret = connection->add_extractor(
+ connection, "csv", (WT_EXTRACTOR *)csv_extractor, NULL)) == 0)
+ return (0);
+
+ free(csv_extractor);
+ return (ret);
}
diff --git a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
index bdb4669a637..b74144eb252 100644
--- a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
+++ b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
@@ -767,8 +767,10 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config)
int64_t argval;
int ret;
- ret = 0;
+ config_parser = NULL;
wtext = conn->get_extension_api(conn);
+ ret = 0;
+
if ((fail_fs = calloc(1, sizeof(FAIL_FILE_SYSTEM))) == NULL) {
(void)wtext->err_printf(wtext, NULL,
"fail_file_system extension_init: %s",
@@ -813,7 +815,9 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config)
wtext->strerror(wtext, NULL, ret));
goto err;
}
- if ((ret = config_parser->close(config_parser)) != 0) {
+ ret = config_parser->close(config_parser);
+ config_parser = NULL;
+ if (ret != 0) {
(void)wtext->err_printf(wtext, NULL,
"WT_CONFIG_PARSER.close: config: %s",
wtext->strerror(wtext, NULL, ret));
@@ -840,6 +844,8 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config)
}
return (0);
-err: free(fail_fs);
+err: if (config_parser != NULL)
+ (void)config_parser->close(config_parser);
+ free(fail_fs);
return (ret);
}
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 85c6480aa56..4e6e637f3b5 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "0d4bc746f2221de7c9d08d7750ec85ba77691d31",
+ "commit": "45b751a54fb181d1995684f7c807bbbc142d3c90",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.6"
diff --git a/src/third_party/wiredtiger/lang/java/java_doc.i b/src/third_party/wiredtiger/lang/java/java_doc.i
index 0bb5c013c6f..8b363ee4cf7 100644
--- a/src/third_party/wiredtiger/lang/java/java_doc.i
+++ b/src/third_party/wiredtiger/lang/java/java_doc.i
@@ -53,6 +53,7 @@ COPYDOC(__wt_session, WT_SESSION, commit_transaction)
COPYDOC(__wt_session, WT_SESSION, prepare_transaction)
COPYDOC(__wt_session, WT_SESSION, rollback_transaction)
COPYDOC(__wt_session, WT_SESSION, timestamp_transaction)
+COPYDOC(__wt_session, WT_SESSION, query_timestamp)
COPYDOC(__wt_session, WT_SESSION, checkpoint)
COPYDOC(__wt_session, WT_SESSION, snapshot)
COPYDOC(__wt_session, WT_SESSION, transaction_pinned_range)
diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c
index db755db198a..a4754addfdc 100644
--- a/src/third_party/wiredtiger/src/async/async_api.c
+++ b/src/third_party/wiredtiger/src/async/async_api.c
@@ -145,7 +145,7 @@ retry:
*/
if (op == NULL || op->state != WT_ASYNCOP_FREE) {
WT_STAT_CONN_INCR(session, async_full);
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
}
/*
* Set the state of this op handle as READY for the user to use.
@@ -400,7 +400,7 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[])
F_CLR(async->worker_sessions[i],
WT_SESSION_SERVER_ASYNC);
WT_TRET(__wt_thread_join(
- session, async->worker_tids[i]));
+ session, &async->worker_tids[i]));
wt_session = &async->worker_sessions[i]->iface;
WT_TRET(wt_session->close(wt_session, NULL));
async->worker_sessions[i] = NULL;
@@ -434,7 +434,7 @@ __wt_async_destroy(WT_SESSION_IMPL *session)
F_CLR(conn, WT_CONN_SERVER_ASYNC);
for (i = 0; i < conn->async_workers; i++)
- WT_TRET(__wt_thread_join(session, async->worker_tids[i]));
+ WT_TRET(__wt_thread_join(session, &async->worker_tids[i]));
__wt_cond_destroy(session, &async->flush_cond);
/* Close the server threads' sessions. */
diff --git a/src/third_party/wiredtiger/src/async/async_worker.c b/src/third_party/wiredtiger/src/async/async_worker.c
index 6dfddced8e7..3fe7a1fbe5b 100644
--- a/src/third_party/wiredtiger/src/async/async_worker.c
+++ b/src/third_party/wiredtiger/src/async/async_worker.c
@@ -106,8 +106,10 @@ static void
__async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen)
{
while (async->flush_state == WT_ASYNC_FLUSHING &&
- async->flush_gen == my_gen)
+ async->flush_gen == my_gen) {
__wt_cond_wait(session, async->flush_cond, 10000, NULL);
+ WT_BARRIER();
+ }
}
/*
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
index 73a3d13e307..ba7c2f13b1e 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -373,8 +373,7 @@ __ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
* on some gcc compilers because they don't understand
* FALLTHROUGH as part of a macro.
*/
- return (
- __wt_illegal_value(session, "checkpoint array"));
+ return (__wt_illegal_value(session, ckpt->flags));
}
return (0);
}
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
index 3995fdb4c94..a7b55985fa1 100644
--- a/src/third_party/wiredtiger/src/block/block_ext.c
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -505,6 +505,9 @@ __wt_block_alloc(
WT_EXT *ext, **estack[WT_SKIP_MAXDEPTH];
WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+ /* If a sync is running, no other sessions can allocate blocks. */
+ WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
+
/* Assert we're maintaining the by-size skiplist. */
WT_ASSERT(session, block->live.avail.track_size != 0);
@@ -622,6 +625,9 @@ __wt_block_off_free(
{
WT_DECL_RET;
+ /* If a sync is running, no other sessions can free blocks. */
+ WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
+
/*
* Callers of this function are expected to have already acquired any
* locks required to manipulate the extent lists.
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c
index ec44885f56a..e43d3b34f66 100644
--- a/src/third_party/wiredtiger/src/block/block_read.c
+++ b/src/third_party/wiredtiger/src/block/block_read.c
@@ -181,39 +181,35 @@ err: __wt_scr_free(session, &tmp);
#ifdef HAVE_DIAGNOSTIC
/*
* __wt_block_read_off_blind --
- * Read the block at an offset, try to figure out what it looks like,
- * debugging only.
+ * Read the block at an offset, return the size and checksum, debugging
+ * only.
*/
int
-__wt_block_read_off_blind(
- WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset)
+__wt_block_read_off_blind(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, wt_off_t offset, uint32_t *sizep, uint32_t *checksump)
{
WT_BLOCK_HEADER *blk;
- uint32_t checksum, size;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ *sizep = 0;
+ *checksump = 0;
/*
* Make sure the buffer is large enough for the header and read the
* the first allocation-size block.
*/
- WT_RET(__wt_buf_init(session, buf, block->allocsize));
- WT_RET(__wt_read(
- session, block->fh, offset, (size_t)block->allocsize, buf->mem));
- blk = WT_BLOCK_HEADER_REF(buf->mem);
+ WT_RET(__wt_scr_alloc(session, block->allocsize, &tmp));
+ WT_ERR(__wt_read(
+ session, block->fh, offset, (size_t)block->allocsize, tmp->mem));
+ blk = WT_BLOCK_HEADER_REF(tmp->mem);
__wt_block_header_byteswap(blk);
- /*
- * Copy out the size and checksum (we're about to re-use the buffer),
- * and if the size isn't insane, read the rest of the block.
- */
- size = blk->disk_size;
- checksum = blk->checksum;
- if (__wt_block_offset_invalid(block, offset, size))
- WT_RET_MSG(session, EINVAL,
- "block at offset %" PRIuMAX " cannot be a valid block, no "
- "read attempted",
- (uintmax_t)offset);
- return (
- __wt_block_read_off(session, block, buf, offset, size, checksum));
+ *sizep = blk->disk_size;
+ *checksump = blk->checksum;
+
+err: __wt_scr_free(session, &tmp);
+ return (ret);
}
#endif
@@ -278,18 +274,20 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
__wt_errx(session,
- "read checksum error for %" PRIu32 "B block at "
+ "%s: read checksum error for %" PRIu32 "B block at "
"offset %" PRIuMAX ": calculated block checksum "
"of %" PRIu32 " doesn't match expected checksum "
"of %" PRIu32,
+ block->name,
size, (uintmax_t)offset, page_checksum, checksum);
} else
if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
__wt_errx(session,
- "read checksum error for %" PRIu32 "B block at "
+ "%s: read checksum error for %" PRIu32 "B block at "
"offset %" PRIuMAX ": block header checksum "
"of %" PRIu32 " doesn't match expected checksum "
"of %" PRIu32,
+ block->name,
size, (uintmax_t)offset, swap.checksum, checksum);
if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
@@ -297,7 +295,9 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
__wt_bm_corrupt_dump(session, buf, offset, size, checksum));
/* Panic if a checksum fails during an ordinary read. */
- return (block->verify ||
- F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ?
- WT_ERROR : __wt_illegal_value(session, block->name));
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
+ if (block->verify || F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
+ return (WT_ERROR);
+ WT_PANIC_RET(
+ session, WT_ERROR, "%s: fatal read error", block->name);
}
diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c
index cf4743009ee..d506af89ab7 100644
--- a/src/third_party/wiredtiger/src/bloom/bloom.c
+++ b/src/third_party/wiredtiger/src/bloom/bloom.c
@@ -302,7 +302,16 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
err: if (c != NULL)
WT_TRET(c->reset(c));
- /* Don't return WT_NOTFOUND from a failed cursor open or search. */
+ /*
+ * Error handling from this function is complex. A search in the
+ * backing bit field should never return WT_NOTFOUND - so translate
+ * that into a different error code and report an error. If we got a
+ * WT_ROLLBACK it may be because there is a lot of cache pressure and
+ * the transaction is being killed - don't report an error message in
+ * that case.
+ */
+ if (ret == WT_ROLLBACK || ret == WT_CACHE_FULL)
+ return (ret);
WT_RET_MSG(bloom->session,
ret == WT_NOTFOUND ? WT_ERROR : ret,
"Failed lookup in bloom filter");
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index 0a11f40d6d4..e20e67c0c13 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -16,7 +16,6 @@ static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
WT_BM *bm;
- WT_DECL_RET;
WT_MULTI *multi;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
@@ -28,13 +27,8 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
bm = S2BT(session)->bm;
page = ref->page;
- mod = page->modify;
- /*
- * If the page is clean, test the original addresses.
- * If the page is a replacement, test the replacement addresses.
- * Ignore empty pages, they get merged into the parent.
- */
+ /* If the page is clean, test the original addresses. */
if (__wt_page_evict_clean(page)) {
__wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr == NULL)
@@ -44,34 +38,31 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
}
/*
- * The page's modification information can change underfoot if the page
- * is being reconciled, serialize with reconciliation.
+ * If the page is a replacement, test the replacement addresses.
+ * Ignore empty pages, they get merged into the parent.
+ *
+ * Page-modify variable initialization done here because the page could
+ * be modified while we're looking at it, so the page modified structure
+ * may appear at any time (but cannot disappear). We've confirmed there
+ * is a page modify structure, it's OK to look at it.
*/
- if (mod->rec_result == WT_PM_REC_REPLACE ||
- mod->rec_result == WT_PM_REC_MULTIBLOCK)
- WT_PAGE_LOCK(session, page);
-
+ mod = page->modify;
if (mod->rec_result == WT_PM_REC_REPLACE)
- ret = bm->compact_page_skip(bm, session,
- mod->mod_replace.addr, mod->mod_replace.size, skipp);
+ return (bm->compact_page_skip(bm, session,
+ mod->mod_replace.addr, mod->mod_replace.size, skipp));
if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
for (multi = mod->mod_multi,
i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
if (multi->addr.addr == NULL)
continue;
- if ((ret = bm->compact_page_skip(bm, session,
- multi->addr.addr, multi->addr.size, skipp)) != 0)
- break;
+ WT_RET(bm->compact_page_skip(bm, session,
+ multi->addr.addr, multi->addr.size, skipp));
if (!*skipp)
break;
}
- if (mod->rec_result == WT_PM_REC_REPLACE ||
- mod->rec_result == WT_PM_REC_MULTIBLOCK)
- WT_PAGE_UNLOCK(session, page);
-
- return (ret);
+ return (0);
}
/*
@@ -98,10 +89,9 @@ __compact_rewrite_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
* There are two ways we call reconciliation: checkpoints and eviction.
* Get the tree's flush lock which blocks threads writing pages for
* checkpoints. If checkpoint is holding the lock, quit working this
- * file, we'll visit it again in our next pass.
- *
- * Serializing with eviction is not quite as simple, and it gets done
- * in the underlying function that checks modification information.
+ * file, we'll visit it again in our next pass. We don't have to worry
+ * about eviction, we're holding a hazard pointer on the WT_REF, it's
+ * not going anywhere.
*/
WT_RET(__wt_spin_trylock(session, &btree->flush_lock));
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 02cceab3123..c9cccc63bf6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -429,7 +429,8 @@ __cursor_key_order_check_row(
WT_ERR(__wt_scr_alloc(session, 512, &b));
WT_PANIC_ERR(session, EINVAL,
- "WT_CURSOR.%s out-of-order returns: returned key %s then key %s",
+ "WT_CURSOR.%s out-of-order returns: returned key %.1024s then "
+ "key %.1024s",
next ? "next" : "prev",
__wt_buf_set_printable_format(session,
cbt->lastkey->data, cbt->lastkey->size, btree->key_format, a),
@@ -456,7 +457,7 @@ __wt_cursor_key_order_check(
return (__cursor_key_order_check_col(session, cbt, next));
case WT_PAGE_ROW_LEAF:
return (__cursor_key_order_check_row(session, cbt, next));
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, cbt->ref->page->type);
}
/* NOTREACHED */
}
@@ -481,7 +482,7 @@ __wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
case WT_PAGE_ROW_LEAF:
return (__wt_buf_set(session,
cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size));
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, cbt->ref->page->type);
}
/* NOTREACHED */
}
@@ -641,7 +642,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
case WT_PAGE_COL_VAR:
ret = __cursor_var_append_next(cbt, newpage);
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
if (ret == 0)
break;
@@ -659,7 +660,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
case WT_PAGE_ROW_LEAF:
ret = __cursor_row_next(cbt, newpage);
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
if (ret != WT_NOTFOUND)
break;
@@ -690,6 +691,8 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
__wt_page_evict_soon(session, cbt->ref);
cbt->page_deleted_count = 0;
+ if (F_ISSET(cbt, WT_CBT_READ_ONCE))
+ LF_SET(WT_READ_WONT_NEED);
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 9b8ca471749..f72b935c441 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -606,7 +606,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
case WT_PAGE_COL_VAR:
ret = __cursor_var_append_prev(cbt, newpage);
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
if (ret == 0)
break;
@@ -626,7 +626,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
case WT_PAGE_ROW_LEAF:
ret = __cursor_row_prev(cbt, newpage);
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
if (ret != WT_NOTFOUND)
break;
@@ -646,6 +646,8 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
__wt_page_evict_soon(session, cbt->ref);
cbt->page_deleted_count = 0;
+ if (F_ISSET(cbt, WT_CBT_READ_ONCE))
+ LF_SET(WT_READ_WONT_NEED);
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 5ddddc5ff6e..886ea0b68f9 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -770,8 +770,12 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
* key, the update doesn't require another search. Cursors configured
* for append aren't included, regardless of whether or not they meet
* all other criteria.
+ *
+ * Fixed-length column store can never use a positioned cursor to update
+ * because the cursor may not be positioned to the correct record in the
+ * case of implicit records in the append list.
*/
- if (__cursor_page_pinned(cbt) &&
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt) &&
F_ISSET(cursor, WT_CURSTD_OVERWRITE) && !append_key) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -823,15 +827,18 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
}
ret = __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD);
- } else {
+ } else if (append_key) {
/*
* Optionally insert a new record (ignoring the application's
* record number). The real record number is allocated by the
* serialized append operation.
*/
- if (append_key)
- cbt->iface.recno = WT_RECNO_OOB;
-
+ cbt->iface.recno = WT_RECNO_OOB;
+ cbt->compare = 1;
+ WT_ERR(__cursor_col_search(session, cbt, NULL));
+ WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD));
+ cursor->recno = cbt->recno;
+ } else {
WT_ERR(__cursor_col_search(session, cbt, NULL));
/*
@@ -850,9 +857,6 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
}
WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD));
-
- if (append_key)
- cursor->recno = cbt->recno;
}
err: if (ret == WT_RESTART) {
@@ -915,17 +919,17 @@ __curfile_update_check(WT_CURSOR_BTREE *cbt)
int
__wt_btcur_insert_check(WT_CURSOR_BTREE *cbt)
{
- WT_BTREE *btree;
WT_CURSOR *cursor;
WT_DECL_RET;
WT_SESSION_IMPL *session;
uint64_t yield_count, sleep_usecs;
cursor = &cbt->iface;
- btree = cbt->btree;
session = (WT_SESSION_IMPL *)cursor->session;
yield_count = sleep_usecs = 0;
+ WT_ASSERT(session, cbt->btree->type == BTREE_ROW);
+
/*
* The pinned page goes away if we do a search, get a local copy of any
* pinned key and discard any pinned value. Unlike most of the btree
@@ -936,14 +940,10 @@ __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt)
__cursor_novalue(cursor);
retry: WT_ERR(__cursor_func_init(cbt, true));
+ WT_ERR(__cursor_row_search(session, cbt, NULL, true));
- if (btree->type == BTREE_ROW) {
- WT_ERR(__cursor_row_search(session, cbt, NULL, true));
-
- /* Just check for conflicts. */
- ret = __curfile_update_check(cbt);
- } else
- WT_ERR(__wt_illegal_value(session, NULL));
+ /* Just check for conflicts. */
+ ret = __curfile_update_check(cbt);
err: if (ret == WT_RESTART) {
__cursor_restart(session, &yield_count, &sleep_usecs);
@@ -1030,8 +1030,12 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
* arguably safe to simply leave the key initialized in the cursor (as
* that's all a positioned cursor implies), but it's probably safer to
* avoid page eviction entirely in the positioned case.
+ *
+ * Fixed-length column store can never use a positioned cursor to update
+ * because the cursor may not be positioned to the correct record in the
+ * case of implicit records in the append list.
*/
- if (__cursor_page_pinned(cbt)) {
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -1208,8 +1212,12 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
* another search. We don't care about the "overwrite" configuration
* because regardless of the overwrite setting, any existing record is
* updated, and the record must exist with a positioned cursor.
+ *
+ * Fixed-length column store can never use a positioned cursor to update
+ * because the cursor may not be positioned to the correct record in the
+ * case of implicit records in the append list.
*/
- if (__cursor_page_pinned(cbt)) {
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -1305,8 +1313,8 @@ done: switch (modify_type) {
/*
* WT_CURSOR.update returns a key and a value.
*/
- WT_TRET(__cursor_kv_return(
- session, cbt, cbt->modify_update));
+ ret = __cursor_kv_return(
+ session, cbt, cbt->modify_update);
break;
case WT_UPDATE_RESERVE:
/*
@@ -1319,13 +1327,11 @@ done: switch (modify_type) {
* WT_CURSOR.modify has already created the return value
* and our job is to leave it untouched.
*/
- WT_TRET(__wt_key_return(session, cbt));
+ ret = __wt_key_return(session, cbt);
break;
case WT_UPDATE_BIRTHMARK:
case WT_UPDATE_TOMBSTONE:
- default:
- WT_TRET(__wt_illegal_value(session, NULL));
- break;
+ WT_ILLEGAL_VALUE(session, modify_type);
}
}
@@ -1420,10 +1426,6 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
/* Save the cursor state. */
__cursor_state_save(cursor, &state);
- if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
- WT_ERR_MSG(session, ENOTSUP,
- "not supported in read-uncommitted transactions");
-
/*
* Get the current value and apply the modification to it, for a few
* reasons: first, we set the updated value so the application can
@@ -1434,7 +1436,23 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
* trouble if we attempt to modify a value that doesn't exist. For the
* fifth reason, verify we're not in a read-uncommitted transaction,
* that implies a value that might disappear out from under us.
+ *
+ * Also, an application might read a value outside of a transaction and
+ * then call modify. For that to work, the read must be part of the
+ * transaction that performs the update for correctness, otherwise we
+ * could race with another thread and end up modifying the wrong value.
+ * A clever application could get this right (imagine threads that only
+ * updated non-overlapping, fixed-length byte strings), but it's unsafe
+ * because it will work most of the time and the failure is unlikely to
+ * be detected. Require explicit transactions for modify operations.
*/
+ if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
+ WT_ERR_MSG(session, ENOTSUP,
+ "not supported in read-uncommitted transactions");
+ if (F_ISSET(&session->txn, WT_TXN_AUTOCOMMIT))
+ WT_ERR_MSG(session, ENOTSUP,
+ "not supported in implicit transactions");
+
if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) ||
!F_ISSET(cursor, WT_CURSTD_VALUE_INT))
WT_ERR(__wt_btcur_search(cbt));
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 566157abd61..47b84ad7a25 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -53,7 +53,7 @@ static int __debug_ref(WT_DBG *, WT_REF *);
static int __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
static int __debug_tree(WT_SESSION_IMPL *, WT_REF *, const char *, uint32_t);
static int __debug_update(WT_DBG *, WT_UPDATE *, bool);
-static int __dmsg_wrapup(WT_DBG *);
+static int __debug_wrapup(WT_DBG *);
/*
* __wt_debug_set_verbose --
@@ -253,24 +253,25 @@ static int
__debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile)
{
WT_BTREE *btree;
+ WT_DECL_RET;
memset(ds, 0, sizeof(WT_DBG));
ds->session = session;
- WT_RET(__wt_scr_alloc(session, 512, &ds->t1));
- WT_RET(__wt_scr_alloc(session, 512, &ds->t2));
+ WT_ERR(__wt_scr_alloc(session, 512, &ds->t1));
+ WT_ERR(__wt_scr_alloc(session, 512, &ds->t2));
/*
* If we weren't given a file, we use the default event handler, and
* we'll have to buffer messages.
*/
if (ofile == NULL) {
- WT_RET(__wt_scr_alloc(session, 512, &ds->msg));
+ WT_ERR(__wt_scr_alloc(session, 512, &ds->msg));
ds->f = __dmsg_event;
} else {
if ((ds->fp = fopen(ofile, "w")) == NULL)
- return (EIO);
+ WT_ERR(__wt_set_return(session, EIO));
__wt_stream_set_line_buffer(ds->fp);
ds->f = __dmsg_file;
}
@@ -279,15 +280,19 @@ __debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile)
ds->key_format = btree->key_format;
ds->value_format = btree->value_format;
return (0);
+
+err: WT_TRET(__debug_wrapup(ds));
+ return (ret);
}
/*
- * __dmsg_wrapup --
+ * __debug_wrapup --
* Flush any remaining output, release resources.
*/
static int
-__dmsg_wrapup(WT_DBG *ds)
+__debug_wrapup(WT_DBG *ds)
{
+ WT_DECL_RET;
WT_ITEM *msg;
WT_SESSION_IMPL *session;
@@ -303,7 +308,7 @@ __dmsg_wrapup(WT_DBG *ds)
*/
if (msg != NULL) {
if (msg->size != 0)
- WT_RET(__wt_msg(session, "%s", (char *)msg->mem));
+ ret = __wt_msg(session, "%s", (char *)msg->mem);
__wt_scr_free(session, &ds->msg);
}
@@ -311,7 +316,7 @@ __dmsg_wrapup(WT_DBG *ds)
if (ds->fp != NULL)
(void)fclose(ds->fp);
- return (0);
+ return (ret);
}
/*
@@ -365,25 +370,18 @@ int
__wt_debug_offset_blind(
WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile)
{
- WT_DECL_ITEM(buf);
- WT_DECL_RET;
+ uint32_t checksum, size;
WT_ASSERT(session, S2BT_SAFE(session) != NULL);
/*
* This routine depends on the default block manager's view of files,
* where an address consists of a file offset, length, and checksum.
- * This is for debugging only. Other block managers might not see a
- * file or address the same way, that's why there's no block manager
- * method.
+ * This is for debugging only.
*/
- WT_RET(__wt_scr_alloc(session, 1024, &buf));
- WT_ERR(__wt_block_read_off_blind(
- session, S2BT(session)->bm->block, buf, offset));
- ret = __wt_debug_disk(session, buf->mem, ofile);
-
-err: __wt_scr_free(session, &buf);
- return (ret);
+ WT_RET(__wt_block_read_off_blind(
+ session, S2BT(session)->bm->block, offset, &size, &checksum));
+ return (__wt_debug_offset(session, offset, size, checksum, ofile));
}
/*
@@ -435,59 +433,61 @@ __wt_debug_disk(
WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
{
WT_DBG *ds, _ds;
+ WT_DECL_RET;
ds = &_ds;
WT_RET(__debug_config(session, ds, ofile));
- WT_RET(ds->f(ds, "%s page", __wt_page_type_string(dsk->type)));
+ WT_ERR(ds->f(ds, "%s page", __wt_page_type_string(dsk->type)));
switch (dsk->type) {
case WT_PAGE_BLOCK_MANAGER:
break;
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
- WT_RET(ds->f(ds, ", recno %" PRIu64, dsk->recno));
+ WT_ERR(ds->f(ds, ", recno %" PRIu64, dsk->recno));
/* FALLTHROUGH */
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- WT_RET(ds->f(ds, ", entries %" PRIu32, dsk->u.entries));
+ WT_ERR(ds->f(ds, ", entries %" PRIu32, dsk->u.entries));
break;
case WT_PAGE_OVFL:
- WT_RET(ds->f(ds, ", datalen %" PRIu32, dsk->u.datalen));
+ WT_ERR(ds->f(ds, ", datalen %" PRIu32, dsk->u.datalen));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE_ERR(session, dsk->type);
}
if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
- WT_RET(ds->f(ds, ", compressed"));
+ WT_ERR(ds->f(ds, ", compressed"));
if (F_ISSET(dsk, WT_PAGE_ENCRYPTED))
- WT_RET(ds->f(ds, ", encrypted"));
+ WT_ERR(ds->f(ds, ", encrypted"));
if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
- WT_RET(ds->f(ds, ", empty-all"));
+ WT_ERR(ds->f(ds, ", empty-all"));
if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
- WT_RET(ds->f(ds, ", empty-none"));
+ WT_ERR(ds->f(ds, ", empty-none"));
if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE))
- WT_RET(ds->f(ds, ", LAS-update"));
+ WT_ERR(ds->f(ds, ", LAS-update"));
- WT_RET(ds->f(ds, ", generation %" PRIu64 "\n", dsk->write_gen));
+ WT_ERR(ds->f(ds, ", generation %" PRIu64 "\n", dsk->write_gen));
switch (dsk->type) {
case WT_PAGE_BLOCK_MANAGER:
break;
case WT_PAGE_COL_FIX:
- WT_RET(__debug_dsk_col_fix(ds, dsk));
+ WT_ERR(__debug_dsk_col_fix(ds, dsk));
break;
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- WT_RET(__debug_dsk_cell(ds, dsk));
+ WT_ERR(__debug_dsk_cell(ds, dsk));
break;
default:
break;
}
- return (__dmsg_wrapup(ds));
+err: WT_TRET(__debug_wrapup(ds));
+ return (ret);
}
/*
@@ -620,9 +620,9 @@ __wt_debug_tree_shape(
WT_WITH_PAGE_INDEX(session,
ret = __debug_tree_shape_worker(ds, page, 1));
- WT_RET(ret);
- return (__dmsg_wrapup(ds));
+ WT_TRET(__debug_wrapup(ds));
+ return (ret);
}
/* AUTOMATIC FLAG VALUE GENERATION START */
@@ -705,7 +705,7 @@ __wt_debug_page(
WT_WITH_BTREE(session, btree,
ret = __debug_page(ds, ref, WT_DEBUG_TREE_LEAF));
- WT_TRET(__dmsg_wrapup(ds));
+ WT_TRET(__debug_wrapup(ds));
return (ret);
}
@@ -744,7 +744,7 @@ __debug_tree(
ret = __debug_page(ds, ref, flags);
- WT_TRET(__dmsg_wrapup(ds));
+ WT_TRET(__debug_wrapup(ds));
return (ret);
}
@@ -788,7 +788,7 @@ __debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags)
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
WT_RET(__debug_page_row_leaf(ds, ref->page));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, ref->page->type);
}
return (0);
@@ -805,11 +805,13 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_PAGE_INDEX *pindex;
WT_PAGE_MODIFY *mod;
WT_SESSION_IMPL *session;
+ uint64_t split_gen;
uint32_t entries;
session = ds->session;
page = ref->page;
mod = page->modify;
+ split_gen = 0;
WT_RET(ds->f(ds, "%p", (void *)ref));
@@ -818,6 +820,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno));
WT_INTL_INDEX_GET(session, page, pindex);
entries = pindex->entries;
+ split_gen = page->pg_intl_split_gen;
break;
case WT_PAGE_COL_FIX:
WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno));
@@ -830,11 +833,12 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
case WT_PAGE_ROW_INT:
WT_INTL_INDEX_GET(session, page, pindex);
entries = pindex->entries;
+ split_gen = page->pg_intl_split_gen;
break;
case WT_PAGE_ROW_LEAF:
entries = page->entries;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, page->type);
}
WT_RET(ds->f(ds, ": %s\n", __wt_page_type_string(page->type)));
@@ -845,8 +849,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_RET(ds->f(ds, ", entries %" PRIu32, entries));
WT_RET(ds->f(ds,
", %s", __wt_page_is_modified(page) ? "dirty" : "clean"));
- WT_RET(ds->f(ds,
- ", memory_size %" WT_SIZET_FMT, page->memory_footprint));
if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
WT_RET(ds->f(ds, ", keys-built"));
@@ -876,11 +878,14 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
break;
case 0:
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, mod->rec_result);
}
+ if (split_gen != 0)
+ WT_RET(ds->f(ds, ", split-gen=%" PRIu64, split_gen));
if (mod != NULL)
- WT_RET(
- ds->f(ds, ", write generation=%" PRIu32, mod->write_gen));
+ WT_RET(ds->f(ds, ", write-gen=%" PRIu32, mod->write_gen));
+ WT_RET(ds->f(ds,
+ ", memory-size %" WT_SIZET_FMT, page->memory_footprint));
WT_RET(ds->f(ds, "\n"));
return (0);
@@ -1393,7 +1398,7 @@ __debug_cell_data(WT_DBG *ds,
case WT_CELL_VALUE_SHORT:
WT_ERR(__debug_item_value(ds, tag, buf->data, buf->size));
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, unpack->raw);
}
err: __wt_scr_free(session, &buf);
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index a2f9afaf409..6f2b28d296a 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -81,7 +81,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
}
(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
- ret = __wt_evict(session, ref, false);
+ ret = __wt_evict(session, ref, false, previous_state);
(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
WT_RET_BUSY_OK(ret);
ret = 0;
@@ -209,9 +209,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
case WT_REF_LIMBO:
case WT_REF_LOOKASIDE:
case WT_REF_READING:
- default:
- return (__wt_illegal_value(session,
- "illegal WT_REF.state rolling back deleted page"));
+ WT_ILLEGAL_VALUE(session, current_state);
}
if (locked)
@@ -359,13 +357,12 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* Give the page a modify structure.
*
- * If the tree is already dirty and so will be written, mark the page
- * dirty. (We'd like to free the deleted pages, but if the handle is
- * read-only or if the application never modifies the tree, we're not
- * able to do so.)
+ * Mark tree dirty, unless the handle is read-only.
+ * (We'd like to free the deleted pages, but if the handle is read-only,
+ * we're not able to do so.)
*/
WT_RET(__wt_page_modify_init(session, page));
- if (btree->modified)
+ if (!F_ISSET(btree, WT_BTREE_READONLY))
__wt_page_modify_set(session, page);
if (ref->page_del != NULL &&
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index d31f76f629c..0d49adc19ca 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -32,29 +32,14 @@ __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
*/
WT_ASSERT(session, S2BT(session)->evict_ref != ref);
-#ifdef HAVE_DIAGNOSTIC
- {
- WT_HAZARD *hp;
- int i;
/*
* Make sure no other thread has a hazard pointer on the page we are
* about to discard. This is complicated by the fact that readers
* publish their hazard pointer before re-checking the page state, so
* our check can race with readers without indicating a real problem.
- * Wait for up to a second for hazard pointers to be cleared.
+ * If we find a hazard pointer, wait for it to be cleared.
*/
- for (hp = NULL, i = 0; i < 100; i++) {
- if ((hp = __wt_hazard_check(session, ref)) == NULL)
- break;
- __wt_sleep(0, 10000);
- }
- if (hp != NULL)
- __wt_errx(session,
- "discarded page has hazard pointer: (%p: %s, line %d)",
- (void *)hp->ref, hp->file, hp->line);
- WT_ASSERT(session, hp == NULL);
- }
-#endif
+ WT_ASSERT(session, __wt_hazard_check_assert(session, ref, true));
__wt_page_out(session, &ref->page);
}
@@ -263,6 +248,9 @@ __wt_free_ref(
if (ref == NULL)
return;
+ /* Assert there are no hazard pointers. */
+ WT_ASSERT(session, __wt_hazard_check_assert(session, ref, false));
+
/*
* Optionally free the referenced pages. (The path to free referenced
* page is used for error cleanup, no instantiated and then discarded
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index e6f8bad8e31..5d9609c3a52 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -68,7 +68,7 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
size_t root_addr_size;
uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
const char *filename;
- bool creation, forced_salvage, readonly;
+ bool creation, forced_salvage;
btree = S2BT(session);
dhandle = session->dhandle;
@@ -86,9 +86,10 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
/* Set the data handle first, our called functions reasonably use it. */
btree->dhandle = dhandle;
- /* Checkpoint files are readonly. */
- readonly = dhandle->checkpoint != NULL ||
- F_ISSET(S2C(session), WT_CONN_READONLY);
+ /* Checkpoint and verify files are readonly. */
+ if (dhandle->checkpoint != NULL || F_ISSET(btree, WT_BTREE_VERIFY) ||
+ F_ISSET(S2C(session), WT_CONN_READONLY))
+ F_SET(btree, WT_BTREE_READONLY);
/* Get the checkpoint information for this name/checkpoint pair. */
WT_CLEAR(ckpt);
@@ -120,7 +121,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
WT_ERR(__wt_block_manager_open(session, filename, dhandle->cfg,
- forced_salvage, readonly, btree->allocsize, &btree->bm));
+ forced_salvage, F_ISSET(btree, WT_BTREE_READONLY),
+ btree->allocsize, &btree->bm));
bm = btree->bm;
/*
@@ -150,7 +152,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
*/
WT_ERR(bm->checkpoint_load(bm, session,
ckpt.raw.data, ckpt.raw.size,
- root_addr, &root_addr_size, readonly));
+ root_addr, &root_addr_size,
+ F_ISSET(btree, WT_BTREE_READONLY)));
if (creation || root_addr_size == 0)
WT_ERR(__btree_tree_open_empty(session, creation));
else {
@@ -447,6 +450,48 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
WT_RET(__wt_compressor_config(session, &cval, &btree->compressor));
/*
+ * Configure compression adjustment.
+ * When doing compression, assume compression rates that will result in
+ * pages larger than the maximum in-memory images allowed. If we're
+ * wrong, we adjust downward (but we're almost certainly correct, the
+ * maximum in-memory images allowed are only 4x the maximum page size,
+ * and compression always gives us more than 4x).
+ * Don't do compression adjustment for fixed-size column store, the
+ * leaf page sizes don't change. (We could adjust internal pages but not
+ * internal pages, but that seems an unlikely use case.)
+ * XXX
+ * Don't do compression adjustment of snappy-compressed blocks.
+ */
+ btree->intlpage_compadjust = false;
+ btree->maxintlpage_precomp = btree->maxintlpage;
+ btree->leafpage_compadjust = false;
+ btree->maxleafpage_precomp = btree->maxleafpage;
+ if (btree->compressor != NULL && btree->compressor->compress != NULL &&
+ !WT_STRING_MATCH("snappy", cval.str, cval.len) &&
+ btree->type != BTREE_COL_FIX) {
+ /*
+ * Don't do compression adjustment when on-disk page sizes are
+ * less than 16KB. There's not enough compression going on to
+ * fine-tune the size, all we end up doing is hammering shared
+ * memory.
+ *
+ * Don't do compression adjustment when on-disk page sizes are
+ * equal to the maximum in-memory page image, the bytes taken
+ * for compression can't grow past the base value.
+ */
+ if (btree->maxintlpage >= 16 * 1024 &&
+ btree->maxmempage_image > btree->maxintlpage) {
+ btree->intlpage_compadjust = true;
+ btree->maxintlpage_precomp = btree->maxmempage_image;
+ }
+ if (btree->maxleafpage >= 16 * 1024 &&
+ btree->maxmempage_image > btree->maxleafpage) {
+ btree->leafpage_compadjust = true;
+ btree->maxleafpage_precomp = btree->maxmempage_image;
+ }
+ }
+
+ /*
* We do not use __wt_config_gets_none here because "none" and the empty
* string have different meanings. The empty string means inherit the
* system encryption setting and "none" means this table is in the clear
@@ -476,7 +521,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
btree->modified = false; /* Clean */
- btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */
+ btree->syncing = WT_BTREE_SYNC_OFF; /* Not syncing */
btree->write_gen = ckpt->write_gen; /* Write generation */
btree->checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT);
@@ -753,7 +798,7 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_CONFIG_ITEM cval;
WT_CONNECTION_IMPL *conn;
uint64_t cache_size;
- uint32_t intl_split_size, leaf_split_size;
+ uint32_t intl_split_size, leaf_split_size, max;
const char **cfg;
btree = S2BT(session);
@@ -787,6 +832,22 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
"size (%" PRIu32 "B)", btree->allocsize);
/*
+ * Default in-memory page image size for compression is 4x the maximum
+ * internal or leaf page size, and enforce the on-disk page sizes as a
+ * lower-limit for the in-memory image size.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "memory_page_image_max", &cval));
+ btree->maxmempage_image = (uint32_t)cval.val;
+ max = WT_MAX(btree->maxintlpage, btree->maxleafpage);
+ if (btree->maxmempage_image == 0)
+ btree->maxmempage_image = 4 * max;
+ else if (btree->maxmempage_image < max)
+ WT_RET_MSG(session, EINVAL,
+ "in-memory page image size must be larger than the maximum "
+ "page size (%" PRIu32 "B < %" PRIu32 "B)",
+ btree->maxmempage_image, max);
+
+ /*
* Don't let pages grow large compared to the cache size or we can end
* up in a situation where nothing can be evicted. Make sure at least
* 10 pages fit in cache when it is at the dirty trigger where threads
@@ -811,7 +872,7 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
* size. This gives multi-threaded append workloads a better chance of
* not stalling.
*/
- btree->splitmempage = 8 * btree->maxmempage / 10;
+ btree->splitmempage = (8 * btree->maxmempage) / 10;
/*
* Get the split percentage (reconciliation splits pages into smaller
@@ -827,8 +888,10 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
"%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT));
} else
btree->split_pct = (int)cval.val;
- intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
- leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
+ intl_split_size = __wt_split_page_size(
+ btree->split_pct, btree->maxintlpage, btree->allocsize);
+ leaf_split_size = __wt_split_page_size(
+ btree->split_pct, btree->maxleafpage, btree->allocsize);
/*
* In-memory split configuration.
@@ -893,11 +956,11 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
* reset it to the default.
*/
if (btree->maxintlkey == 0 || btree->maxintlkey > intl_split_size / 10)
- btree->maxintlkey = intl_split_size / 10;
+ btree->maxintlkey = intl_split_size / 10;
if (btree->maxleafkey == 0)
- btree->maxleafkey = leaf_split_size / 10;
+ btree->maxleafkey = leaf_split_size / 10;
if (btree->maxleafvalue == 0)
- btree->maxleafvalue = leaf_split_size / 2;
+ btree->maxleafvalue = leaf_split_size / 2;
return (0);
}
@@ -915,10 +978,11 @@ __wt_btree_immediately_durable(WT_SESSION_IMPL *session)
/*
* This is used to determine whether timestamp updates should
- * be rolled back for this btree. It's likely that the particular
- * test required here will change when rollback to stable is
- * supported with in-memory configurations.
+ * be rolled back for this btree. With in-memory, the logging
+ * setting on tables is still important and when enabled they
+ * should be considered "durable".
*/
- return (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) &&
+ return ((FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) ||
+ (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))) &&
!F_ISSET(btree, WT_BTREE_NO_LOGGING));
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c
index 1379553c211..7e7909eed9f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_io.c
+++ b/src/third_party/wiredtiger/src/btree/bt_io.c
@@ -151,12 +151,13 @@ __wt_bt_read(WT_SESSION_IMPL *session,
if (0) {
corrupt: if (ret == 0)
ret = WT_ERROR;
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
- __wt_err(session, ret, "%s", fail_msg);
WT_TRET(bm->corrupt(bm, session, addr, addr_size));
- WT_TRET(
- __wt_illegal_value(session, btree->dhandle->name));
+ WT_PANIC_ERR(session, ret,
+ "%s: fatal read error: %s",
+ btree->dhandle->name, fail_msg);
}
}
@@ -172,7 +173,7 @@ err: __wt_scr_free(session, &tmp);
*/
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
- uint8_t *addr, size_t *addr_sizep,
+ uint8_t *addr, size_t *addr_sizep, size_t *compressed_sizep,
bool checkpoint, bool checkpoint_io, bool compressed)
{
WT_BM *bm;
@@ -189,6 +190,9 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
int compression_failed; /* Extension API, so not a bool. */
bool data_checksum, encrypted, timer;
+ if (compressed_sizep != NULL)
+ *compressed_sizep = 0;
+
btree = S2BT(session);
bm = btree->bm;
encrypted = false;
@@ -306,6 +310,10 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
memcpy(ctmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
ctmp->size = result_len;
ip = ctmp;
+
+ /* Optionally return the compressed size. */
+ if (compressed_sizep != NULL)
+ *compressed_sizep = result_len;
}
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index ce0ee706923..6032364fff7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -241,7 +241,7 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
__wt_cell_type_reset(session,
unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, unpack->raw);
}
__wt_writeunlock(session, &btree->ovfl_lock);
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 612540956b7..e3f5d64deb9 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -57,7 +57,7 @@ __wt_page_alloc(WT_SESSION_IMPL *session,
*/
size += alloc_entries * sizeof(WT_ROW);
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, type);
}
WT_RET(__wt_calloc(session, 1, size, &page));
@@ -112,7 +112,7 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) {
NULL : (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE));
page->entries = alloc_entries;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, type);
}
/* Increment the cache statistics. */
@@ -186,7 +186,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session,
WT_RET(__inmem_row_leaf_entries(
session, dsk, &alloc_entries));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, dsk->type);
}
/* Allocate and initialize a new WT_PAGE. */
@@ -222,7 +222,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session,
case WT_PAGE_ROW_LEAF:
WT_ERR(__inmem_row_leaf(session, page));
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
/* Update the page's cache statistics. */
@@ -503,7 +503,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
ref->addr = cell;
++refp;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, unpack->type);
}
}
@@ -556,7 +556,7 @@ __inmem_row_leaf_entries(
case WT_CELL_VALUE:
case WT_CELL_VALUE_OVFL:
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, unpack->type);
}
}
@@ -614,7 +614,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
break;
case WT_CELL_VALUE_OVFL:
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, unpack->type);
}
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index 17497561248..ed68513b245 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -169,27 +169,27 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
* Find a random page in a tree for either sampling or eviction.
*/
int
-__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction)
+__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *current, *descent;
- uint32_t flags, i, entries, retry;
+ uint32_t i, entries, retry;
+ bool eviction;
*refp = NULL;
btree = S2BT(session);
current = NULL;
retry = 100;
-
- /* Eviction should not be tapped to do eviction. */
- if (eviction)
- flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN |
- WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK;
- else
- flags = WT_READ_RESTART_OK;
+ /*
+ * This function is called by eviction to find a random page in the
+ * cache. That case is indicated by the WT_READ_CACHE flag. Ordinary
+ * lookups in a tree will read pages into cache as needed.
+ */
+ eviction = LF_ISSET(WT_READ_CACHE);
if (0) {
restart: /*
@@ -262,7 +262,7 @@ restart: /*
* holding nothing on failure.
*/
descend: if ((ret = __wt_page_swap(
- session, current, descent, false, flags)) == 0) {
+ session, current, descent, flags)) == 0) {
current = descent;
continue;
}
@@ -302,11 +302,15 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_UPDATE *upd;
wt_off_t size;
uint64_t n, skip;
+ uint32_t read_flags;
bool valid;
btree = cbt->btree;
cursor = &cbt->iface;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ read_flags = WT_READ_RESTART_OK;
+ if (F_ISSET(cbt, WT_CBT_READ_ONCE))
+ FLD_SET(read_flags, WT_READ_WONT_NEED);
/*
* Only supports row-store: applications can trivially select a random
@@ -337,7 +341,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
WT_ERR(__cursor_func_init(cbt, true));
WT_WITH_PAGE_INDEX(session,
- ret = __wt_random_descent(session, &cbt->ref, false));
+ ret = __wt_random_descent(session, &cbt->ref, read_flags));
if (ret == 0)
goto random_page_entry;
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 9e530be4f0e..0d0cf17762c 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -127,7 +127,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
uint64_t current_recno, las_counter, las_pageid, las_txnid, recno;
uint32_t las_id, session_flags;
const uint8_t *p;
- uint8_t upd_type;
+ uint8_t prepare_state, upd_type;
bool locked;
cursor = NULL;
@@ -180,12 +180,14 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
break;
/* Allocate the WT_UPDATE structure. */
- WT_ERR(cursor->get_value(cursor,
- &las_txnid, &las_timestamp, &upd_type, &las_value));
+ WT_ERR(cursor->get_value(
+ cursor, &las_txnid, &las_timestamp,
+ &prepare_state, &upd_type, &las_value));
WT_ERR(__wt_update_alloc(
session, &las_value, &upd, &incr, upd_type));
total_incr += incr;
upd->txnid = las_txnid;
+ upd->prepare_state = prepare_state;
#ifdef HAVE_TIMESTAMPS
WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
memcpy(&upd->timestamp, las_timestamp.data, las_timestamp.size);
@@ -221,7 +223,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ERR(__wt_buf_set(session,
current_key, las_key.data, las_key.size));
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
/* Append the latest update to the list. */
@@ -251,7 +253,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
current_key, ref, &cbt, first_upd));
first_upd = NULL;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
/* Discard the cursor. */
@@ -276,13 +278,15 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
*/
page->modify->first_dirty_txn = WT_TXN_FIRST;
- if (ref->page_las->las_skew_newest &&
+ FLD_SET(page->modify->restore_state, WT_PAGE_RS_LOOKASIDE);
+
+ if (ref->page_las->skew_newest &&
!S2C(session)->txn_global.has_stable_timestamp &&
- __wt_txn_visible_all(session, ref->page_las->las_max_txn,
- WT_TIMESTAMP_NULL(&ref->page_las->onpage_timestamp))) {
- page->modify->rec_max_txn = ref->page_las->las_max_txn;
+ __wt_txn_visible_all(session, ref->page_las->unstable_txn,
+ WT_TIMESTAMP_NULL(&ref->page_las->unstable_timestamp))) {
+ page->modify->rec_max_txn = ref->page_las->max_txn;
__wt_timestamp_set(&page->modify->rec_max_timestamp,
- &ref->page_las->onpage_timestamp);
+ &ref->page_las->max_timestamp);
__wt_page_modify_clear(session, page);
}
}
@@ -365,6 +369,43 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
+ * __page_read_lookaside --
+ * Figure out whether to instantiate content from lookaside on
+ * page access.
+ */
+static inline int
+__page_read_lookaside(WT_SESSION_IMPL *session,
+ WT_REF *ref, uint32_t previous_state, uint32_t *final_statep)
+{
+ /*
+ * Reading a lookaside ref for the first time, and not requiring the
+ * history triggers a transition to WT_REF_LIMBO, if we are already
+ * in limbo and still don't need the history - we are done.
+ */
+ if (__wt_las_page_skip_locked(session, ref)) {
+ if (previous_state == WT_REF_LOOKASIDE) {
+ WT_STAT_CONN_INCR(
+ session, cache_read_lookaside_skipped);
+ ref->page_las->eviction_to_lookaside = true;
+ *final_statep = WT_REF_LIMBO;
+ }
+ return (0);
+ }
+
+ /* Instantiate updates from the database's lookaside table. */
+ if (previous_state == WT_REF_LIMBO) {
+ WT_STAT_CONN_INCR(session, cache_read_lookaside_delay);
+ if (WT_SESSION_IS_CHECKPOINT(session))
+ WT_STAT_CONN_INCR(session,
+ cache_read_lookaside_delay_checkpoint);
+ }
+
+ WT_RET(__las_page_instantiate(session, ref));
+ ref->page_las->eviction_to_lookaside = false;
+ return (0);
+}
+
+/*
* __page_read --
* Read a page from the file.
*/
@@ -492,37 +533,27 @@ skip_read:
/* Move all records to a deleted state. */
WT_ERR(__wt_delete_page_instantiate(session, ref));
break;
- case WT_REF_LOOKASIDE:
- if (__wt_las_page_skip_locked(session, ref)) {
- WT_STAT_CONN_INCR(
- session, cache_read_lookaside_skipped);
- ref->page_las->eviction_to_lookaside = true;
- final_state = WT_REF_LIMBO;
- break;
- }
- /* FALLTHROUGH */
case WT_REF_LIMBO:
- /* Instantiate updates from the database's lookaside table. */
- if (previous_state == WT_REF_LIMBO) {
- WT_STAT_CONN_INCR(session, cache_read_lookaside_delay);
- if (WT_SESSION_IS_CHECKPOINT(session))
- WT_STAT_CONN_INCR(session,
- cache_read_lookaside_delay_checkpoint);
- }
-
- WT_ERR(__las_page_instantiate(session, ref));
- ref->page_las->eviction_to_lookaside = false;
+ case WT_REF_LOOKASIDE:
+ WT_ERR(__page_read_lookaside(
+ session, ref, previous_state, &final_state));
break;
}
/*
- * We no longer need lookaside entries once the page is instantiated.
- * There's no reason for the lookaside remove to fail, but ignore it
- * if for some reason it fails, we've got a valid page.
+ * Once the page is instantiated, we no longer need the history in
+ * lookaside. We leave the lookaside sweep thread to do most cleanup,
+ * but it can only remove keys that skew newest (if there are entries
+ * in the lookaside newer than the page, they need to be read back into
+ * cache or they will be lost).
+ *
+ * There is no reason for the lookaside remove should fail, but ignore
+ * it if for some reason it fails, we've got a valid page.
*
* Don't free WT_REF.page_las, there may be concurrent readers.
*/
- if (final_state == WT_REF_MEM && ref->page_las != NULL)
+ if (final_state == WT_REF_MEM &&
+ ref->page_las != NULL && !ref->page_las->skew_newest)
WT_IGNORE_RET(__wt_las_remove_block(
session, ref->page_las->las_pageid, false));
@@ -551,7 +582,7 @@ err: /*
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
+ , const char *func, int line
#endif
)
{
@@ -681,7 +712,7 @@ read: /*
*/
#ifdef HAVE_DIAGNOSTIC
WT_RET(
- __wt_hazard_set(session, ref, &busy, file, line));
+ __wt_hazard_set(session, ref, &busy, func, line));
#else
WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
@@ -786,7 +817,7 @@ skip_evict: /*
return (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE) &&
!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE) ?
0 : __wt_txn_autocommit_check(session));
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, current_state);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c
index 17adcdd6da6..a509bbb88bc 100644
--- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c
+++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c
@@ -235,7 +235,7 @@ __rebalance_col_walk(
unpack.type == WT_CELL_ADDR_LEAF ?
WT_ADDR_LEAF : WT_ADDR_LEAF_NO, rs));
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, unpack.type);
}
}
@@ -386,7 +386,7 @@ __rebalance_row_walk(
first_cell = false;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, unpack.type);
}
}
@@ -440,7 +440,7 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(
__rebalance_col_walk(session, btree->root.page->dsk, rs));
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, rs->type);
}
/* Build a new root page. */
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index 54f4eaa8f52..475b026ddbf 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -259,7 +259,7 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
* fixed-length format ranges to overlap during salvage, and I don't
* want to have to retrofit the code later.
*/
- qsort(ss->pages,
+ __wt_qsort(ss->pages,
(size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_key);
if (ss->page_type == WT_PAGE_ROW_LEAF)
WT_ERR(__slvg_row_range(session, ss));
@@ -328,7 +328,7 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
*/
if (ss->root_ref.page != NULL) {
btree->ckpt = ckptbase;
- ret = __wt_evict(session, &ss->root_ref, true);
+ ret = __wt_evict(session, &ss->root_ref, true, WT_REF_MEM);
ss->root_ref.page = NULL;
btree->ckpt = NULL;
}
@@ -1093,7 +1093,7 @@ __slvg_col_trk_update_start(uint32_t slot, WT_STUFF *ss)
}
i -= slot;
if (i > 1)
- qsort(ss->pages + slot, (size_t)i,
+ __wt_qsort(ss->pages + slot, (size_t)i,
sizeof(WT_TRACK *), __slvg_trk_compare_key);
}
@@ -1300,7 +1300,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
ret = __wt_page_release(session, ref, 0);
if (ret == 0)
- ret = __wt_evict(session, ref, true);
+ ret = __wt_evict(session, ref, true, WT_REF_MEM);
if (0) {
err: WT_TRET(__wt_page_release(session, ref, 0));
@@ -1770,7 +1770,8 @@ __slvg_row_trk_update_start(
* would have discarded it, we wouldn't be here. Therefore, this test
* is safe. (But, it never hurts to check.)
*/
- WT_ERR_TEST(!found, WT_ERROR);
+ if (!found)
+ WT_ERR_MSG(session, WT_ERROR, "expected on-page key not found");
WT_ERR(__slvg_key_copy(session, &trk->row_start, key));
/*
@@ -1789,7 +1790,7 @@ __slvg_row_trk_update_start(
}
i -= slot;
if (i > 1)
- qsort(ss->pages + slot, (size_t)i,
+ __wt_qsort(ss->pages + slot, (size_t)i,
sizeof(WT_TRACK *), __slvg_trk_compare_key);
err: if (page != NULL)
@@ -2018,7 +2019,7 @@ __slvg_row_build_leaf(
*/
ret = __wt_page_release(session, ref, 0);
if (ret == 0)
- ret = __wt_evict(session, ref, true);
+ ret = __wt_evict(session, ref, true, WT_REF_MEM);
if (0) {
err: WT_TRET(__wt_page_release(session, ref, 0));
@@ -2159,13 +2160,12 @@ __slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss)
* If an overflow page is referenced more than once, discard leaf pages
* with the lowest LSNs until overflow pages are only referenced once.
*
- * This requires sorting the page list by LSN, and the overflow array
-
- * by address cookie.
+ * This requires sorting the page list by LSN, and the overflow array by
+ * address cookie.
*/
- qsort(ss->pages,
+ __wt_qsort(ss->pages,
(size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen);
- qsort(ss->ovfl,
+ __wt_qsort(ss->ovfl,
(size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr);
/*
@@ -2350,7 +2350,7 @@ __slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, bool multi_panic)
{
if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) {
if (!multi_panic)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
WT_PANIC_RET(session, EINVAL,
"overflow record unexpectedly referenced multiple times "
"during leaf page merge");
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 7061b92af78..42d232fc7b4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -31,24 +31,6 @@ typedef enum {
} WT_SPLIT_ERROR_PHASE;
/*
- * __page_split_timing_stress --
- * Optionally add delay to simulate the race conditions in page split for
- * debug purposes. The purpose is to uncover the race conditions in page split.
- */
-static void
-__page_split_timing_stress(
- WT_SESSION_IMPL *session, uint64_t flag, uint64_t micro_seconds)
-{
- WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
-
- /* We only want to sleep when page split race flag is set. */
- if (FLD_ISSET(conn->timing_stress_flags, flag))
- __wt_sleep(0, micro_seconds);
-}
-
-/*
* __split_safe_free --
* Free a buffer if we can be sure no thread is accessing it, or schedule
* it to be freed otherwise.
@@ -85,8 +67,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ITEM *next, _next, *last, _last, *tmp;
WT_REF *ref;
uint64_t recno;
+ uint32_t slot;
int cmp;
- bool first;
btree = S2BT(session);
@@ -106,20 +88,19 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
last = &_last;
WT_CLEAR(_last);
- first = true;
+ slot = 0;
WT_INTL_FOREACH_BEGIN(session, page, ref) {
WT_ASSERT(session, ref->home == page);
+ /*
+ * Don't compare the first slot with any other slot,
+ * it's ignored on row-store internal pages.
+ */
__wt_ref_key(page, ref, &next->data, &next->size);
- if (last->size == 0) {
- if (first)
- first = false;
- else {
- WT_ASSERT(session, __wt_compare(
- session, btree->collator, last,
- next, &cmp) == 0);
- WT_ASSERT(session, cmp < 0);
- }
+ if (++slot > 2) {
+ WT_ASSERT(session, __wt_compare(session,
+ btree->collator, last, next, &cmp) == 0);
+ WT_ASSERT(session, cmp < 0);
}
tmp = last;
last = next;
@@ -205,19 +186,8 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
cell = WT_PAGE_REF_OFFSET(page, cell_offset);
__wt_cell_unpack(cell, &kpack);
- if (kpack.ovfl && kpack.raw != WT_CELL_KEY_OVFL_RM) {
- /*
- * Eviction cannot free overflow items once a checkpoint is
- * running in a tree: that can corrupt the checkpoint's block
- * management. Assert that checkpoints aren't running to make
- * sure we're catching all paths and to avoid regressions.
- */
- WT_ASSERT(session,
- S2BT(session)->checkpointing != WT_CKPT_RUNNING ||
- WT_SESSION_IS_CHECKPOINT(session));
-
+ if (kpack.ovfl && kpack.raw != WT_CELL_KEY_OVFL_RM)
WT_RET(__wt_ovfl_discard(session, cell));
- }
return (0);
}
@@ -305,7 +275,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
case WT_CELL_ADDR_LEAF_NO:
addr->type = WT_ADDR_LEAF_NO;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, unpack.raw);
}
if (__wt_atomic_cas_ptr(&ref->addr, ref_addr, addr))
addr = NULL;
@@ -467,7 +437,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
children = pindex->entries / btree->split_deepen_per_child;
if (children < 10) {
if (pindex->entries < 100)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
children = 10;
}
chunk = pindex->entries / children;
@@ -566,8 +536,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__split_ref_prepare(session, alloc_index, &locked, false));
/* Encourage a race */
- __page_split_timing_stress(
- session, WT_TIMING_STRESS_SPLIT_1, TIMING_STRESS_TEST_SLEEP);
+ __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_1);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -578,8 +547,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
alloc_index = NULL;
/* Encourage a race */
- __page_split_timing_stress(
- session, WT_TIMING_STRESS_SPLIT_2, TIMING_STRESS_TEST_SLEEP);
+ __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_2);
/*
* Get a generation for this split, mark the root page. This must be
@@ -657,6 +625,7 @@ static int
__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
uint32_t new_entries, size_t parent_incr, bool exclusive, bool discard)
{
+ WT_BTREE *btree;
WT_DECL_ITEM(scr);
WT_DECL_RET;
WT_IKEY *ikey;
@@ -671,6 +640,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
uint32_t hint, i, j;
bool empty_parent;
+ btree = S2BT(session);
parent = ref->home;
alloc_index = pindex = NULL;
@@ -690,17 +660,23 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
parent_entries = pindex->entries;
/*
- * Remove any refs to deleted pages while we are splitting, we have
- * the internal page locked down, and are copying the refs into a new
- * array anyway. Switch them to the special split state, so that any
- * reading thread will restart.
+ * Remove any refs to deleted pages while we are splitting, we have the
+ * internal page locked down, and are copying the refs into a new array
+ * anyway. Switch them to the special split state, so that any reading
+ * thread will restart.
+ *
+ * We can't do this if there is a sync running in the tree in another
+ * session: removing the refs frees the blocks for the deleted pages,
+ * which can corrupt the free list calculated by the sync.
*/
WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
if ((discard && next_ref == ref) ||
- (next_ref->state == WT_REF_DELETED &&
+ ((!WT_BTREE_SYNCING(btree) ||
+ WT_SESSION_BTREE_SYNC(session)) &&
+ next_ref->state == WT_REF_DELETED &&
__wt_delete_page_skip(session, next_ref, true) &&
__wt_atomic_casv32(
&next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))) {
@@ -772,8 +748,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_NOT_READ(complete, WT_ERR_PANIC);
/* Encourage a race */
- __page_split_timing_stress(
- session, WT_TIMING_STRESS_SPLIT_3, TIMING_STRESS_TEST_SLEEP);
+ __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_3);
/*
* Confirm the parent page's index hasn't moved then update it, which
@@ -784,8 +759,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
alloc_index = NULL;
/* Encourage a race */
- __page_split_timing_stress(
- session, WT_TIMING_STRESS_SPLIT_4, TIMING_STRESS_TEST_SLEEP);
+ __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_4);
/*
* Get a generation for this split, mark the page. This must be after
@@ -894,6 +868,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* Free the backing block and address. */
WT_TRET(__wt_ref_block_free(session, next_ref));
+ WT_ASSERT(session,
+ __wt_hazard_check_assert(session, next_ref, false));
WT_TRET(__split_safe_free(
session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
parent_decr += sizeof(WT_REF);
@@ -937,7 +913,7 @@ err: __wt_scr_free(session, &scr);
* being deleted, but don't be noisy, there's nothing wrong.
*/
if (empty_parent)
- ret = EBUSY;
+ ret = __wt_set_return(session, EBUSY);
break;
case WT_ERR_PANIC:
__wt_err(session, ret, "fatal error during parent page split");
@@ -1004,7 +980,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
children = pindex->entries / btree->split_deepen_per_child;
if (children < 10) {
if (pindex->entries < 100)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
children = 10;
}
chunk = pindex->entries / children;
@@ -1125,8 +1101,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__split_ref_prepare(session, alloc_index, &locked, true));
/* Encourage a race */
- __page_split_timing_stress(
- session, WT_TIMING_STRESS_SPLIT_5, TIMING_STRESS_TEST_SLEEP);
+ __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_5);
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
@@ -1140,8 +1115,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_INTL_INDEX_SET(page, replace_index);
/* Encourage a race */
- __page_split_timing_stress(
- session, WT_TIMING_STRESS_SPLIT_6, TIMING_STRESS_TEST_SLEEP);
+ __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_6);
/*
* Get a generation for this split, mark the parent page. This must be
@@ -1238,7 +1212,7 @@ __split_internal_lock(
* the parent, give up to avoid that deadlock.
*/
if (!trylock && !__wt_btree_can_evict_dirty(session))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* Get a page-level lock on the parent to single-thread splits into the
@@ -1259,8 +1233,7 @@ __split_internal_lock(
parent = ref->home;
/* Encourage races. */
- __page_split_timing_stress(
- session, WT_TIMING_STRESS_SPLIT_7, WT_THOUSAND);
+ __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_7);
/* Page locks live in the modify structure. */
WT_RET(__wt_page_modify_init(session, parent));
@@ -1439,6 +1412,7 @@ __split_multi_inmem(
WT_DECL_ITEM(key);
WT_DECL_RET;
WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
WT_SAVE_UPD *supd;
WT_UPDATE *upd;
uint64_t recno;
@@ -1535,7 +1509,7 @@ __split_multi_inmem(
WT_ERR(__wt_row_modify(session,
&cbt, key, NULL, upd, WT_UPDATE_INVALID, true));
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, orig->type);
}
}
@@ -1545,17 +1519,26 @@ __split_multi_inmem(
* might be older than that. Set the first dirty transaction to an
* impossibly old value so this page is never skipped in a checkpoint.
*/
- page->modify->first_dirty_txn = WT_TXN_FIRST;
+ mod = page->modify;
+ mod->first_dirty_txn = WT_TXN_FIRST;
/*
* If the new page is modified, save the eviction generation to avoid
* repeatedly attempting eviction on the same page.
*/
- page->modify->last_evict_pass_gen = orig->modify->last_evict_pass_gen;
- page->modify->last_eviction_id = orig->modify->last_eviction_id;
- __wt_timestamp_set(&page->modify->last_eviction_timestamp,
+ mod->last_evict_pass_gen = orig->modify->last_evict_pass_gen;
+ mod->last_eviction_id = orig->modify->last_eviction_id;
+ __wt_timestamp_set(&mod->last_eviction_timestamp,
&orig->modify->last_eviction_timestamp);
- page->modify->update_restored = 1;
+
+ /* Add the update/restore flag to any previous state. */
+ __wt_timestamp_set(&mod->last_stable_timestamp,
+ &orig->modify->last_stable_timestamp);
+ mod->rec_max_txn = orig->modify->rec_max_txn;
+ __wt_timestamp_set(&mod->rec_max_timestamp,
+ &orig->modify->rec_max_timestamp);
+ mod->restore_state = orig->modify->restore_state;
+ FLD_SET(mod->restore_state, WT_PAGE_RS_RESTORED);
err: /* Free any resources that may have been cached in the cursor. */
WT_TRET(__wt_btcur_close(&cbt, true));
@@ -1709,7 +1692,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
WT_RET(__wt_calloc_one(session, &ref->page_las));
*ref->page_las = multi->page_las;
- WT_ASSERT(session, ref->page_las->las_max_txn != WT_TXN_NONE);
+ WT_ASSERT(session, ref->page_las->max_txn != WT_TXN_NONE);
ref->state = WT_REF_LOOKASIDE;
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index 0fbd5ce869f..2fd23596cd7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -122,7 +122,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
case WT_PAGE_ROW_LEAF:
__stat_page_row_leaf(session, page, stats);
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, page->type);
}
return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index ad7d7d9fcab..c5c08faa701 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -118,6 +118,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
WT_REF *prev, *walk;
WT_TXN *txn;
uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
@@ -239,9 +240,13 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* Set the checkpointing flag to block such actions and wait for
* any problematic eviction or page splits to complete.
*/
- btree->checkpointing = WT_CKPT_PREPARE;
+ WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF &&
+ btree->sync_session == NULL);
+
+ btree->sync_session = session;
+ btree->syncing = WT_BTREE_SYNC_WAIT;
(void)__wt_gen_next_drain(session, WT_GEN_EVICT);
- btree->checkpointing = WT_CKPT_RUNNING;
+ btree->syncing = WT_BTREE_SYNC_RUNNING;
/* Write all dirty in-cache pages. */
LF_SET(WT_READ_NO_EVICT);
@@ -256,9 +261,24 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
if (walk == NULL)
break;
- /* Skip clean pages. */
- if (!__wt_page_is_modified(walk->page))
+ /*
+ * Skip clean pages, but need to make sure maximum
+ * transaction ID is always updated.
+ */
+ if (!__wt_page_is_modified(walk->page)) {
+ if (((mod = walk->page->modify) != NULL) &&
+ mod->rec_max_txn > btree->rec_max_txn)
+ btree->rec_max_txn = mod->rec_max_txn;
+#ifdef HAVE_TIMESTAMPS
+ if (mod != NULL && __wt_timestamp_cmp(
+ &btree->rec_max_timestamp,
+ &mod->rec_max_timestamp) < 0)
+ __wt_timestamp_set(
+ &btree->rec_max_timestamp,
+ &mod->rec_max_timestamp);
+#endif
continue;
+ }
/*
* Take a local reference to the page modify structure
@@ -338,7 +358,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
break;
case WT_SYNC_CLOSE:
case WT_SYNC_DISCARD:
- WT_ERR(__wt_illegal_value(session, NULL));
+ WT_ERR(__wt_illegal_value(session, syncop));
break;
}
@@ -367,7 +387,8 @@ err: /* On error, clear any left-over tree walk. */
__wt_txn_release_snapshot(session);
/* Clear the checkpoint flag. */
- btree->checkpointing = WT_CKPT_OFF;
+ btree->syncing = WT_BTREE_SYNC_OFF;
+ btree->sync_session = NULL;
__wt_spin_unlock(session, &btree->flush_lock);
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
index aae50ed636c..39f4a041ea9 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -34,6 +34,7 @@ static int __verify_dsk_row(
#define WT_RET_VRFY(session, ...) do { \
if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \
__wt_errx(session, __VA_ARGS__); \
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); \
return (WT_ERROR); \
} while (0)
@@ -166,7 +167,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen));
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, dsk->type);
}
/* NOTREACHED */
}
@@ -699,6 +700,7 @@ static int
__err_cell_corrupt(
WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag)
{
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
WT_RET_VRFY(session,
"item %" PRIu32 " on page at %s is a corrupted cell",
entry_num, tag);
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index d445184b7dd..dc32f76c7a7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -176,44 +176,96 @@ __ref_ascend(WT_SESSION_IMPL *session,
}
/*
- * __ref_initial_descent_prev --
- * Descend the tree one level, when setting up the initial cursor position
- * for a previous-cursor walk.
+ * __split_prev_race --
+ * Check for races when descending the tree during a previous-cursor walk.
*/
static inline bool
-__ref_initial_descent_prev(
+__split_prev_race(
WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
WT_PAGE_INDEX *pindex;
/*
- * When splitting an internal page into its parent, we move the WT_REF
- * structures and update the parent's page index before updating the
- * split page's page index, and it's not an atomic update. A thread can
- * read the parent page's replacement page index, then read the split
- * page's original index, or the parent page's original and the split
- * page's replacement.
+ * Handle a cursor moving backwards through the tree or setting up at
+ * the end of the tree. We're passed the child page into which we're
+ * descending, and the parent page's page-index we used to find that
+ * child page.
*
- * This isn't a problem for a cursor setting up at the start of the tree
- * because we do right-hand splits on internal pages and the initial
- * part of the split page's namespace won't change as part of a split.
- * A thread reading the parent page's and split page's indexes will move
- * to the same slot no matter what order of indexes are read.
- *
- * Handle a cursor setting up at the end of the tree.
+ * When splitting an internal page into its parent, we move the split
+ * pages WT_REF structures, then update the parent's page index, then
+ * update the split page's page index, and nothing is atomic. A thread
+ * can read the parent page's replacement page index and then the split
+ * page's original index, or vice-versa, and either change can cause a
+ * cursor moving backwards through the tree to skip pages.
*
- * We're passed a child page into which we're descending, and on which
- * we have a hazard pointer.
+ * This isn't a problem for a cursor setting up at the start of the tree
+ * or moving forward through the tree because we do right-hand splits on
+ * internal pages and the initial part of the split page's namespace
+ * won't change as part of a split (in other words, a thread reading the
+ * parent page's and split page's indexes will move to the same slot no
+ * matter what order of indexes are read.
*
- * Acquire a page index for the child page and then confirm we haven't
- * raced with a parent split.
+ * Acquire the child's page index, then confirm the parent's page index
+ * hasn't changed, to check for reading an old version of the parent's
+ * page index and then reading a new version of the child's page index.
*/
WT_INTL_INDEX_GET(session, ref->page, pindex);
if (__wt_split_descent_race(session, ref, *pindexp))
- return (false);
+ return (true);
+
+ /*
+ * That doesn't check if we read a new version of parent's page index
+ * and then an old version of the child's page index. For example, if
+ * a thread were in a newly created split page subtree, the split
+ * completes into the parent before the thread reads it and descends
+ * into the child (where the split hasn't yet completed).
+ *
+ * Imagine an internal page with 3 child pages, with the namespaces a-f,
+ * g-h and i-j; the first child page splits. The parent starts out with
+ * the following page-index:
+ *
+ * | ... | a | g | i | ... |
+ *
+ * The split page starts out with the following page-index:
+ *
+ * | a | b | c | d | e | f |
+ *
+ * The first step is to move the c-f ranges into a new subtree, so, for
+ * example we might have two new internal pages 'c' and 'e', where the
+ * new 'c' page references the c-d namespace and the new 'e' page
+ * references the e-f namespace. The top of the subtree references the
+ * parent page, but until the parent's page index is updated, threads in
+ * the subtree won't be able to ascend out of the subtree. However, once
+ * the parent page's page index is updated to this:
+ *
+ * | ... | a | c | e | g | i | ... |
+ *
+ * threads in the subtree can ascend into the parent. Imagine a cursor
+ * in the c-d part of the namespace that ascends to the parent's 'c'
+ * slot. It would then decrement to the slot before the 'c' slot, the
+ * 'a' slot.
+ *
+ * The previous-cursor movement selects the last slot in the 'a' page;
+ * if the split page's page-index hasn't been updated yet, it selects
+ * the 'f' slot, which is incorrect. Once the split page's page index is
+ * updated to this:
+ *
+ * | a | b |
+ *
+ * the previous-cursor movement will select the 'b' slot, which is
+ * correct.
+ *
+ * If the last slot on the page no longer points to the current page as
+ * its "home", the page is being split and part of its namespace moved,
+ * restart. (We probably don't have to restart, I think we could spin
+ * until the page-index is updated, but I'm not willing to debug that
+ * one if I'm wrong.)
+ */
+ if (pindex->index[pindex->entries - 1]->home != ref->page)
+ return (true);
*pindexp = pindex;
- return (true);
+ return (false);
}
/*
@@ -229,22 +281,21 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE_INDEX *pindex;
- WT_REF *couple, *couple_orig, *ref;
- uint64_t sleep_usecs, yield_count;
+ WT_REF *couple, *ref, *ref_orig;
+ uint64_t restart_sleep, restart_yield, swap_sleep, swap_yield;
uint32_t current_state, slot;
- bool empty_internal, initial_descent, prev, skip;
+ bool empty_internal, prev, skip;
btree = S2BT(session);
pindex = NULL;
- sleep_usecs = yield_count = 0;
- empty_internal = initial_descent = false;
+ restart_sleep = restart_yield = swap_sleep = swap_yield = 0;
+ empty_internal = false;
/*
- * Tree walks are special: they look inside page structures that splits
- * may want to free. Publish that the tree is active during this
- * window.
+ * We're not supposed to walk trees without root pages. As this has not
+ * always been the case, assert to debug that change.
*/
- WT_ENTER_PAGE_INDEX(session);
+ WT_ASSERT(session, btree->root.page != NULL);
/* Check whether deleted pages can be skipped. */
if (!LF_ISSET(WT_READ_DELETED_SKIP))
@@ -284,36 +335,41 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
* new leaf, couple to the next page to which we're descending, it
* saves a hazard-pointer swap for each cursor page movement.
*
- * !!!
- * NOTE: we depend on the fact it's OK to release a page we don't hold,
- * that is, it's OK to release couple when couple is set to NULL.
- *
- * Take a copy of any held page and clear the return value. Remember
- * the hazard pointer we're currently holding.
- *
- * Clear the returned value, it makes future error handling easier.
+ * The hazard pointer on the original location is held until the end of
+ * the movement, in case we have to restart the movement. Take a copy
+ * of any held page and clear the return value (it makes future error
+ * handling easier).
*/
- couple = couple_orig = ref = *refp;
+ couple = NULL;
+ ref_orig = *refp;
*refp = NULL;
+ /*
+ * Tree walks are special: they look inside page structures that splits
+ * may want to free. Publish the tree is active during this window.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+
/* If no page is active, begin a walk from the start/end of the tree. */
- if (ref == NULL) {
-restart: /*
- * We can be here with a NULL or root WT_REF; the page release
- * function handles them internally, don't complicate this code
- * by calling them out.
- */
- WT_ERR(__wt_page_release(session, couple, flags));
+ if ((ref = ref_orig) == NULL) {
+ if (0) {
+restart: /*
+ * Yield before retrying, and if we've yielded enough
+ * times, start sleeping so we don't burn CPU to no
+ * purpose.
+ */
+ __wt_spin_backoff(&restart_yield, &restart_sleep);
- /*
- * We're not supposed to walk trees without root pages. As this
- * has not always been the case, assert to debug that change.
- */
- WT_ASSERT(session, btree->root.page != NULL);
+ WT_ERR(__wt_page_release(session, couple, flags));
+ couple = NULL;
+ }
- couple = couple_orig = ref = &btree->root;
- initial_descent = true;
- goto descend;
+ if ((ref = ref_orig) == NULL) {
+ ref = &btree->root;
+ WT_INTL_INDEX_GET(session, ref->page, pindex);
+ slot = prev ? pindex->entries - 1 : 0;
+ goto descend;
+ }
}
/*
@@ -340,12 +396,9 @@ restart: /*
/*
* If at the root and returning internal pages, return
- * the root page, otherwise we're done. Regardless, no
- * hazard pointer is required, release the one we hold.
+ * the root page, otherwise we're done.
*/
if (__wt_ref_is_root(ref)) {
- WT_ERR(__wt_page_release(
- session, couple, flags));
if (!LF_ISSET(WT_READ_SKIP_INTL))
*refp = ref;
goto done;
@@ -356,28 +409,46 @@ restart: /*
* all of the child pages were deleted, mark it for
* eviction.
*/
- if (empty_internal && pindex->entries > 1) {
+ if (empty_internal) {
__wt_page_evict_soon(session, ref);
empty_internal = false;
}
- /*
- * Optionally return internal pages. Swap our previous
- * hazard pointer for the page we'll return. We don't
- * handle restart or not-found returns, it would require
- * additional complexity and is not a possible return:
- * we're moving to the parent of the current child page,
- * the parent can't have been evicted. (This is why we
- * don't pass "prev" to the page-swap function, we can't
- * handle the restart error returned if the parent page
- * is currently splitting.)
- */
- if (!LF_ISSET(WT_READ_SKIP_INTL)) {
- WT_ERR(__wt_page_swap(
- session, couple, ref, false, flags));
- *refp = ref;
- goto done;
+ /* Encourage races. */
+ __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_8);
+
+ /* Optionally return internal pages. */
+ if (LF_ISSET(WT_READ_SKIP_INTL))
+ continue;
+
+ for (;;) {
+ /*
+ * Swap our previous hazard pointer for the page
+ * we'll return.
+ *
+ * Not-found is an expected return, as eviction
+ * might have been attempted. The page can't be
+ * evicted, we're holding a hazard pointer on a
+ * child, spin until we're successful.
+ *
+ * Restart is not expected, our parent WT_REF
+ * should not have split.
+ */
+ ret = __wt_page_swap(session,
+ couple, ref, WT_READ_NOTFOUND_OK | flags);
+ if (ret == 0) {
+ /* Success, "couple" released. */
+ couple = NULL;
+ *refp = ref;
+ goto done;
+ }
+
+ WT_ASSERT(session, ret == WT_NOTFOUND);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ __wt_spin_backoff(&swap_yield, &swap_sleep);
}
+ /* NOTREACHED */
}
if (prev)
@@ -389,9 +460,9 @@ restart: /*
++*walkcntp;
for (;;) {
- /*
- * Move to the next slot, and set the reference hint if
- * it's wrong (used when we continue the walk). We don't
+descend: /*
+ * Get a reference, setting the reference hint if it's
+ * wrong (used when we continue the walk). We don't
* always update the hints when splitting, it's expected
* for them to be incorrect in some workloads.
*/
@@ -452,12 +523,41 @@ restart: /*
break;
}
- ret = __wt_page_swap(session, couple, ref, prev,
+ ret = __wt_page_swap(session, couple, ref,
WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);
+ if (ret == 0) {
+ /* Success, so "couple" has been released. */
+ couple = NULL;
+
+ /* Return leaf pages to our caller. */
+ if (!WT_PAGE_IS_INTERNAL(ref->page)) {
+ *refp = ref;
+ goto done;
+ }
+
+ /* Set the new "couple" value. */
+ couple = ref;
+
+ /* Configure traversal of any internal page. */
+ empty_internal = true;
+ if (prev) {
+ if (__split_prev_race(
+ session, ref, &pindex))
+ goto restart;
+ slot = pindex->entries - 1;
+ } else {
+ WT_INTL_INDEX_GET(
+ session, ref->page, pindex);
+ slot = 0;
+ }
+ continue;
+ }
/*
- * Not-found is an expected return when only walking
+ * Not-found is an expected return when walking only
* in-cache pages, or if we see a deleted page.
+ *
+ * An expected error, so "couple" is unchanged.
*/
if (ret == WT_NOTFOUND) {
WT_NOT_READ(ret, 0);
@@ -466,94 +566,24 @@ restart: /*
/*
* The page we're moving to might have split, in which
- * case move to the last position we held.
- */
- if (ret == WT_RESTART) {
- ret = 0;
-
- /*
- * Yield before retrying, and if we've yielded
- * enough times, start sleeping so we don't burn
- * CPU to no purpose.
- */
- __wt_spin_backoff(
- &yield_count, &sleep_usecs);
-
- /*
- * If a cursor is setting up at the end of the
- * tree, we can't use our parent page's index,
- * because it may have already split; restart
- * the walk.
- */
- if (prev && initial_descent)
- goto restart;
-
- /*
- * If a new walk that never coupled from the
- * root to a new saved position in the tree,
- * restart the walk.
- */
- if (couple == &btree->root)
- goto restart;
-
- /*
- * If restarting from some original position,
- * repeat the increment or decrement we made at
- * that time. Otherwise, couple is an internal
- * page we've acquired after moving from that
- * starting position and we can treat it as a
- * new page. This works because we never acquire
- * a hazard pointer on a leaf page we're not
- * going to return to our caller, this will quit
- * working if that ever changes.
- */
- WT_ASSERT(session,
- couple == couple_orig ||
- WT_PAGE_IS_INTERNAL(couple->page));
- ref = couple;
- __ref_index_slot(session, ref, &pindex, &slot);
- if (couple == couple_orig)
- break;
- }
- WT_ERR(ret);
- couple = ref;
-
- /*
- * A new page: configure for traversal of any internal
- * page's children, else return the leaf page.
+ * case restart the movement.
+ *
+ * An expected error, so "couple" is unchanged.
*/
- if (WT_PAGE_IS_INTERNAL(ref->page)) {
-descend: empty_internal = true;
+ if (ret == WT_RESTART)
+ goto restart;
- /*
- * There's a split race when a cursor is setting
- * up at the end of the tree.
- */
- if (prev && initial_descent) {
- if (!__ref_initial_descent_prev(
- session, ref, &pindex))
- goto restart;
- } else
- WT_INTL_INDEX_GET(
- session, ref->page, pindex);
- slot = prev ? pindex->entries - 1 : 0;
- continue;
- }
-
- /*
- * The tree-walk restart code knows we return any leaf
- * page we acquire (never hazard-pointer coupling on
- * after acquiring a leaf page), and asserts no restart
- * happens while holding a leaf page. This page must be
- * returned to our caller.
- */
- *refp = ref;
- goto done;
+ /* Unexpected error, so "couple" was released. */
+ couple = NULL;
+ goto err;
}
}
done:
-err: WT_LEAVE_PAGE_INDEX(session);
+err:
+ WT_TRET(__wt_page_release(session, couple, flags));
+ WT_TRET(__wt_page_release(session, ref_orig, flags));
+ WT_LEAVE_PAGE_INDEX(session);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
index 7270c49a9f5..233a88c9404 100644
--- a/src/third_party/wiredtiger/src/btree/col_modify.c
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -50,20 +50,25 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
modify_type = WT_UPDATE_STANDARD;
value = &col_fix_remove;
}
- } else {
- /*
- * There's a chance the application specified a record
- * past the last record on the page. If that's the
- * case, and we're inserting a new WT_INSERT/WT_UPDATE
- * pair, it goes on the append list, not the update
- * list. Also, an out-of-band recno implies an append
- * operation, we're allocating a new row.
- */
- if (recno == WT_RECNO_OOB ||
- recno > (btree->type == BTREE_COL_VAR ?
- __col_var_last_recno(cbt->ref) :
- __col_fix_last_recno(cbt->ref)))
- append = true;
+ }
+
+ /*
+ * There's a chance the application specified a record past the
+ * last record on the page. If that's the case and we're
+ * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
+ * append list, not the update list. Also, an out-of-band recno
+ * implies an append operation, we're allocating a new row.
+ * Ignore any information obtained from the search.
+ */
+ WT_ASSERT(session, recno != WT_RECNO_OOB || cbt->compare != 0);
+ if (cbt->compare != 0 &&
+ (recno == WT_RECNO_OOB ||
+ recno > (btree->type == BTREE_COL_VAR ?
+ __col_var_last_recno(cbt->ref) :
+ __col_fix_last_recno(cbt->ref)))) {
+ append = true;
+ cbt->ins = NULL;
+ cbt->ins_head = NULL;
}
}
@@ -75,15 +80,51 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
mod = page->modify;
/*
+ * If modifying a record not previously modified, but which is in the
+ * same update slot as a previously modified record, cursor.ins will
+ * not be set because there's no list of update records for this recno,
+ * but cursor.ins_head will be set to point to the correct update slot.
+ * Acquire the necessary insert information, then create a new update
+ * entry and link it into the existing list. We get here if a page has
+ * a single cell representing multiple records (the records have the
+ * same value), and then a record in the cell is updated or removed,
+ * creating the update list for the cell, and then a cursor iterates
+ * into that same cell to update/remove a different record. We find the
+ * correct slot in the update array, but we don't find an update list
+ * (because it doesn't exist), and don't have the information we need
+ * to do the insert. Normally, we wouldn't care (we could fail and do
+ * a search for the record which would configure everything for the
+ * insert), but range truncation does this pattern for every record in
+ * the cell, and the performance is terrible. For that reason, catch it
+ * here.
+ */
+ if (cbt->ins == NULL && cbt->ins_head != NULL) {
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
+ if (cbt->ins != NULL) {
+ if (WT_INSERT_RECNO(cbt->ins) == recno)
+ cbt->compare = 0;
+ else {
+ /*
+ * The test below is for cursor.compare set to 0
+ * and cursor.ins set: cursor.compare wasn't set
+ * by the search we just did, and has an unknown
+ * value. Clear cursor.ins to avoid the test.
+ */
+ cbt->ins = NULL;
+ }
+ }
+ }
+
+ /*
* Delete, insert or update a column-store entry.
*
- * If modifying a previously modified record, create a new WT_UPDATE
- * entry and have a serialized function link it into an existing
- * WT_INSERT entry's WT_UPDATE list.
+ * If modifying a previously modified record, cursor.ins will be set to
+ * point to the correct update list. Create a new update entry and link
+ * it into the existing list.
*
- * Else, allocate an insert array as necessary, build a WT_INSERT and
- * WT_UPDATE structure pair, and call a serialized function to insert
- * the WT_INSERT structure.
+ * Else, allocate an insert array as necessary, build an insert/update
+ * structure pair, and link it into place.
*/
if (cbt->compare == 0 && cbt->ins != NULL) {
/*
@@ -98,7 +139,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Allocate a WT_UPDATE structure and transaction ID. */
WT_ERR(__wt_update_alloc(session,
value, &upd, &upd_size, modify_type));
- WT_ERR(__wt_txn_modify(session, upd));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
logged = true;
/* Avoid a data copy in WT_CURSOR.update. */
@@ -159,7 +200,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
if (upd_arg == NULL) {
WT_ERR(__wt_update_alloc(session,
value, &upd, &upd_size, modify_type));
- WT_ERR(__wt_txn_modify(session, upd));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
logged = true;
/* Avoid a data copy in WT_CURSOR.update. */
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index 8cc6630599b..e72ee7455da 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -73,7 +73,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
uint64_t recno;
- uint32_t base, indx, limit;
+ uint32_t base, indx, limit, read_flags;
int depth;
btree = S2BT(session);
@@ -179,6 +179,9 @@ descend: /*
descent = pindex->index[base - 1];
}
+ /* Encourage races. */
+ WT_DIAGNOSTIC_YIELD;
+
/*
* Swap the current page for the child page. If the page splits
* while we're retrieving it, restart the search at the root.
@@ -191,8 +194,11 @@ descend: /*
* On other error, simply return, the swap call ensures we're
* holding nothing on failure.
*/
+ read_flags = WT_READ_RESTART_OK;
+ if (F_ISSET(cbt, WT_CBT_READ_ONCE))
+ FLD_SET(read_flags, WT_READ_WONT_NEED);
if ((ret = __wt_page_swap(session,
- current, descent, false, WT_READ_RESTART_OK)) == 0) {
+ current, descent, read_flags)) == 0) {
current = descent;
continue;
}
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index 8b1e4d78f54..0f89d09f948 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -96,7 +96,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Allocate a WT_UPDATE structure and transaction ID. */
WT_ERR(__wt_update_alloc(session,
value, &upd, &upd_size, modify_type));
- WT_ERR(__wt_txn_modify(session, upd));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
logged = true;
/* Avoid WT_CURSOR.update data copy. */
@@ -167,7 +167,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
if (upd_arg == NULL) {
WT_ERR(__wt_update_alloc(session,
value, &upd, &upd_size, modify_type));
- WT_ERR(__wt_txn_modify(session, upd));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
logged = true;
/* Avoid WT_CURSOR.update data copy. */
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 20acda8a1ab..5dff4b6fa60 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -219,7 +219,7 @@ __wt_row_search(WT_SESSION_IMPL *session,
WT_REF *current, *descent;
WT_ROW *rip;
size_t match, skiphigh, skiplow;
- uint32_t base, indx, limit;
+ uint32_t base, indx, limit, read_flags;
int cmp, depth;
bool append_check, descend_right, done;
@@ -431,7 +431,10 @@ append: if (__wt_split_descent_race(
goto restart;
}
-descend: /*
+descend: /* Encourage races. */
+ WT_DIAGNOSTIC_YIELD;
+
+ /*
* Swap the current page for the child page. If the page splits
* while we're retrieving it, restart the search at the root.
* We cannot restart in the "current" page; for example, if a
@@ -443,8 +446,11 @@ descend: /*
* On other error, simply return, the swap call ensures we're
* holding nothing on failure.
*/
+ read_flags = WT_READ_RESTART_OK;
+ if (F_ISSET(cbt, WT_CBT_READ_ONCE))
+ FLD_SET(read_flags, WT_READ_WONT_NEED);
if ((ret = __wt_page_swap(session,
- current, descent, false, WT_READ_RESTART_OK)) == 0) {
+ current, descent, read_flags)) == 0) {
current = descent;
continue;
}
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c
index 64fe93806e9..cf28027d8b7 100644
--- a/src/third_party/wiredtiger/src/cache/cache_las.c
+++ b/src/third_party/wiredtiger/src/cache/cache_las.c
@@ -19,24 +19,6 @@
WT_SESSION_NO_RECONCILE)
/*
- * __las_timing_stress --
- * Optionally add delay to simulate the race conditions in lookaside
- * sweep for debug purposes.
- */
-static void
-__las_timing_stress(WT_SESSION_IMPL *session)
-{
- WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
-
- /* Only sleep when lookaside sweep race flag is set. */
- if (FLD_ISSET(conn->timing_stress_flags,
- WT_TIMING_STRESS_LOOKASIDE_SWEEP))
- __wt_sleep(0, TIMING_STRESS_TEST_SLEEP);
-}
-
-/*
* __las_set_isolation --
* Switch to read-uncommitted.
*/
@@ -328,7 +310,16 @@ __wt_las_cursor(
*
* XXX better as a condition variable.
*/
- __wt_sleep(0, 1000);
+ __wt_sleep(0, WT_THOUSAND);
+ if (F_ISSET(session, WT_SESSION_INTERNAL))
+ WT_STAT_CONN_INCRV(session,
+ cache_lookaside_cursor_wait_internal,
+ WT_THOUSAND);
+ else
+ WT_STAT_CONN_INCRV(session,
+ cache_lookaside_cursor_wait_application,
+ WT_THOUSAND);
+
}
}
@@ -412,9 +403,6 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref)
* We also need to instantiate a lookaside page if this is an update
* operation in progress.
*/
- if (ref->page_las->invalid)
- return (false);
-
if (F_ISSET(txn, WT_TXN_UPDATE))
return (false);
@@ -422,43 +410,34 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref)
return (false);
/*
- * If page image has the newest version of data and includes data newer
- * than the reader's snapshot then we should read the history.
- */
- if (ref->page_las->las_skew_newest &&
- WT_TXNID_LE(txn->snap_min, ref->page_las->las_max_txn))
- return (false);
-
- /*
- * If page image has the oldest version of data and some of the history
- * overlaps with the reader's snapshot then we should read the history.
+ * If some of the page's history overlaps with the reader's snapshot
+ * then we have to read it. This is only relevant if we chose versions
+ * that were unstable when the page was written.
*/
- if (!ref->page_las->las_skew_newest &&
- WT_TXNID_LE(ref->page_las->las_min_txn, txn->snap_max))
+ if (ref->page_las->skew_newest &&
+ WT_TXNID_LE(txn->snap_min, ref->page_las->unstable_txn))
return (false);
- if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && ref->page_las->las_skew_newest)
- return (true);
+ if (!F_ISSET(txn, WT_TXN_HAS_TS_READ))
+ return (ref->page_las->skew_newest);
#ifdef HAVE_TIMESTAMPS
/*
* Skip lookaside pages if reading as of a timestamp, we evicted new
* versions of data and all the updates are in the past.
*/
- if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) &&
- ref->page_las->las_skew_newest &&
+ if (ref->page_las->skew_newest &&
__wt_timestamp_cmp(
- &ref->page_las->onpage_timestamp, &session->txn.read_timestamp) < 0)
+ &txn->read_timestamp, &ref->page_las->unstable_timestamp) > 0)
return (true);
/*
* Skip lookaside pages if reading as of a timestamp, we evicted old
- * versions of data and all the updates are in the future.
+ * versions of data and all the unstable updates are in the future.
*/
- if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) &&
- !ref->page_las->las_skew_newest &&
+ if (!ref->page_las->skew_newest &&
__wt_timestamp_cmp(
- &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0)
+ &txn->read_timestamp, &ref->page_las->unstable_timestamp) < 0)
return (true);
#endif
@@ -497,18 +476,23 @@ __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* Remove all records for a given page from the lookaside store.
*/
static int
-__las_remove_block(WT_SESSION_IMPL *session,
+__las_remove_block(
WT_CURSOR *cursor, uint64_t pageid, bool lock_wait, uint64_t *remove_cntp)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_ITEM las_key;
+ WT_SESSION_IMPL *session;
+ WT_TXN_ISOLATION saved_isolation;
uint64_t las_counter, las_pageid;
uint32_t las_id;
+ bool local_txn;
*remove_cntp = 0;
+ session = (WT_SESSION_IMPL *)cursor->session;
conn = S2C(session);
+ local_txn = false;
/* Prevent the sweep thread from removing the block. */
if (lock_wait)
@@ -517,6 +501,10 @@ __las_remove_block(WT_SESSION_IMPL *session,
WT_RET(__wt_try_writelock(
session, &conn->cache->las_sweepwalk_lock));
+ __las_set_isolation(session, &saved_isolation);
+ WT_ERR(__wt_txn_begin(session, NULL));
+ local_txn = true;
+
/*
* Search for the block's unique btree ID and page ID prefix and step
* through all matching records, removing them.
@@ -535,7 +523,15 @@ __las_remove_block(WT_SESSION_IMPL *session,
}
WT_ERR_NOTFOUND_OK(ret);
-err: __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
+err: if (local_txn) {
+ if (ret == 0)
+ ret = __wt_txn_commit(session, NULL);
+ else
+ WT_TRET(__wt_txn_rollback(session, NULL));
+ }
+
+ __las_restore_isolation(session, saved_isolation);
+ __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
return (ret);
}
@@ -545,7 +541,8 @@ err: __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
* cache state when performing a lookaside table write.
*/
static int
-__las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
+__las_insert_block_verbose(
+ WT_SESSION_IMPL *session, WT_BTREE *btree, WT_MULTI *multi)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
@@ -557,7 +554,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
#endif
const char *ts;
- btree_id = S2BT(session)->id;
+ btree_id = btree->id;
if (!WT_VERBOSE_ISSET(session,
WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY))
@@ -581,8 +578,8 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
(void)__wt_eviction_dirty_needed(session, &pct_dirty);
#ifdef HAVE_TIMESTAMPS
- WT_RET(__wt_timestamp_to_hex_string(
- session, hex_timestamp, &multi->page_las.min_timestamp));
+ WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp,
+ &multi->page_las.unstable_timestamp));
ts = hex_timestamp;
#else
ts = "disabled";
@@ -591,14 +588,14 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY,
"Page reconciliation triggered lookaside write "
"file ID %" PRIu32 ", page ID %" PRIu64 ". "
- "Max txn ID %" PRIu64 ", min timestamp %s, skewed %s. "
+ "Max txn ID %" PRIu64 ", unstable timestamp %s, %s. "
"Entries now in lookaside file: %" PRId64 ", "
"cache dirty: %2.3f%% , "
"cache use: %2.3f%%",
btree_id, multi->page_las.las_pageid,
- multi->page_las.las_max_txn,
+ multi->page_las.max_txn,
ts,
- multi->page_las.las_skew_newest ? "newest" : "oldest",
+ multi->page_las.skew_newest ? "newest" : "not newest",
WT_STAT_READ(conn->stats, cache_lookaside_entries),
pct_dirty, pct_full);
}
@@ -614,15 +611,15 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
* Copy one set of saved updates into the database's lookaside table.
*/
int
-__wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
- WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key)
+__wt_las_insert_block(WT_CURSOR *cursor,
+ WT_BTREE *btree, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key)
{
- WT_BTREE *btree;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_DECL_TIMESTAMP(prev_timestamp)
WT_ITEM las_timestamp, las_value;
WT_SAVE_UPD *list;
- WT_SESSION_IMPL *las_session;
+ WT_SESSION_IMPL *session;
WT_TXN_ISOLATION saved_isolation;
WT_UPDATE *upd;
uint64_t insert_cnt;
@@ -631,7 +628,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
uint8_t *p;
bool local_txn;
- btree = S2BT(session);
+ session = (WT_SESSION_IMPL *)cursor->session;
conn = S2C(session);
WT_CLEAR(las_timestamp);
WT_CLEAR(las_value);
@@ -639,17 +636,13 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
btree_id = btree->id;
local_txn = false;
+ __wt_timestamp_set_zero(&prev_timestamp);
+
las_pageid = __wt_atomic_add64(&conn->cache->las_pageid, 1);
if (!btree->lookaside_entries)
btree->lookaside_entries = true;
- /* Wrap all the updates in a transaction. */
- las_session = (WT_SESSION_IMPL *)cursor->session;
- __las_set_isolation(las_session, &saved_isolation);
- WT_ERR(__wt_txn_begin(las_session, NULL));
- local_txn = true;
-
#ifdef HAVE_DIAGNOSTIC
{
uint64_t remove_cnt;
@@ -657,12 +650,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
* There should never be any entries with the page ID we are about to
* use.
*/
- WT_ERR_BUSY_OK(__las_remove_block(
- session, cursor, las_pageid, false, &remove_cnt));
+ WT_RET_BUSY_OK(
+ __las_remove_block(cursor, las_pageid, false, &remove_cnt));
WT_ASSERT(session, remove_cnt == 0);
}
#endif
+ /* Wrap all the updates in a transaction. */
+ __las_set_isolation(session, &saved_isolation);
+ WT_ERR(__wt_txn_begin(session, NULL));
+ local_txn = true;
+
/* Enter each update in the boundary's list into the lookaside store. */
for (las_counter = 0, i = 0,
list = multi->supd; i < multi->supd_entries; ++i, ++list) {
@@ -676,15 +674,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
key->size = WT_PTRDIFF(p, key->data);
break;
case WT_PAGE_ROW_LEAF:
- if (list->ins == NULL)
- WT_ERR(__wt_row_leaf_key(
+ if (list->ins == NULL) {
+ WT_WITH_BTREE(session, btree,
+ ret = __wt_row_leaf_key(
session, page, list->ripcip, key, false));
- else {
+ WT_ERR(ret);
+ } else {
key->data = WT_INSERT_KEY(list->ins);
key->size = WT_INSERT_KEY_SIZE(list->ins);
}
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
/*
@@ -726,7 +726,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
case WT_UPDATE_TOMBSTONE:
las_value.size = 0;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, upd->type);
}
cursor->set_key(cursor,
@@ -742,18 +742,17 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
* table. (We check the length because row-store doesn't
* write zero-length data items.)
*/
- if (multi->page_las.las_skew_newest &&
- upd == list->onpage_upd &&
+ if (upd == list->onpage_upd &&
upd->size > 0 &&
(upd->type == WT_UPDATE_STANDARD ||
upd->type == WT_UPDATE_MODIFY)) {
las_value.size = 0;
- cursor->set_value(cursor,
- upd->txnid, &las_timestamp,
+ cursor->set_value(cursor, upd->txnid,
+ &las_timestamp, upd->prepare_state,
WT_UPDATE_BIRTHMARK, &las_value);
} else
- cursor->set_value(cursor,
- upd->txnid, &las_timestamp,
+ cursor->set_value(cursor, upd->txnid,
+ &las_timestamp, upd->prepare_state,
upd->type, &las_value);
/*
@@ -770,9 +769,9 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
err: /* Resolve the transaction. */
if (local_txn) {
if (ret == 0)
- WT_TRET(__wt_txn_commit(las_session, NULL));
+ ret = __wt_txn_commit(session, NULL);
else
- WT_TRET(__wt_txn_rollback(las_session, NULL));
+ WT_TRET(__wt_txn_rollback(session, NULL));
/* Adjust the entry count. */
if (ret == 0)
@@ -780,11 +779,11 @@ err: /* Resolve the transaction. */
&conn->cache->las_insert_count, insert_cnt);
}
- __las_restore_isolation(las_session, saved_isolation);
+ __las_restore_isolation(session, saved_isolation);
if (ret == 0 && insert_cnt > 0) {
multi->page_las.las_pageid = las_pageid;
- ret = __las_insert_block_verbose(session, multi);
+ ret = __las_insert_block_verbose(session, btree, multi);
}
return (ret);
@@ -860,8 +859,6 @@ __wt_las_remove_block(
WT_CONNECTION_IMPL *conn;
WT_CURSOR *cursor;
WT_DECL_RET;
- WT_SESSION_IMPL *las_session;
- WT_TXN_ISOLATION saved_isolation;
uint64_t remove_cnt;
uint32_t session_flags;
@@ -875,24 +872,12 @@ __wt_las_remove_block(
*/
__wt_las_cursor(session, &cursor, &session_flags);
- las_session = (WT_SESSION_IMPL *)cursor->session;
- __las_set_isolation(las_session, &saved_isolation);
-
- WT_ERR(__wt_txn_begin(las_session, NULL));
-
- ret = __las_remove_block(
- las_session, cursor, pageid, lock_wait, &remove_cnt);
- if (ret == 0)
- ret = __wt_txn_commit(las_session, NULL);
- else
- WT_TRET(__wt_txn_rollback(las_session, NULL));
- if (ret == 0)
+ if ((ret = __las_remove_block(
+ cursor, pageid, lock_wait, &remove_cnt)) == 0)
(void)__wt_atomic_add64(
&conn->cache->las_remove_count, remove_cnt);
-err: __las_restore_isolation(las_session, saved_isolation);
WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
-
return (ret);
}
@@ -1021,21 +1006,27 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
uint64_t cnt, remove_cnt, las_pageid, saved_pageid, visit_cnt;
uint64_t las_counter, las_txnid;
uint32_t las_id, session_flags;
- uint8_t upd_type;
+ uint8_t prepare_state, upd_type;
int notused;
- bool local_txn, locked;
+ bool local_txn, locked, removing_key_block;
cache = S2C(session)->cache;
cursor = NULL;
sweep_key = &cache->las_sweep_key;
remove_cnt = 0;
session_flags = 0; /* [-Werror=maybe-uninitialized] */
- local_txn = locked = false;
+ local_txn = locked = removing_key_block = false;
WT_RET(__wt_scr_alloc(session, 0, &saved_key));
saved_pageid = 0;
/*
+ * Prevent other threads removing entries from underneath the sweep.
+ */
+ __wt_writelock(session, &cache->las_sweepwalk_lock);
+ locked = true;
+
+ /*
* Allocate a cursor and wrap all the updates in a transaction.
* We should have our own lookaside cursor.
*/
@@ -1045,14 +1036,8 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
WT_ERR(__wt_txn_begin(session, NULL));
local_txn = true;
- /*
- * Prevent other threads removing entries from underneath the sweep.
- */
- __wt_writelock(session, &cache->las_sweepwalk_lock);
- locked = true;
-
/* Encourage a race */
- __las_timing_stress(session);
+ __wt_timing_stress(session, WT_TIMING_STRESS_LOOKASIDE_SWEEP);
/*
* When continuing a sweep, position the cursor using the key from the
@@ -1087,21 +1072,17 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
WT_ERR(cursor->get_key(cursor,
&las_pageid, &las_id, &las_counter, &las_key));
- /*
- * If we have switched to a different page, clear the saved key.
- * Otherwise, sweep could incorrectly remove records after
- * seeing a birthmark for a key in one block if the same key is
- * at the beginning of the next block. See WT-3982 for details.
- */
- if (las_pageid != saved_pageid) {
- saved_key->size = 0;
- saved_pageid = las_pageid;
- }
+ __wt_verbose(session,
+ WT_VERB_LOOKASIDE_ACTIVITY,
+ "Sweep reviewing lookaside entry with lookaside "
+ "page ID %" PRIu64 " btree ID %" PRIu32
+ " saved key size: %" WT_SIZET_FMT,
+ las_pageid, las_id, saved_key->size);
/*
- * Stop if the cache is stuck: we are ignoring the cache size
- * while scanning the lookaside table, so we're making things
- * worse.
+ * Signal to stop if the cache is stuck: we are ignoring the
+ * cache size while scanning the lookaside table, so we're
+ * making things worse.
*/
if (__wt_cache_stuck(session))
cnt = 0;
@@ -1112,9 +1093,8 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
* and there is a reader waiting and we're on a key boundary.
*/
++visit_cnt;
- if ((cnt == 0 ||
- (visit_cnt > WT_LAS_SWEEP_ENTRIES && cache->las_reader)) &&
- saved_key->size == 0)
+ if (!removing_key_block && (cnt == 0 ||
+ (visit_cnt > WT_LAS_SWEEP_ENTRIES && cache->las_reader)))
break;
if (cnt > 0)
--cnt;
@@ -1134,15 +1114,20 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
WT_ERR(cursor->remove(cursor));
++remove_cnt;
saved_key->size = 0;
+ /*
+ * Allow sweep to break while removing entries from a
+ * dead file.
+ */
+ removing_key_block = false;
continue;
}
/*
- * Remove entries from the lookaside that have aged out and are
- * now no longer needed.
+ * Remove all entries for a key once they have aged out and are
+ * no longer needed.
*/
- WT_ERR(cursor->get_value(cursor,
- &las_txnid, &las_timestamp, &upd_type, &las_value));
+ WT_ERR(cursor->get_value(cursor, &las_txnid,
+ &las_timestamp, &prepare_state, &upd_type, &las_value));
#ifdef HAVE_TIMESTAMPS
WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
memcpy(&timestamp, las_timestamp.data, las_timestamp.size);
@@ -1152,42 +1137,51 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
#endif
/*
- * If this entry isn't globally visible we cannot remove it.
- * If it is visible then perform additional checks to see
- * whether it has aged out of a live file.
- */
- if (!__wt_txn_visible_all(session, las_txnid, val_ts)) {
- saved_key->size = 0;
- continue;
- }
-
- /*
- * Save our key for comparing with older entries if we
- * don't have one or it is different.
+ * Check to see if the page or key has changed this iteration,
+ * and if they have, setup context for safely removing obsolete
+ * updates.
+ *
+ * It's important to check for page boundaries explicitly
+ * because it is possible for the same key to be at the start
+ * of the next block. See WT-3982 for details.
*/
- if (saved_key->size != las_key.size ||
+ if (las_pageid != saved_pageid ||
+ saved_key->size != las_key.size ||
memcmp(saved_key->data, las_key.data, las_key.size) != 0) {
- /* If we have processed enough entries, give up. */
+ /* If we've examined enough entries, give up. */
if (cnt == 0)
break;
- /* We can only start removing from a full value. */
- if (upd_type == WT_UPDATE_MODIFY) {
- saved_key->size = 0;
- continue;
- }
-
- WT_ERR(__wt_buf_set(session, saved_key,
- las_key.data, las_key.size));
+ saved_pageid = las_pageid;
+ WT_ERR(__wt_buf_set(
+ session, saved_key, las_key.data, las_key.size));
/*
- * If the first stable record contains data, we have to
- * keep it.
+ * There are several conditions that need to be met
+ * before we choose to remove a key block:
+ * * The entries were written with skew newest.
+ * Indicated by the first entry being a birthmark.
+ * * The first entry is globally visible.
+ * * The entry wasn't from a prepared transaction.
*/
- if (upd_type != WT_UPDATE_BIRTHMARK)
- continue;
+ if (upd_type == WT_UPDATE_BIRTHMARK &&
+ __wt_txn_visible_all(session, las_txnid, val_ts) &&
+ prepare_state != WT_PREPARE_INPROGRESS)
+ removing_key_block = true;
+ else
+ removing_key_block = false;
}
+ if (!removing_key_block)
+ continue;
+
+ __wt_verbose(session,
+ WT_VERB_LOOKASIDE_ACTIVITY,
+ "Sweep removing lookaside entry with "
+ "page ID: %" PRIu64 " btree ID: %" PRIu32
+ " saved key size: %" WT_SIZET_FMT ", record type: %" PRIu8
+ " transaction ID: %" PRIu64,
+ las_pageid, las_id, saved_key->size, upd_type, las_txnid);
WT_ERR(cursor->remove(cursor));
++remove_cnt;
}
@@ -1221,11 +1215,12 @@ err: __wt_buf_free(session, sweep_key);
(void)__wt_atomic_add64(
&cache->las_remove_count, remove_cnt);
}
- if (locked)
- __wt_writeunlock(session, &cache->las_sweepwalk_lock);
- WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
__las_restore_isolation(session, saved_isolation);
+ WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ if (locked)
+ __wt_writeunlock(session, &cache->las_sweepwalk_lock);
__wt_scr_free(session, &saved_key);
diff --git a/src/third_party/wiredtiger/src/checksum/arm64/crc32-arm64.c b/src/third_party/wiredtiger/src/checksum/arm64/crc32-arm64.c
index 01740dcd953..240c2a421bf 100644
--- a/src/third_party/wiredtiger/src/checksum/arm64/crc32-arm64.c
+++ b/src/third_party/wiredtiger/src/checksum/arm64/crc32-arm64.c
@@ -26,9 +26,16 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "wt_internal.h"
+#include <inttypes.h>
+#include <stddef.h>
-#if defined(__linux__) && defined(HAVE_CRC32_HARDWARE)
+/*
+ * The checksum code doesn't include WiredTiger configuration or include files.
+ * This means the HAVE_NO_CRC32_HARDWARE #define isn't configurable as part of
+ * standalone WiredTiger configuration, there's no way to turn off the checksum
+ * hardware.
+ */
+#if defined(__linux__) && !defined(HAVE_NO_CRC32_HARDWARE)
#include <asm/hwcap.h>
#include <sys/auxv.h>
@@ -84,23 +91,27 @@ __wt_checksum_hw(const void *chunk, size_t len)
}
#endif
+extern uint32_t __wt_checksum_sw(const void *chunk, size_t len);
+#if defined(__GNUC__)
+extern uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
+ __attribute__((visibility("default")));
+#else
+extern uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t);
+#endif
+
/*
- * __wt_checksum_init --
- * WiredTiger: detect CRC hardware and set the checksum function.
+ * wiredtiger_crc32c_func --
+ * WiredTiger: detect CRC hardware and return the checksum function.
*/
-void
-__wt_checksum_init(void)
- WT_GCC_FUNC_ATTRIBUTE((cold))
+uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
{
-#if defined(__linux__) && defined(HAVE_CRC32_HARDWARE)
+#if defined(__linux__) && !defined(HAVE_NO_CRC32_HARDWARE)
unsigned long caps = getauxval(AT_HWCAP);
if (caps & HWCAP_CRC32)
- __wt_process.checksum = __wt_checksum_hw;
- else
- __wt_process.checksum = __wt_checksum_sw;
-
+ return (__wt_checksum_hw);
+ return (__wt_checksum_sw);
#else
- __wt_process.checksum = __wt_checksum_sw;
+ return (__wt_checksum_sw);
#endif
}
diff --git a/src/third_party/wiredtiger/src/checksum/power8/crc32.sx b/src/third_party/wiredtiger/src/checksum/power8/crc32.sx
index 0b7870668b5..f8a89cd7c06 100644
--- a/src/third_party/wiredtiger/src/checksum/power8/crc32.sx
+++ b/src/third_party/wiredtiger/src/checksum/power8/crc32.sx
@@ -1,4 +1,11 @@
-#if defined(__powerpc64__)
+/*
+ * The checksum code doesn't include WiredTiger configuration or include files.
+ * This means the HAVE_NO_CRC32_HARDWARE #define isn't configurable as part of
+ * standalone WiredTiger configuration, there's no way to turn off the checksum
+ * hardware.
+ */
+#if defined(__powerpc64__) && !defined(HAVE_NO_CRC32_HARDWARE)
+
/*
* Calculate the checksum of data that is 16 byte aligned and a multiple of
* 16 bytes.
@@ -770,9 +777,11 @@ FUNC_START(__crc32_vpmsum)
FUNC_END(__crc32_vpmsum)
#endif
-/*
- * Make sure the stack isn't executable with GCC (regardless of platform).
- */
+/* Make sure the stack isn't executable with GCC (regardless of platform). */
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif
+/*
+ * DO NOT add an #endif after this line, this section must always be output
+ * and can never be #ifdef'd out as part of conditional compilation.
+ */
diff --git a/src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c b/src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c
index 8626fa42136..074891eb0e7 100644
--- a/src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c
+++ b/src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c
@@ -1,6 +1,13 @@
-#if defined(__powerpc64__)
-#include "wt_internal.h"
+#include <inttypes.h>
+#include <stddef.h>
+/*
+ * The checksum code doesn't include WiredTiger configuration or include files.
+ * This means the HAVE_NO_CRC32_HARDWARE #define isn't configurable as part of
+ * standalone WiredTiger configuration, there's no way to turn off the checksum
+ * hardware.
+ */
+#if defined(__powerpc64__) && !defined(HAVE_NO_CRC32_HARDWARE)
#define CRC_TABLE
#include "crc32_constants.h"
@@ -68,7 +75,6 @@ out:
return crc;
}
-#endif
/*
* __wt_checksum_hw --
@@ -79,18 +85,25 @@ __wt_checksum_hw(const void *chunk, size_t len)
{
return (crc32_vpmsum(0, chunk, len));
}
+#endif
+
+extern uint32_t __wt_checksum_sw(const void *chunk, size_t len);
+#if defined(__GNUC__)
+extern uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
+ __attribute__((visibility("default")));
+#else
+extern uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t);
+#endif
/*
- * __wt_checksum_init --
- * WiredTiger: detect CRC hardware and set the checksum function.
+ * wiredtiger_crc32c_func --
+ * WiredTiger: detect CRC hardware and return the checksum function.
*/
-void
-__wt_checksum_init(void)
- WT_GCC_FUNC_ATTRIBUTE((cold))
+uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
{
-#if defined(HAVE_CRC32_HARDWARE)
- __wt_process.checksum = __wt_checksum_hw;
+#if defined(__powerpc64__) && !defined(HAVE_NO_CRC32_HARDWARE)
+ return (__wt_checksum_hw);
#else
- __wt_process.checksum = __wt_checksum_sw;
+ return (__wt_checksum_sw);
#endif
}
diff --git a/src/third_party/wiredtiger/src/checksum/software/checksum.c b/src/third_party/wiredtiger/src/checksum/software/checksum.c
index 1228c9a0ce1..4d93f8bf1ea 100644
--- a/src/third_party/wiredtiger/src/checksum/software/checksum.c
+++ b/src/third_party/wiredtiger/src/checksum/software/checksum.c
@@ -38,7 +38,8 @@
* little endian.
*/
-#include "wt_internal.h"
+#include <inttypes.h>
+#include <stddef.h>
/*
* The CRC slicing tables.
@@ -1095,13 +1096,14 @@ static const uint32_t g_crc_slicing[8][256] = {
#endif
};
+extern uint32_t __wt_checksum_sw(const void *chunk, size_t len);
+
/*
* __wt_checksum_sw --
* Return a checksum for a chunk of memory, computed in software.
*/
uint32_t
__wt_checksum_sw(const void *chunk, size_t len)
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
uint32_t crc, next;
size_t nqwords;
diff --git a/src/third_party/wiredtiger/src/checksum/x86/crc32-x86.c b/src/third_party/wiredtiger/src/checksum/x86/crc32-x86.c
index 73199018a7d..70860019e02 100644
--- a/src/third_party/wiredtiger/src/checksum/x86/crc32-x86.c
+++ b/src/third_party/wiredtiger/src/checksum/x86/crc32-x86.c
@@ -26,9 +26,16 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "wt_internal.h"
+#include <inttypes.h>
+#include <stddef.h>
-#if defined(HAVE_CRC32_HARDWARE)
+/*
+ * The checksum code doesn't include WiredTiger configuration or include files.
+ * This means the HAVE_NO_CRC32_HARDWARE #define isn't configurable as part of
+ * standalone WiredTiger configuration, there's no way to turn off the checksum
+ * hardware.
+ */
+#if !defined(HAVE_NO_CRC32_HARDWARE)
#if (defined(__amd64) || defined(__x86_64))
/*
* __wt_checksum_hw --
@@ -116,17 +123,23 @@ __wt_checksum_hw(const void *chunk, size_t len)
return (~crc);
}
#endif
-#endif /* HAVE_CRC32_HARDWARE */
+#endif
+
+extern uint32_t __wt_checksum_sw(const void *chunk, size_t len);
+#if defined(__GNUC__)
+extern uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
+ __attribute__((visibility("default")));
+#else
+extern uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t);
+#endif
/*
- * __wt_checksum_init --
- * WiredTiger: detect CRC hardware and set the checksum function.
+ * wiredtiger_crc32c_func --
+ * WiredTiger: detect CRC hardware and return the checksum function.
*/
-void
-__wt_checksum_init(void)
- WT_GCC_FUNC_ATTRIBUTE((cold))
+uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
{
-#if defined(HAVE_CRC32_HARDWARE)
+#if !defined(HAVE_NO_CRC32_HARDWARE)
#if (defined(__amd64) || defined(__x86_64))
unsigned int eax, ebx, ecx, edx;
@@ -137,9 +150,8 @@ __wt_checksum_init(void)
#define CPUID_ECX_HAS_SSE42 (1 << 20)
if (ecx & CPUID_ECX_HAS_SSE42)
- __wt_process.checksum = __wt_checksum_hw;
- else
- __wt_process.checksum = __wt_checksum_sw;
+ return (__wt_checksum_hw);
+ return (__wt_checksum_sw);
#elif defined(_M_AMD64)
int cpuInfo[4];
@@ -148,14 +160,12 @@ __wt_checksum_init(void)
#define CPUID_ECX_HAS_SSE42 (1 << 20)
if (cpuInfo[2] & CPUID_ECX_HAS_SSE42)
- __wt_process.checksum = __wt_checksum_hw;
- else
- __wt_process.checksum = __wt_checksum_sw;
+ return (__wt_checksum_hw);
+ return (__wt_checksum_sw);
#else
- __wt_process.checksum = __wt_checksum_sw;
+ return (__wt_checksum_sw);
+#endif
+#else
+ return (__wt_checksum_sw);
#endif
-
-#else /* !HAVE_CRC32_HARDWARE */
- __wt_process.checksum = __wt_checksum_sw;
-#endif /* HAVE_CRC32_HARDWARE */
}
diff --git a/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c b/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
index ec7adb02cba..9c7ae30bf71 100644
--- a/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
+++ b/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
@@ -6,12 +6,19 @@
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*
*/
-#include "wt_internal.h"
#include <sys/types.h>
#include <endian.h>
+#include <inttypes.h>
+#include <stddef.h>
-#if defined(__linux__) && defined(HAVE_CRC32_HARDWARE)
+/*
+ * The checksum code doesn't include WiredTiger configuration or include files.
+ * This means the HAVE_NO_CRC32_HARDWARE #define isn't configurable as part of
+ * standalone WiredTiger configuration, there's no way to turn off the checksum
+ * hardware.
+ */
+#if defined(__linux__) && !defined(HAVE_NO_CRC32_HARDWARE)
#include <sys/auxv.h>
/* RHEL 7 has kernel support, but does not define this constant in the lib c headers. */
@@ -89,26 +96,30 @@ __wt_checksum_hw(const void *chunk, size_t len)
{
return (~__wt_crc32c_le_vx(0xffffffff, chunk, len));
}
+#endif
+extern uint32_t __wt_checksum_sw(const void *chunk, size_t len);
+#if defined(__GNUC__)
+extern uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
+ __attribute__((visibility("default")));
+#else
+extern uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t);
#endif
/*
- * __wt_checksum_init --
- * WiredTiger: detect CRC hardware and set the checksum function.
+ * wiredtiger_crc32c_func --
+ * WiredTiger: detect CRC hardware and return the checksum function.
*/
-void
-__wt_checksum_init(void)
- WT_GCC_FUNC_ATTRIBUTE((cold))
+uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
{
-#if defined(__linux__) && defined(HAVE_CRC32_HARDWARE)
+#if defined(__linux__) && !defined(HAVE_NO_CRC32_HARDWARE)
unsigned long caps = getauxval(AT_HWCAP);
if (caps & HWCAP_S390_VX)
- __wt_process.checksum = __wt_checksum_hw;
+ return (__wt_checksum_hw);
else
- __wt_process.checksum = __wt_checksum_sw;
-
+ return (__wt_checksum_sw);
#else
- __wt_process.checksum = __wt_checksum_sw;
+ return (__wt_checksum_sw);
#endif
}
diff --git a/src/third_party/wiredtiger/src/checksum/zseries/crc32le-vx.sx b/src/third_party/wiredtiger/src/checksum/zseries/crc32le-vx.sx
index 0f1392b0952..41ee20d3e94 100644
--- a/src/third_party/wiredtiger/src/checksum/zseries/crc32le-vx.sx
+++ b/src/third_party/wiredtiger/src/checksum/zseries/crc32le-vx.sx
@@ -1,4 +1,12 @@
/*
+ * The checksum code doesn't include WiredTiger configuration or include files.
+ * This means the HAVE_NO_CRC32_HARDWARE #define isn't configurable as part of
+ * standalone WiredTiger configuration, there's no way to turn off the checksum
+ * hardware.
+ */
+#if !defined(HAVE_NO_CRC32_HARDWARE)
+
+/*
* Hardware-accelerated CRC-32 variants for Linux on z Systems
*
* Use the z/Architecture Vector Extension Facility to accelerate the
@@ -271,10 +279,13 @@ crc32_le_vgfm_generic:
VLM %v8,%v15,0,%r15
lmg %r14,%r15,240(%r15)
br %r14
+#endif
-/*
- * Make sure the stack isn't executable with GCC (regardless of platform).
- */
-#ifndef __clang__
+/* Make sure the stack isn't executable with GCC (regardless of platform). */
+#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif
+/*
+ * DO NOT add an #endif after this line, this section must always be output
+ * and can never be #ifdef'd out as part of conditional compilation.
+ */
diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c
index 26636873902..60a319af15e 100644
--- a/src/third_party/wiredtiger/src/config/config_collapse.c
+++ b/src/third_party/wiredtiger/src/config/config_collapse.c
@@ -399,7 +399,7 @@ __wt_config_merge(WT_SESSION_IMPL *session,
* Sort the array by key and, in the case of identical keys, by
* generation.
*/
- qsort(merge.entries, merge.entries_next,
+ __wt_qsort(merge.entries, merge.entries_next,
sizeof(WT_CONFIG_MERGE_ENTRY), __config_merge_cmp);
/* Convert the array of entries into a string. */
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 7dee7a5e756..1bda9a62e25 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -47,7 +47,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_open_session[] = {
static const WT_CONFIG_CHECK confchk_WT_CONNECTION_query_timestamp[] = {
{ "get", "string",
NULL, "choices=[\"all_committed\",\"last_checkpoint\",\"oldest\""
- ",\"pinned\",\"recovery\",\"stable\"]",
+ ",\"oldest_reader\",\"pinned\",\"recovery\",\"stable\"]",
NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -138,6 +138,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "async", "category",
NULL, NULL,
confchk_wiredtiger_open_async_subconfigs, 3 },
+ { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -189,17 +190,18 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "timing_stress_for_test", "list",
NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\","
"\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\","
- "\"split_6\",\"split_7\"]",
+ "\"split_6\",\"split_7\",\"split_8\"]",
NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\","
- "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\","
- "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
- "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
- "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
- "\"transaction\",\"verify\",\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
+ "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
+ "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
+ "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
+ "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
+ "\"version\",\"write\"]",
NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -377,6 +379,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = {
{ "lsm", "category",
NULL, NULL,
confchk_WT_SESSION_create_lsm_subconfigs, 12 },
+ { "memory_page_image_max", "int", NULL, "min=0", NULL, 0 },
{ "memory_page_max", "int",
NULL, "min=512B,max=10TB",
NULL, 0 },
@@ -439,6 +442,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = {
{ "next_random_sample_size", "string", NULL, NULL, NULL, 0 },
{ "overwrite", "boolean", NULL, NULL, NULL, 0 },
{ "raw", "boolean", NULL, NULL, NULL, 0 },
+ { "read_once", "boolean", NULL, NULL, NULL, 0 },
{ "readonly", "boolean", NULL, NULL, NULL, 0 },
{ "skip_sort_check", "boolean", NULL, NULL, NULL, 0 },
{ "statistics", "list",
@@ -454,6 +458,14 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_prepare_transaction[] = {
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
+static const WT_CONFIG_CHECK confchk_WT_SESSION_query_timestamp[] = {
+ { "get", "string",
+ NULL, "choices=[\"commit\",\"first_commit\",\"prepare\","
+ "\"read\"]",
+ NULL, 0 },
+ { NULL, NULL, NULL, NULL, NULL, 0 }
+};
+
static const WT_CONFIG_CHECK confchk_WT_SESSION_reconfigure[] = {
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
{ "ignore_cache_size", "boolean", NULL, NULL, NULL, 0 },
@@ -566,6 +578,7 @@ static const WT_CONFIG_CHECK confchk_file_config[] = {
{ "log", "category",
NULL, NULL,
confchk_WT_SESSION_create_log_subconfigs, 1 },
+ { "memory_page_image_max", "int", NULL, "min=0", NULL, 0 },
{ "memory_page_max", "int",
NULL, "min=512B,max=10TB",
NULL, 0 },
@@ -633,6 +646,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = {
{ "log", "category",
NULL, NULL,
confchk_WT_SESSION_create_log_subconfigs, 1 },
+ { "memory_page_image_max", "int", NULL, "min=0", NULL, 0 },
{ "memory_page_max", "int",
NULL, "min=512B,max=10TB",
NULL, 0 },
@@ -719,6 +733,7 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = {
{ "lsm", "category",
NULL, NULL,
confchk_WT_SESSION_create_lsm_subconfigs, 12 },
+ { "memory_page_image_max", "int", NULL, "min=0", NULL, 0 },
{ "memory_page_max", "int",
NULL, "min=512B,max=10TB",
NULL, 0 },
@@ -806,6 +821,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "buffer_alignment", "int", NULL, "min=-1,max=1MB", NULL, 0 },
{ "builtin_extension_config", "string", NULL, NULL, NULL, 0 },
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
+ { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -863,6 +879,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2 },
{ "readonly", "boolean", NULL, NULL, NULL, 0 },
+ { "salvage", "boolean", NULL, NULL, NULL, 0 },
{ "session_max", "int", NULL, "min=1", NULL, 0 },
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "session_table_cache", "boolean", NULL, NULL, NULL, 0 },
@@ -879,7 +896,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "timing_stress_for_test", "list",
NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\","
"\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\","
- "\"split_6\",\"split_7\"]",
+ "\"split_6\",\"split_7\",\"split_8\"]",
NULL, 0 },
{ "transaction_sync", "category",
NULL, NULL,
@@ -888,13 +905,14 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\","
- "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\","
- "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
- "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
- "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
- "\"transaction\",\"verify\",\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
+ "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
+ "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
+ "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
+ "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
+ "\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -909,6 +927,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "buffer_alignment", "int", NULL, "min=-1,max=1MB", NULL, 0 },
{ "builtin_extension_config", "string", NULL, NULL, NULL, 0 },
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
+ { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -966,6 +985,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2 },
{ "readonly", "boolean", NULL, NULL, NULL, 0 },
+ { "salvage", "boolean", NULL, NULL, NULL, 0 },
{ "session_max", "int", NULL, "min=1", NULL, 0 },
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "session_table_cache", "boolean", NULL, NULL, NULL, 0 },
@@ -982,7 +1002,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "timing_stress_for_test", "list",
NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\","
"\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\","
- "\"split_6\",\"split_7\"]",
+ "\"split_6\",\"split_7\",\"split_8\"]",
NULL, 0 },
{ "transaction_sync", "category",
NULL, NULL,
@@ -991,13 +1011,14 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\","
- "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\","
- "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
- "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
- "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
- "\"transaction\",\"verify\",\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
+ "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
+ "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
+ "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
+ "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
+ "\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -1013,6 +1034,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "buffer_alignment", "int", NULL, "min=-1,max=1MB", NULL, 0 },
{ "builtin_extension_config", "string", NULL, NULL, NULL, 0 },
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
+ { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -1066,6 +1088,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2 },
{ "readonly", "boolean", NULL, NULL, NULL, 0 },
+ { "salvage", "boolean", NULL, NULL, NULL, 0 },
{ "session_max", "int", NULL, "min=1", NULL, 0 },
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "session_table_cache", "boolean", NULL, NULL, NULL, 0 },
@@ -1082,20 +1105,21 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "timing_stress_for_test", "list",
NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\","
"\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\","
- "\"split_6\",\"split_7\"]",
+ "\"split_6\",\"split_7\",\"split_8\"]",
NULL, 0 },
{ "transaction_sync", "category",
NULL, NULL,
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\","
- "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\","
- "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
- "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
- "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
- "\"transaction\",\"verify\",\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
+ "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
+ "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
+ "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
+ "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
+ "\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -1111,6 +1135,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "buffer_alignment", "int", NULL, "min=-1,max=1MB", NULL, 0 },
{ "builtin_extension_config", "string", NULL, NULL, NULL, 0 },
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
+ { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -1164,6 +1189,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2 },
{ "readonly", "boolean", NULL, NULL, NULL, 0 },
+ { "salvage", "boolean", NULL, NULL, NULL, 0 },
{ "session_max", "int", NULL, "min=1", NULL, 0 },
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "session_table_cache", "boolean", NULL, NULL, NULL, 0 },
@@ -1180,20 +1206,21 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "timing_stress_for_test", "list",
NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\","
"\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\","
- "\"split_6\",\"split_7\"]",
+ "\"split_6\",\"split_7\",\"split_8\"]",
NULL, 0 },
{ "transaction_sync", "category",
NULL, NULL,
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"evict\",\"evict_stuck\","
- "\"evictserver\",\"fileops\",\"handleops\",\"log\",\"lookaside\","
- "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
- "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
- "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
- "\"transaction\",\"verify\",\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
+ "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
+ "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
+ "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
+ "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
+ "\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -1250,13 +1277,13 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_WT_CONNECTION_query_timestamp, 1
},
{ "WT_CONNECTION.reconfigure",
- "async=(enabled=false,ops_max=1024,threads=2),cache_overhead=8,"
- "cache_size=100MB,checkpoint=(log_size=0,wait=0),"
- "compatibility=(release=),error_prefix=,eviction=(threads_max=8,"
- "threads_min=1),eviction_checkpoint_target=5,"
- "eviction_dirty_target=5,eviction_dirty_trigger=20,"
- "eviction_target=80,eviction_trigger=95,"
- "file_manager=(close_handle_minimum=250,close_idle_time=30,"
+ "async=(enabled=false,ops_max=1024,threads=2),cache_max_wait_ms=0"
+ ",cache_overhead=8,cache_size=100MB,checkpoint=(log_size=0,"
+ "wait=0),compatibility=(release=),error_prefix=,"
+ "eviction=(threads_max=8,threads_min=1),"
+ "eviction_checkpoint_target=1,eviction_dirty_target=5,"
+ "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
+ ",file_manager=(close_handle_minimum=250,close_idle_time=30,"
"close_scan_interval=10),log=(archive=true,prealloc=true,"
"zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4),"
"lsm_merge=true,operation_tracking=(enabled=false,path=\".\"),"
@@ -1264,7 +1291,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"statistics=none,statistics_log=(json=false,on_close=false,"
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,verbose=",
- confchk_WT_CONNECTION_reconfigure, 22
+ confchk_WT_CONNECTION_reconfigure, 23
},
{ "WT_CONNECTION.rollback_to_stable",
"",
@@ -1329,11 +1356,12 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8,"
"bloom_oldest=false,chunk_count_limit=0,chunk_max=5GB,"
"chunk_size=10MB,merge_custom=(prefix=,start_generation=0,"
- "suffix=),merge_max=15,merge_min=0),memory_page_max=5MB,"
- "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
- "prefix_compression_min=4,source=,split_deepen_min_child=0,"
- "split_deepen_per_child=0,split_pct=90,type=file,value_format=u",
- confchk_WT_SESSION_create, 43
+ "suffix=),merge_max=15,merge_min=0),memory_page_image_max=0,"
+ "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
+ "prefix_compression=false,prefix_compression_min=4,source=,"
+ "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,"
+ "type=file,value_format=u",
+ confchk_WT_SESSION_create, 44
},
{ "WT_SESSION.drop",
"checkpoint_wait=true,force=false,lock_wait=true,"
@@ -1357,14 +1385,18 @@ static const WT_CONFIG_ENTRY config_entries[] = {
{ "WT_SESSION.open_cursor",
"append=false,bulk=false,checkpoint=,checkpoint_wait=true,dump=,"
"next_random=false,next_random_sample_size=0,overwrite=true,"
- "raw=false,readonly=false,skip_sort_check=false,statistics=,"
- "target=",
- confchk_WT_SESSION_open_cursor, 13
+ "raw=false,read_once=false,readonly=false,skip_sort_check=false,"
+ "statistics=,target=",
+ confchk_WT_SESSION_open_cursor, 14
},
{ "WT_SESSION.prepare_transaction",
"prepare_timestamp=",
confchk_WT_SESSION_prepare_transaction, 1
},
+ { "WT_SESSION.query_timestamp",
+ "get=read",
+ confchk_WT_SESSION_query_timestamp, 1
+ },
{ "WT_SESSION.rebalance",
"",
NULL, 0
@@ -1433,11 +1465,12 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"internal_item_max=0,internal_key_max=0,"
"internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
"key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,"
- "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB,"
- "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
- "prefix_compression_min=4,split_deepen_min_child=0,"
- "split_deepen_per_child=0,split_pct=90,value_format=u",
- confchk_file_config, 36
+ "leaf_value_max=0,log=(enabled=true),memory_page_image_max=0,"
+ "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
+ "prefix_compression=false,prefix_compression_min=4,"
+ "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,"
+ "value_format=u",
+ confchk_file_config, 37
},
{ "file.meta",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
@@ -1450,11 +1483,12 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"internal_key_max=0,internal_key_truncate=true,"
"internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0,"
"leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0,"
- "log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0,"
- "os_cache_max=0,prefix_compression=false,prefix_compression_min=4"
- ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,"
- "value_format=u,version=(major=0,minor=0)",
- confchk_file_meta, 40
+ "log=(enabled=true),memory_page_image_max=0,memory_page_max=5MB,"
+ "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
+ "prefix_compression_min=4,split_deepen_min_child=0,"
+ "split_deepen_per_child=0,split_pct=90,value_format=u,"
+ "version=(major=0,minor=0)",
+ confchk_file_meta, 41
},
{ "index.meta",
"app_metadata=,collator=,columns=,extractor=,immutable=false,"
@@ -1476,11 +1510,12 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"bloom_config=,bloom_hash_count=8,bloom_oldest=false,"
"chunk_count_limit=0,chunk_max=5GB,chunk_size=10MB,"
"merge_custom=(prefix=,start_generation=0,suffix=),merge_max=15,"
- "merge_min=0),memory_page_max=5MB,old_chunks=,"
- "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
- "prefix_compression_min=4,split_deepen_min_child=0,"
- "split_deepen_per_child=0,split_pct=90,value_format=u",
- confchk_lsm_meta, 40
+ "merge_min=0),memory_page_image_max=0,memory_page_max=5MB,"
+ "old_chunks=,os_cache_dirty_max=0,os_cache_max=0,"
+ "prefix_compression=false,prefix_compression_min=4,"
+ "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,"
+ "value_format=u",
+ confchk_lsm_meta, 41
},
{ "table.meta",
"app_metadata=,colgroups=,collator=,columns=,key_format=u,"
@@ -1489,66 +1524,67 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "wiredtiger_open",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
- ",builtin_extension_config=,cache_cursors=true,cache_overhead=8,"
- "cache_size=100MB,checkpoint=(log_size=0,wait=0),"
- "checkpoint_sync=true,compatibility=(release=,require_max=,"
- "require_min=),config_base=true,create=false,direct_io=,"
- "encryption=(keyid=,name=,secretkey=),error_prefix=,"
- "eviction=(threads_max=8,threads_min=1),"
- "eviction_checkpoint_target=5,eviction_dirty_target=5,"
- "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
- ",exclusive=false,extensions=,file_extend=,"
- "file_manager=(close_handle_minimum=250,close_idle_time=30,"
- "close_scan_interval=10),hazard_max=1000,in_memory=false,"
- "log=(archive=true,compressor=,enabled=false,file_max=100MB,"
- "path=\".\",prealloc=true,recover=on,zero_fill=false),"
- "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
- "mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
- "path=\".\"),readonly=false,session_max=100,"
- "session_scratch_max=2MB,session_table_cache=true,"
- "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
- "statistics=none,statistics_log=(json=false,on_close=false,"
- "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "timing_stress_for_test=,transaction_sync=(enabled=false,"
- "method=fsync),use_environment=true,use_environment_priv=false,"
+ ",builtin_extension_config=,cache_cursors=true,"
+ "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB,"
+ "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
+ "compatibility=(release=,require_max=,require_min=),"
+ "config_base=true,create=false,direct_io=,encryption=(keyid=,"
+ "name=,secretkey=),error_prefix=,eviction=(threads_max=8,"
+ "threads_min=1),eviction_checkpoint_target=1,"
+ "eviction_dirty_target=5,eviction_dirty_trigger=20,"
+ "eviction_target=80,eviction_trigger=95,exclusive=false,"
+ "extensions=,file_extend=,file_manager=(close_handle_minimum=250,"
+ "close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
+ "in_memory=false,log=(archive=true,compressor=,enabled=false,"
+ "file_max=100MB,path=\".\",prealloc=true,recover=on,"
+ "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4),"
+ "lsm_merge=true,mmap=true,multiprocess=false,"
+ "operation_tracking=(enabled=false,path=\".\"),readonly=false,"
+ "salvage=false,session_max=100,session_scratch_max=2MB,"
+ "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
+ ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),use_environment=true,use_environment_priv=false,"
"verbose=,write_through=",
- confchk_wiredtiger_open, 45
+ confchk_wiredtiger_open, 47
},
{ "wiredtiger_open_all",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
- ",builtin_extension_config=,cache_cursors=true,cache_overhead=8,"
- "cache_size=100MB,checkpoint=(log_size=0,wait=0),"
- "checkpoint_sync=true,compatibility=(release=,require_max=,"
- "require_min=),config_base=true,create=false,direct_io=,"
- "encryption=(keyid=,name=,secretkey=),error_prefix=,"
- "eviction=(threads_max=8,threads_min=1),"
- "eviction_checkpoint_target=5,eviction_dirty_target=5,"
- "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
- ",exclusive=false,extensions=,file_extend=,"
- "file_manager=(close_handle_minimum=250,close_idle_time=30,"
- "close_scan_interval=10),hazard_max=1000,in_memory=false,"
- "log=(archive=true,compressor=,enabled=false,file_max=100MB,"
- "path=\".\",prealloc=true,recover=on,zero_fill=false),"
- "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
- "mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
- "path=\".\"),readonly=false,session_max=100,"
- "session_scratch_max=2MB,session_table_cache=true,"
- "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
- "statistics=none,statistics_log=(json=false,on_close=false,"
- "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "timing_stress_for_test=,transaction_sync=(enabled=false,"
- "method=fsync),use_environment=true,use_environment_priv=false,"
+ ",builtin_extension_config=,cache_cursors=true,"
+ "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB,"
+ "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
+ "compatibility=(release=,require_max=,require_min=),"
+ "config_base=true,create=false,direct_io=,encryption=(keyid=,"
+ "name=,secretkey=),error_prefix=,eviction=(threads_max=8,"
+ "threads_min=1),eviction_checkpoint_target=1,"
+ "eviction_dirty_target=5,eviction_dirty_trigger=20,"
+ "eviction_target=80,eviction_trigger=95,exclusive=false,"
+ "extensions=,file_extend=,file_manager=(close_handle_minimum=250,"
+ "close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
+ "in_memory=false,log=(archive=true,compressor=,enabled=false,"
+ "file_max=100MB,path=\".\",prealloc=true,recover=on,"
+ "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4),"
+ "lsm_merge=true,mmap=true,multiprocess=false,"
+ "operation_tracking=(enabled=false,path=\".\"),readonly=false,"
+ "salvage=false,session_max=100,session_scratch_max=2MB,"
+ "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
+ ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),use_environment=true,use_environment_priv=false,"
"verbose=,version=(major=0,minor=0),write_through=",
- confchk_wiredtiger_open_all, 46
+ confchk_wiredtiger_open_all, 48
},
{ "wiredtiger_open_basecfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
- ",builtin_extension_config=,cache_cursors=true,cache_overhead=8,"
- "cache_size=100MB,checkpoint=(log_size=0,wait=0),"
- "checkpoint_sync=true,compatibility=(release=,require_max=,"
- "require_min=),direct_io=,encryption=(keyid=,name=,secretkey=),"
- "error_prefix=,eviction=(threads_max=8,threads_min=1),"
- "eviction_checkpoint_target=5,eviction_dirty_target=5,"
+ ",builtin_extension_config=,cache_cursors=true,"
+ "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB,"
+ "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
+ "compatibility=(release=,require_max=,require_min=),direct_io=,"
+ "encryption=(keyid=,name=,secretkey=),error_prefix=,"
+ "eviction=(threads_max=8,threads_min=1),"
+ "eviction_checkpoint_target=1,eviction_dirty_target=5,"
"eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
",extensions=,file_extend=,file_manager=(close_handle_minimum=250"
",close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
@@ -1556,23 +1592,24 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"path=\".\",prealloc=true,recover=on,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
"mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
- "path=\".\"),readonly=false,session_max=100,"
+ "path=\".\"),readonly=false,salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
"statistics=none,statistics_log=(json=false,on_close=false,"
"path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),verbose=,version=(major=0,minor=0),write_through=",
- confchk_wiredtiger_open_basecfg, 40
+ confchk_wiredtiger_open_basecfg, 42
},
{ "wiredtiger_open_usercfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
- ",builtin_extension_config=,cache_cursors=true,cache_overhead=8,"
- "cache_size=100MB,checkpoint=(log_size=0,wait=0),"
- "checkpoint_sync=true,compatibility=(release=,require_max=,"
- "require_min=),direct_io=,encryption=(keyid=,name=,secretkey=),"
- "error_prefix=,eviction=(threads_max=8,threads_min=1),"
- "eviction_checkpoint_target=5,eviction_dirty_target=5,"
+ ",builtin_extension_config=,cache_cursors=true,"
+ "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB,"
+ "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
+ "compatibility=(release=,require_max=,require_min=),direct_io=,"
+ "encryption=(keyid=,name=,secretkey=),error_prefix=,"
+ "eviction=(threads_max=8,threads_min=1),"
+ "eviction_checkpoint_target=1,eviction_dirty_target=5,"
"eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
",extensions=,file_extend=,file_manager=(close_handle_minimum=250"
",close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
@@ -1580,14 +1617,14 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"path=\".\",prealloc=true,recover=on,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
"mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
- "path=\".\"),readonly=false,session_max=100,"
+ "path=\".\"),readonly=false,salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
"statistics=none,statistics_log=(json=false,on_close=false,"
"path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),verbose=,write_through=",
- confchk_wiredtiger_open_usercfg, 39
+ confchk_wiredtiger_open_usercfg, 41
},
{ NULL, NULL, NULL, 0 }
};
diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c
index fd488c02f80..89c8571eaac 100644
--- a/src/third_party/wiredtiger/src/conn/api_strerror.c
+++ b/src/third_party/wiredtiger/src/conn/api_strerror.c
@@ -40,6 +40,8 @@ __wt_wiredtiger_error(int error)
return ("WT_CACHE_FULL: operation would overflow cache");
case WT_PREPARE_CONFLICT:
return ("WT_PREPARE_CONFLICT: conflict with a prepared update");
+ case WT_TRY_SALVAGE:
+ return ("WT_TRY_SALVAGE: database corruption detected");
}
/* Windows strerror doesn't support ENOTSUP. */
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 97e0950a586..e98be368aa5 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -137,7 +137,7 @@ __conn_add_collator(WT_CONNECTION *wt_conn,
CONNECTION_API_CALL(conn, session, add_collator, config, cfg);
WT_UNUSED(cfg);
- if (WT_STREQ(name, "none"))
+ if (strcmp(name, "none") == 0)
WT_ERR_MSG(session, EINVAL,
"invalid name for a collator: %s", name);
@@ -243,7 +243,7 @@ __conn_add_compressor(WT_CONNECTION *wt_conn,
CONNECTION_API_CALL(conn, session, add_compressor, config, cfg);
WT_UNUSED(cfg);
- if (WT_STREQ(name, "none"))
+ if (strcmp(name, "none") == 0)
WT_ERR_MSG(session, EINVAL,
"invalid name for a compressor: %s", name);
@@ -482,7 +482,7 @@ __conn_add_encryptor(WT_CONNECTION *wt_conn,
CONNECTION_API_CALL(conn, session, add_encryptor, config, cfg);
WT_UNUSED(cfg);
- if (WT_STREQ(name, "none"))
+ if (strcmp(name, "none") == 0)
WT_ERR_MSG(session, EINVAL,
"invalid name for an encryptor: %s", name);
@@ -578,7 +578,7 @@ __conn_add_extractor(WT_CONNECTION *wt_conn,
CONNECTION_API_CALL(conn, session, add_extractor, config, cfg);
WT_UNUSED(cfg);
- if (WT_STREQ(name, "none"))
+ if (strcmp(name, "none") == 0)
WT_ERR_MSG(session, EINVAL,
"invalid name for an extractor: %s", name);
@@ -1256,7 +1256,7 @@ __conn_query_timestamp(WT_CONNECTION *wt_conn,
conn = (WT_CONNECTION_IMPL *)wt_conn;
CONNECTION_API_CALL(conn, session, query_timestamp, config, cfg);
- WT_TRET(__wt_txn_global_query_timestamp(session, hex_timestamp, cfg));
+ WT_TRET(__wt_txn_query_timestamp(session, hex_timestamp, cfg, true));
err: API_END_RET(session, ret);
}
@@ -1692,11 +1692,11 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[])
is_create || exist ? WT_FS_OPEN_CREATE : 0, &conn->lock_fh);
/*
- * If this is a read-only connection and we cannot grab the lock
- * file, check if it is because there is not write permission or
- * if the file does not exist. If so, then ignore the error.
- * XXX Ignoring the error does allow multiple read-only
- * connections to exist at the same time on a read-only directory.
+ * If this is a read-only connection and we cannot grab the lock file,
+ * check if it is because there's no write permission or if the file
+ * does not exist. If so, then ignore the error.
+ * XXX Ignoring the error does allow multiple read-only connections to
+ * exist at the same time on a read-only directory.
*
* If we got an expected permission or non-existence error then skip
* the byte lock.
@@ -1832,6 +1832,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "checkpoint", WT_VERB_CHECKPOINT },
{ "checkpoint_progress",WT_VERB_CHECKPOINT_PROGRESS },
{ "compact", WT_VERB_COMPACT },
+ { "error_returns", WT_VERB_ERROR_RETURNS },
{ "evict", WT_VERB_EVICT },
{ "evict_stuck", WT_VERB_EVICT_STUCK },
{ "evictserver", WT_VERB_EVICTSERVER },
@@ -2024,6 +2025,7 @@ __wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "split_5", WT_TIMING_STRESS_SPLIT_5 },
{ "split_6", WT_TIMING_STRESS_SPLIT_6 },
{ "split_7", WT_TIMING_STRESS_SPLIT_7 },
+ { "split_8", WT_TIMING_STRESS_SPLIT_8 },
{ NULL, 0 }
};
WT_CONFIG_ITEM cval, sval;
@@ -2359,7 +2361,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_DECL_RET;
const WT_NAME_FLAG *ft;
WT_SESSION_IMPL *session;
- bool config_base_set;
+ bool config_base_set, try_salvage;
const char *enc_cfg[] = { NULL, NULL }, *merge_cfg;
char version[64];
@@ -2372,6 +2374,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
conn = NULL;
session = NULL;
merge_cfg = NULL;
+ try_salvage = false;
WT_RET(__wt_library_init());
@@ -2583,10 +2586,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval));
conn->session_scratch_max = (size_t)cval.val;
- WT_ERR(__wt_config_gets(session, cfg, "checkpoint_sync", &cval));
- if (cval.val)
- F_SET(conn, WT_CONN_CKPT_SYNC);
-
+ /*
+ * If buffer alignment is not configured, use zero unless direct I/O is
+ * also configured, in which case use the build-time default. The code
+ * to parse write through is also here because it is nearly identical
+ * to direct I/O.
+ */
WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
for (ft = file_types; ft->name != NULL; ft++) {
ret = __wt_config_subgets(session, &cval, ft->name, &sval);
@@ -2607,10 +2612,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR_NOTFOUND_OK(ret);
}
- /*
- * If buffer alignment is not configured, use zero unless direct I/O is
- * also configured, in which case use the build-time default.
- */
WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
if (cval.val == -1)
conn->buffer_alignment =
@@ -2623,7 +2624,21 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
"buffer_alignment requires posix_memalign");
#endif
+ WT_ERR(__wt_config_gets(session, cfg, "cache_cursors", &cval));
+ if (cval.val)
+ F_SET(conn, WT_CONN_CACHE_CURSORS);
+
+ WT_ERR(__wt_config_gets(session, cfg, "checkpoint_sync", &cval));
+ if (cval.val)
+ F_SET(conn, WT_CONN_CKPT_SYNC);
+
WT_ERR(__wt_config_gets(session, cfg, "file_extend", &cval));
+ /*
+ * If the log extend length is not set use the default of the configured
+ * maximum log file size. That size is not known until it is initialized
+ * as part of the log server initialization.
+ */
+ conn->log_extend_len = WT_CONFIG_UNSET;
for (ft = file_types; ft->name != NULL; ft++) {
ret = __wt_config_subgets(session, &cval, ft->name, &sval);
if (ret == 0) {
@@ -2632,7 +2647,21 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
conn->data_extend_len = sval.val;
break;
case WT_DIRECT_IO_LOG:
- conn->log_extend_len = sval.val;
+ /*
+ * When using "file_extend=(log=)", the val
+ * returned is 1. Unset the log extend length
+ * in that case to use the default.
+ */
+ if (sval.val == 1)
+ conn->log_extend_len = WT_CONFIG_UNSET;
+ else if (sval.val == 0 ||
+ (sval.val >= WT_LOG_FILE_MIN &&
+ sval.val <= WT_LOG_FILE_MAX))
+ conn->log_extend_len = sval.val;
+ else
+ WT_ERR_MSG(session, EINVAL,
+ "invalid log extend length: %"
+ PRId64, sval.val);
break;
}
} else
@@ -2642,9 +2671,14 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
conn->mmap = cval.val != 0;
- WT_ERR(__wt_config_gets(session, cfg, "cache_cursors", &cval));
- if (cval.val)
- F_SET(conn, WT_CONN_CACHE_CURSORS);
+ WT_ERR(__wt_config_gets(session, cfg, "salvage", &cval));
+ if (cval.val) {
+ if (F_ISSET(conn, WT_CONN_READONLY))
+ WT_ERR_MSG(session, EINVAL,
+ "Readonly configuration incompatible with "
+ "salvage.");
+ F_SET(conn, WT_CONN_SALVAGE);
+ }
WT_ERR(__wt_conn_statistics_config(session, cfg));
WT_ERR(__wt_lsm_manager_config(session, cfg));
@@ -2712,6 +2746,15 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
*/
WT_ERR(__wt_turtle_init(session));
+ /*
+ * If the user wants to salvage, do so before opening the
+ * metadata cursor. We do this after the call to wt_turtle_init
+ * because that moves metadata files around from backups and
+ * would overwrite any salvage we did if done before that call.
+ */
+ if (F_ISSET(conn, WT_CONN_SALVAGE))
+ WT_ERR(__wt_metadata_salvage(session));
+
WT_ERR(__wt_metadata_cursor(session, NULL));
/* Start the worker threads and run recovery. */
@@ -2743,20 +2786,25 @@ err: /* Discard the scratch buffers. */
*/
if (ret == WT_RUN_RECOVERY)
F_SET(conn, WT_CONN_PANIC);
+ /*
+ * If we detected a data corruption issue, we really want to
+ * indicate the corruption instead of whatever error was set.
+ * We cannot use standard return macros because we don't want
+ * to generalize this. Record it here while we have the
+ * connection and set it after we destroy the connection.
+ */
+ if (F_ISSET(conn, WT_CONN_DATA_CORRUPTION) &&
+ (ret == WT_PANIC || ret == WT_ERROR))
+ try_salvage = true;
WT_TRET(__wt_connection_close(conn));
+ /*
+ * Depending on the error, shutting down the connection may
+ * again return WT_PANIC. So if we detected the corruption
+ * above, set it here after closing.
+ */
+ if (try_salvage)
+ ret = WT_TRY_SALVAGE;
}
return (ret);
}
-
-/*
- * wiredtiger_checksum_crc32c --
- * CRC32C checksum function entry point.
- */
-uint32_t
-wiredtiger_checksum_crc32c(const void *buffer, size_t len)
-{
- if (__wt_process.checksum == NULL)
- __wt_checksum_init();
- return (__wt_process.checksum(buffer, len));
-}
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c
index 00de16e6c21..dbb602921a8 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache.c
@@ -143,6 +143,10 @@ __cache_config_local(WT_SESSION_IMPL *session, bool shared, const char *cfg[])
conn->evict_threads_max = evict_threads_max;
conn->evict_threads_min = evict_threads_min;
+ /* Retrieve the wait time and convert from milliseconds */
+ WT_RET(__wt_config_gets(session, cfg, "cache_max_wait_ms", &cval));
+ cache->cache_max_wait_us = (uint64_t)(cval.val * WT_THOUSAND);
+
return (0);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
index f1043ee7546..63fd9486823 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -342,7 +342,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
FLD_CLR(cache->pool_flags, WT_CACHE_POOL_RUN);
__wt_cond_signal(session, cp->cache_pool_cond);
- WT_TRET(__wt_thread_join(session, cache->cp_tid));
+ WT_TRET(__wt_thread_join(session, &cache->cp_tid));
wt_session = &cache->cp_session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
index 8691a72cc47..99bfdca0331 100644
--- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -219,7 +219,7 @@ __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
if (conn->ckpt_tid_set) {
__wt_cond_signal(session, conn->ckpt_cond);
- WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
+ WT_TRET(__wt_thread_join(session, &conn->ckpt_tid));
conn->ckpt_tid_set = false;
}
__wt_cond_destroy(session, &conn->ckpt_cond);
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 7c24f3c126f..eeaa71683f1 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -47,7 +47,7 @@ __conn_dhandle_config_set(WT_SESSION_IMPL *session)
if ((ret =
__wt_metadata_search(session, dhandle->name, &metaconf)) != 0) {
if (ret == WT_NOTFOUND)
- ret = ENOENT;
+ ret = __wt_set_return(session, ENOENT);
WT_RET(ret);
}
@@ -140,10 +140,11 @@ __wt_conn_dhandle_alloc(
dhandle->type = WT_DHANDLE_TYPE_BTREE;
} else if (WT_PREFIX_MATCH(uri, "table:")) {
WT_RET(__wt_calloc_one(session, &table));
- dhandle = &table->iface;
+ dhandle = (WT_DATA_HANDLE *)table;
dhandle->type = WT_DHANDLE_TYPE_TABLE;
} else
- return (__wt_illegal_value(session, NULL));
+ WT_PANIC_RET(session, EINVAL,
+ "illegal handle allocation URI %s", uri);
/* Btree handles keep their data separate from the interface. */
if (dhandle->type == WT_DHANDLE_TYPE_BTREE) {
@@ -703,7 +704,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final)
/* Check if the handle was reacquired by a session while we waited. */
if (!final &&
(dhandle->session_inuse != 0 || dhandle->session_ref != 0))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket);
return (0);
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index d8eb095d6d2..2f3d8d64dad 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -277,6 +277,15 @@ __logmgr_config(
if (!reconfig) {
WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
conn->log_file_max = (wt_off_t)cval.val;
+ /*
+ * With the default log file extend configuration or if the log
+ * file extension size is larger than the configured maximum log
+ * file size, set the log file extension size to the configured
+ * maximum log file size.
+ */
+ if (conn->log_extend_len == WT_CONFIG_UNSET ||
+ conn->log_extend_len > conn->log_file_max)
+ conn->log_extend_len = conn->log_file_max;
WT_STAT_CONN_SET(session, log_max_filesize, conn->log_file_max);
}
@@ -883,7 +892,6 @@ __log_wrlsn_server(void *arg)
__wt_log_wrlsn(session, NULL);
if (0) {
err: WT_PANIC_MSG(session, ret, "log wrlsn server error");
-
}
return (WT_THREAD_RET_VALUE);
}
@@ -1171,12 +1179,12 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
}
if (conn->log_tid_set) {
__wt_cond_signal(session, conn->log_cond);
- WT_TRET(__wt_thread_join(session, conn->log_tid));
+ WT_TRET(__wt_thread_join(session, &conn->log_tid));
conn->log_tid_set = false;
}
if (conn->log_file_tid_set) {
__wt_cond_signal(session, conn->log_file_cond);
- WT_TRET(__wt_thread_join(session, conn->log_file_tid));
+ WT_TRET(__wt_thread_join(session, &conn->log_file_tid));
conn->log_file_tid_set = false;
}
if (conn->log_file_session != NULL) {
@@ -1186,7 +1194,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
}
if (conn->log_wrlsn_tid_set) {
__wt_cond_signal(session, conn->log_wrlsn_cond);
- WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
+ WT_TRET(__wt_thread_join(session, &conn->log_wrlsn_tid));
conn->log_wrlsn_tid_set = false;
}
if (conn->log_wrlsn_session != NULL) {
diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c
index 8672e824579..8bf2a53e49b 100644
--- a/src/third_party/wiredtiger/src/conn/conn_reconfig.c
+++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c
@@ -232,8 +232,7 @@ done: conn->req_max_major = max_major;
conn->req_min_major = min_major;
conn->req_min_minor = min_minor;
-err: if (value != NULL)
- __wt_free(session, value);
+err: __wt_free(session, value);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
index 14a1570c138..ffbc1caf2bb 100644
--- a/src/third_party/wiredtiger/src/conn/conn_stat.c
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -501,17 +501,17 @@ static int
__statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
{
struct timespec ts;
- struct tm *tm, _tm;
+ struct tm localt;
WT_CONNECTION_IMPL *conn;
conn = S2C(session);
/* Get the current local time of day. */
__wt_epoch(session, &ts);
- tm = localtime_r(&ts.tv_sec, &_tm);
+ WT_RET(__wt_localtime(session, &ts.tv_sec, &localt));
/* Create the logging path name for this time of day. */
- if (strftime(tmp->mem, tmp->memsize, conn->stat_path, tm) == 0)
+ if (strftime(tmp->mem, tmp->memsize, conn->stat_path, &localt) == 0)
WT_RET_MSG(session, ENOMEM, "strftime path conversion");
/* If the path has changed, cycle the log file. */
@@ -527,7 +527,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
}
/* Create the entry prefix for this time of day. */
- if (strftime(tmp->mem, tmp->memsize, conn->stat_format, tm) == 0)
+ if (strftime(tmp->mem, tmp->memsize, conn->stat_format, &localt) == 0)
WT_RET_MSG(session, ENOMEM, "strftime timestamp conversion");
conn->stat_stamp = tmp->mem;
WT_RET(__statlog_print_header(session));
@@ -742,7 +742,7 @@ __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close)
F_CLR(conn, WT_CONN_SERVER_STATISTICS);
if (conn->stat_tid_set) {
__wt_cond_signal(session, conn->stat_cond);
- WT_TRET(__wt_thread_join(session, conn->stat_tid));
+ WT_TRET(__wt_thread_join(session, &conn->stat_tid));
conn->stat_tid_set = false;
}
__wt_cond_destroy(session, &conn->stat_cond);
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 33342fb4873..734c455d854 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -451,7 +451,7 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
F_CLR(conn, WT_CONN_SERVER_SWEEP);
if (conn->sweep_tid_set) {
__wt_cond_signal(session, conn->sweep_cond);
- WT_TRET(__wt_thread_join(session, conn->sweep_tid));
+ WT_TRET(__wt_thread_join(session, &conn->sweep_tid));
conn->sweep_tid_set = 0;
}
__wt_cond_destroy(session, &conn->sweep_cond);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index a9e08cfa4d8..4b1dfbcb1c8 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -77,8 +77,8 @@ __curbackup_close(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
cb = (WT_CURSOR_BACKUP *)cursor;
-
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+err:
/*
* When starting a hot backup, we serialize hot backup cursors and set
@@ -92,10 +92,10 @@ __curbackup_close(WT_CURSOR *cursor)
if (F_ISSET(cb, WT_CURBACKUP_LOCKER))
WT_TRET(__backup_stop(session, cb));
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
session->bkp_cursor = NULL;
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -133,17 +133,15 @@ __wt_curbackup_open(WT_SESSION_IMPL *session,
WT_STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0);
- cb = NULL;
-
WT_RET(__wt_calloc_one(session, &cb));
- cursor = &cb->iface;
+ cursor = (WT_CURSOR *)cb;
*cursor = iface;
- cursor->session = &session->iface;
- session->bkp_cursor = cb;
-
+ cursor->session = (WT_SESSION *)session;
cursor->key_format = "S"; /* Return the file names as the key. */
cursor->value_format = ""; /* No value. */
+ session->bkp_cursor = cb;
+
/*
* Start the backup and fill in the cursor's list. Acquire the schema
* lock, we need a consistent view when creating a copy.
@@ -181,7 +179,7 @@ __backup_log_append(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, bool active)
ret = 0;
if (conn->log) {
- WT_ERR(__wt_log_get_all_files(
+ WT_ERR(__wt_log_get_backup_files(
session, &logfiles, &logcount, &cb->maxid, active));
for (i = 0; i < logcount; i++)
WT_ERR(__backup_list_append(session, cb, logfiles[i]));
@@ -261,6 +259,22 @@ __backup_start(
target_list = false;
WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));
if (!target_list) {
+ /*
+ * It's important to first gather the log files to be copied
+ * (which internally starts a new log file), followed by
+ * choosing a checkpoint to reference in the WiredTiger.backup
+ * file.
+ *
+ * Applications may have logic that takes a checkpoint, followed
+ * by performing a write that should only appear in the new
+ * checkpoint. This ordering prevents choosing the prior
+ * checkpoint, but including the write in the log files
+ * returned.
+ *
+ * It is also possible, and considered legal, to choose the new
+ * checkpoint, but not include the log file that contains the
+ * log entry for taking the new checkpoint.
+ */
WT_ERR(__backup_log_append(session, cb, true));
WT_ERR(__backup_all(session));
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_config.c b/src/third_party/wiredtiger/src/cursor/cur_config.c
index 98c59392161..e4d1f3e94e0 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_config.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_config.c
@@ -19,9 +19,11 @@ __curconfig_close(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
- WT_TRET(__wt_cursor_close(cursor));
+err:
-err: API_END_RET(session, ret);
+ __wt_cursor_close(cursor);
+
+ API_END_RET(session, ret);
}
/*
@@ -60,10 +62,9 @@ __wt_curconfig_open(WT_SESSION_IMPL *session,
WT_STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0);
WT_RET(__wt_calloc_one(session, &cconfig));
-
- cursor = &cconfig->iface;
+ cursor = (WT_CURSOR *)cconfig;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
cursor->key_format = cursor->value_format = "S";
WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c
index 1eb778ed0c9..d82fa934ddf 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_ds.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c
@@ -443,11 +443,11 @@ __curds_close(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
cds = (WT_CURSOR_DATA_SOURCE *)cursor;
-
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+err:
if (cds->source != NULL)
- ret = cds->source->close(cds->source);
+ WT_TRET(cds->source->close(cds->source));
if (cds->collator_owned) {
if (cds->collator->terminate != NULL)
@@ -464,9 +464,9 @@ __curds_close(WT_CURSOR *cursor)
__wt_free(session, cursor->key_format);
__wt_free(session, cursor->value_format);
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -507,13 +507,12 @@ __wt_curds_open(
WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0);
- data_source = NULL;
metaconf = NULL;
WT_RET(__wt_calloc_one(session, &data_source));
- cursor = &data_source->iface;
+ cursor = (WT_CURSOR *)data_source;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
/*
* XXX
diff --git a/src/third_party/wiredtiger/src/cursor/cur_dump.c b/src/third_party/wiredtiger/src/cursor/cur_dump.c
index 8853e6f30d6..2ac8823ddb9 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_dump.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_dump.c
@@ -337,16 +337,17 @@ __curdump_close(WT_CURSOR *cursor)
cdump = (WT_CURSOR_DUMP *)cursor;
child = cdump->child;
-
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+err:
+
if (child != NULL)
WT_TRET(child->close(child));
/* We shared the child's URI. */
cursor->internal_uri = NULL;
__wt_json_close(session, cursor);
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -389,7 +390,7 @@ __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp)
session = (WT_SESSION_IMPL *)child->session;
WT_RET(__wt_calloc_one(session, &cdump));
- cursor = &cdump->iface;
+ cursor = (WT_CURSOR *)cdump;
*cursor = iface;
cursor->session = child->session;
cursor->internal_uri = child->internal_uri;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index 93f1392aef9..0b26b931f6c 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -469,15 +469,18 @@ __curfile_close(WT_CURSOR *cursor)
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, cbt->btree);
- released = false;
+err:
- /*
- * If releasing the cursor fails in any way, it will be left
- * in a state that allows it to be normally closed.
- */
- WT_TRET(__wt_cursor_cache_release(session, cursor, &released));
- if (released)
- goto done;
+ /* Only try to cache the cursor if there's no error. */
+ if (ret == 0) {
+ /*
+ * If releasing the cursor fails in any way, it will be left in
+ * a state that allows it to be normally closed.
+ */
+ ret = __wt_cursor_cache_release(session, cursor, &released);
+ if (released)
+ goto done;
+ }
dead = F_ISSET(cursor, WT_CURSTD_DEAD);
if (F_ISSET(cursor, WT_CURSTD_BULK)) {
@@ -494,7 +497,7 @@ __curfile_close(WT_CURSOR *cursor)
WT_ASSERT(session, session->dhandle == NULL ||
session->dhandle->session_inuse > 0);
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
/*
* Note: release the data handle last so that cursor statistics are
@@ -513,8 +516,7 @@ __curfile_close(WT_CURSOR *cursor)
WT_TRET(__wt_session_release_dhandle(session));
}
-done:
-err: API_END_RET(session, ret);
+done: API_END_RET(session, ret);
}
/*
@@ -632,18 +634,16 @@ __curfile_create(WT_SESSION_IMPL *session,
WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);
- cbt = NULL;
- cacheable = F_ISSET(session, WT_SESSION_CACHE_CURSORS) && !bulk;
-
btree = S2BT(session);
WT_ASSERT(session, btree != NULL);
csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
- WT_RET(__wt_calloc(session, 1, csize, &cbt));
+ cacheable = F_ISSET(session, WT_SESSION_CACHE_CURSORS) && !bulk;
- cursor = &cbt->iface;
+ WT_RET(__wt_calloc(session, 1, csize, &cbt));
+ cursor = (WT_CURSOR *)cbt;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
cursor->internal_uri = btree->dhandle->name;
cursor->key_format = btree->key_format;
cursor->value_format = btree->value_format;
@@ -693,6 +693,10 @@ __curfile_create(WT_SESSION_IMPL *session,
cacheable = false;
}
+ WT_ERR(__wt_config_gets_def(session, cfg, "read_once", 0, &cval));
+ if (cval.val != 0)
+ F_SET(cbt, WT_CBT_READ_ONCE);
+
/* Underlying btree initialization. */
__wt_btcur_open(cbt);
@@ -710,7 +714,7 @@ __curfile_create(WT_SESSION_IMPL *session,
* WiredTiger.wt should not be cached, doing so interferes
* with named checkpoints.
*/
- if (cacheable && !WT_STREQ(WT_METAFILE_URI, cursor->internal_uri))
+ if (cacheable && strcmp(WT_METAFILE_URI, cursor->internal_uri) != 0)
F_SET(cursor, WT_CURSTD_CACHEABLE);
WT_ERR(__wt_cursor_init(
@@ -724,8 +728,7 @@ err: /*
* Our caller expects to release the data handle if we fail.
* Disconnect it from the cursor before closing.
*/
- if (session->dhandle != NULL)
- __wt_cursor_dhandle_decr_use(session);
+ __wt_cursor_dhandle_decr_use(session);
cbt->btree = NULL;
WT_TRET(__curfile_close(cursor));
*cursorp = NULL;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
index 9e75442a243..627bfbe2f44 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_index.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -359,8 +359,8 @@ __curindex_close(WT_CURSOR *cursor)
cindex = (WT_CURSOR_INDEX *)cursor;
idx = cindex->index;
-
JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+err:
if ((cp = cindex->cg_cursors) != NULL)
for (i = 0, cp = cindex->cg_cursors;
@@ -385,9 +385,9 @@ __curindex_close(WT_CURSOR *cursor)
WT_TRET(__wt_schema_release_table(session, cindex->table));
/* The URI is owned by the index. */
cursor->internal_uri = NULL;
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -494,9 +494,9 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
}
WT_RET(__wt_calloc_one(session, &cindex));
- cursor = &cindex->iface;
+ cursor = (WT_CURSOR *)cindex;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
cindex->table = table;
cindex->index = idx;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c
index af11ced4ff1..1a23f4a51fc 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_join.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_join.c
@@ -188,7 +188,7 @@ __curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos)
size = strlen(to_dup->internal_uri) + 3;
WT_ERR(__wt_calloc(session, size, 1, &uri));
WT_ERR(__wt_snprintf(uri, size, "%s()", to_dup->internal_uri));
- if ((c = iter->cursor) == NULL || !WT_STREQ(c->uri, uri)) {
+ if ((c = iter->cursor) == NULL || strcmp(c->uri, uri) != 0) {
iter->cursor = NULL;
if (c != NULL)
WT_ERR(c->close(c));
@@ -324,8 +324,8 @@ __curjoin_close(WT_CURSOR *cursor)
u_int i;
cjoin = (WT_CURSOR_JOIN *)cursor;
-
JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+err:
WT_TRET(__wt_schema_release_table(session, cjoin->table));
@@ -362,9 +362,9 @@ __curjoin_close(WT_CURSOR *cursor)
WT_TRET(cjoin->main->close(cjoin->main));
__wt_free(session, cjoin->entries);
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -500,7 +500,7 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
passed = (cmp < 0);
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, WT_CURJOIN_END_RANGE(end));
}
if (!passed) {
@@ -651,15 +651,17 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
if (iter != NULL && entry == iter->entry)
WT_ITEM_SET(v, iter->idxkey);
else {
- memset(&v, 0, sizeof(v)); /* Keep lint quiet. */
+ memset(&v, 0, sizeof(v)); /* Keep lint quiet. */
c = entry->main;
c->set_key(c, key);
entry->stats.main_access++;
if ((ret = c->search(c)) == 0)
ret = c->get_value(c, &v);
- else if (ret == WT_NOTFOUND)
- WT_ERR_MSG(session, WT_ERROR,
+ else if (ret == WT_NOTFOUND) {
+ __wt_err(session, ret,
"main table for join is missing entry");
+ ret = WT_ERROR;
+ }
WT_TRET(c->reset(c));
WT_ERR(ret);
}
@@ -801,7 +803,8 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
goto done;
WT_ERR(ret);
} else
- WT_ERR(__wt_illegal_value(session, NULL));
+ WT_PANIC_ERR(session, EINVAL,
+ "fatal error in join cursor position state");
}
collator = (entry->index == NULL) ? NULL : entry->index->collator;
while (ret == 0) {
@@ -1336,11 +1339,12 @@ __wt_curjoin_open(WT_SESSION_IMPL *session,
session, tablename, size, false, 0, &table));
WT_RET(__wt_calloc_one(session, &cjoin));
- cursor = &cjoin->iface;
+ cursor = (WT_CURSOR *)cjoin;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
cursor->key_format = table->key_format;
cursor->value_format = table->value_format;
+
cjoin->table = table;
/* Handle projections. */
diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c
index 87f8899d9c8..d4847d5a2ee 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_json.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_json.c
@@ -53,7 +53,7 @@ static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *,
(pv).type = 'K'; \
break; \
/* User format strings have already been validated. */ \
- WT_ILLEGAL_VALUE(session); \
+ WT_ILLEGAL_VALUE(session, (pv).type); \
} \
} while (0)
@@ -922,7 +922,7 @@ __wt_json_strncpy(WT_SESSION *wt_session,
case '\\':
*dst++ = ch;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, ch);
}
else
*dst++ = ch;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c
index 5c2fbd325f6..ca2163b2818 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_log.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_log.c
@@ -317,10 +317,11 @@ __curlog_close(WT_CURSOR *cursor)
WT_DECL_RET;
WT_SESSION_IMPL *session;
- CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
cl = (WT_CURSOR_LOG *)cursor;
- conn = S2C(session);
+ CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+err:
+ conn = S2C(session);
if (F_ISSET(cl, WT_CURLOG_ARCHIVE_LOCK)) {
(void)__wt_atomic_sub32(&conn->log_cursors, 1);
__wt_readunlock(session, &conn->log->log_archive_lock);
@@ -334,9 +335,9 @@ __curlog_close(WT_CURSOR *cursor)
__wt_free(session, cl->packed_key);
__wt_free(session, cl->packed_value);
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -375,22 +376,22 @@ __wt_curlog_open(WT_SESSION_IMPL *session,
WT_LOG *log;
WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);
- conn = S2C(session);
+ conn = S2C(session);
log = conn->log;
- cl = NULL;
+
WT_RET(__wt_calloc_one(session, &cl));
- cursor = &cl->iface;
+ cursor = (WT_CURSOR *)cl;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
+ cursor->key_format = WT_LOGC_KEY_FORMAT;
+ cursor->value_format = WT_LOGC_VALUE_FORMAT;
+
WT_ERR(__wt_calloc_one(session, &cl->cur_lsn));
WT_ERR(__wt_calloc_one(session, &cl->next_lsn));
WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
- cursor->key_format = WT_LOGC_KEY_FORMAT;
- cursor->value_format = WT_LOGC_VALUE_FORMAT;
-
WT_INIT_LSN(cl->cur_lsn);
WT_INIT_LSN(cl->next_lsn);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
index c584c9c5dc3..031001bbf80 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_metadata.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
@@ -553,16 +553,17 @@ __curmetadata_close(WT_CURSOR *cursor)
mdc = (WT_CURSOR_METADATA *)cursor;
c = mdc->file_cursor;
- CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, c == NULL ?
- NULL : ((WT_CURSOR_BTREE *)c)->btree);
+ CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close,
+ c == NULL ? NULL : ((WT_CURSOR_BTREE *)c)->btree);
+err:
if (c != NULL)
- ret = c->close(c);
+ WT_TRET(c->close(c));
if ((c = mdc->create_cursor) != NULL)
WT_TRET(c->close(c));
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -606,10 +607,9 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session,
WT_CONFIG_ITEM cval;
WT_RET(__wt_calloc_one(session, &mdc));
-
- cursor = &mdc->iface;
+ cursor = (WT_CURSOR *)mdc;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
cursor->key_format = "S";
cursor->value_format = "S";
@@ -625,7 +625,7 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session,
* We'll need some extra cursors to pull out column group information
* and chase "source" entries.
*/
- if (WT_STREQ(uri, "metadata:create")) {
+ if (strcmp(uri, "metadata:create") == 0) {
F_SET(mdc, WT_MDC_CREATEONLY);
WT_ERR(__wt_metadata_cursor_open(session, cfg[1],
&mdc->create_cursor));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
index 9cd0ee2c484..25d4b588d3b 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_stat.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -321,6 +321,7 @@ __curstat_close(WT_CURSOR *cursor)
cst = (WT_CURSOR_STAT *)cursor;
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+err:
if (cst->cfg != NULL) {
for (i = 0; cst->cfg[i] != NULL; ++i)
@@ -331,9 +332,9 @@ __curstat_close(WT_CURSOR *cursor)
__wt_buf_free(session, &cst->pv);
__wt_free(session, cst->desc_buf);
- WT_ERR(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -529,7 +530,7 @@ __wt_curstat_init(WT_SESSION_IMPL *session,
dsrc_uri = uri + strlen("statistics:");
- if (WT_STREQ(dsrc_uri, "join"))
+ if (strcmp(dsrc_uri, "join") == 0)
WT_RET(__curstat_join_init(session, curjoin, cfg, cst));
else if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:"))
@@ -595,9 +596,9 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
conn = S2C(session);
WT_RET(__wt_calloc_one(session, &cst));
- cursor = &cst->iface;
+ cursor = (WT_CURSOR *)cst;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
/*
* Statistics cursor configuration: must match (and defaults to), the
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index 766712c244c..ba00a474f02 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -708,6 +708,7 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri,
{
WT_CONFIG_ITEM cval;
WT_CURSOR *cursor;
+ WT_CURSOR_BTREE *cbt;
WT_DECL_RET;
uint64_t bucket, hash_value;
uint32_t overwrite_flag;
@@ -722,7 +723,7 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri,
/* Fast path overwrite configuration */
if (have_config && cfg[2] == NULL &&
- WT_STREQ(cfg[1], "overwrite=false")) {
+ strcmp(cfg[1], "overwrite=false") == 0) {
have_config = false;
overwrite_flag = 0;
} else
@@ -780,7 +781,7 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri,
bucket = hash_value % WT_HASH_ARRAY_SIZE;
TAILQ_FOREACH(cursor, &session->cursor_cache[bucket], q) {
if (cursor->uri_hash == hash_value &&
- WT_STREQ(cursor->uri, uri)) {
+ strcmp(cursor->uri, uri) == 0) {
if ((ret = cursor->reopen(cursor, false)) != 0) {
F_CLR(cursor, WT_CURSTD_CACHEABLE);
session->dhandle = NULL;
@@ -797,6 +798,15 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri,
F_CLR(cursor, WT_CURSTD_APPEND | WT_CURSTD_RAW |
WT_CURSTD_OVERWRITE);
F_SET(cursor, overwrite_flag);
+ /*
+ * If this is a btree cursor, clear its read_once flag.
+ */
+ if (WT_PREFIX_MATCH(cursor->internal_uri, "file:")) {
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ F_CLR(cbt, WT_CBT_READ_ONCE);
+ } else {
+ cbt = NULL;
+ }
if (have_config) {
/*
@@ -819,6 +829,15 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri,
session, cfg, "raw", 0, &cval));
if (cval.val != 0)
F_SET(cursor, WT_CURSTD_RAW);
+
+ if (cbt) {
+ WT_RET(__wt_config_gets_def(
+ session,
+ cfg, "read_once", 0, &cval));
+ if (cval.val != 0)
+ F_SET(cbt, WT_CBT_READ_ONCE);
+ }
+
}
WT_STAT_CONN_INCR(session, cursor_reopen);
@@ -835,7 +854,7 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri,
* __wt_cursor_close --
* WT_CURSOR->close default implementation.
*/
-int
+void
__wt_cursor_close(WT_CURSOR *cursor)
{
WT_SESSION_IMPL *session;
@@ -854,7 +873,6 @@ __wt_cursor_close(WT_CURSOR *cursor)
__wt_free(session, cursor->internal_uri);
__wt_free(session, cursor->uri);
__wt_overwrite_and_free(session, cursor);
- return (0);
}
/*
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
index 495209b7f9f..534c13e7831 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_table.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -808,6 +808,7 @@ __curtable_close(WT_CURSOR *cursor)
ctable = (WT_CURSOR_TABLE *)cursor;
JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+err:
if (ctable->cg_cursors != NULL)
for (i = 0, cp = ctable->cg_cursors;
@@ -841,9 +842,9 @@ __curtable_close(WT_CURSOR *cursor)
WT_TRET(__wt_schema_release_table(session, ctable->table));
/* The URI is owned by the table. */
cursor->internal_uri = NULL;
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -979,8 +980,6 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
WT_STATIC_ASSERT(offsetof(WT_CURSOR_TABLE, iface) == 0);
- ctable = NULL;
-
tablename = uri;
WT_PREFIX_SKIP_REQUIRED(session, tablename, "table:");
columns = strchr(tablename, '(');
@@ -1011,10 +1010,9 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
}
WT_RET(__wt_calloc_one(session, &ctable));
-
- cursor = &ctable->iface;
+ cursor = (WT_CURSOR *)ctable;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
cursor->internal_uri = table->iface.name;
cursor->key_format = table->key_format;
cursor->value_format = table->value_format;
diff --git a/src/third_party/wiredtiger/src/docs/config-strings.dox b/src/third_party/wiredtiger/src/docs/config-strings.dox
index a583573214f..d6291d5b4ba 100644
--- a/src/third_party/wiredtiger/src/docs/config-strings.dox
+++ b/src/third_party/wiredtiger/src/docs/config-strings.dox
@@ -44,7 +44,7 @@ columns in a table, values are nested using parentheses. For example:
All types of parentheses are treated equivalently by the parser.
-When an integer values is expected, the value may have multiplier characters
+When an integer value is expected, the value may have multiplier characters
appended, as follows:
<table>
diff --git a/src/third_party/wiredtiger/src/docs/error-handling.dox b/src/third_party/wiredtiger/src/docs/error-handling.dox
index fc5062b45a8..e25d3500790 100644
--- a/src/third_party/wiredtiger/src/docs/error-handling.dox
+++ b/src/third_party/wiredtiger/src/docs/error-handling.dox
@@ -74,6 +74,9 @@ This error is only generated when wiredtiger_open is configured to run in-memory
@par <code>WT_PREPARE_CONFLICT</code>
This error is generated when the application attempts to update an already updated record which is in prepared state. An updated record will be in prepared state, when the transaction that performed the update is in prepared state.
+@par <code>WT_TRY_SALVAGE</code>
+This error is generated when corruption is detected in an on-disk file. The application may choose to salvage the file or retry wiredtiger_open with the 'salvage=true' configuration setting.
+
@if IGNORE_BUILT_BY_API_ERR_END
@endif
diff --git a/src/third_party/wiredtiger/src/docs/statistics.dox b/src/third_party/wiredtiger/src/docs/statistics.dox
index 26c4b66fa40..19b7b17257b 100644
--- a/src/third_party/wiredtiger/src/docs/statistics.dox
+++ b/src/third_party/wiredtiger/src/docs/statistics.dox
@@ -156,9 +156,4 @@ currently open in the database, nor will any statistics requiring the
traversal of a tree (as if the \c statistics_fast configuration string
were set).
-A Python script that parses the default logging output and uses the
-<a href="http://www.gnuplot.info/">gnuplot</a>, utility to generate
-Portable Network Graphics (PNG) format graphs is included in the
-WiredTiger distribution in the file \c tools/statlog.py.
-
*/
diff --git a/src/third_party/wiredtiger/src/docs/top/main.dox b/src/third_party/wiredtiger/src/docs/top/main.dox
index e4de22ff042..d802443a9d8 100644
--- a/src/third_party/wiredtiger/src/docs/top/main.dox
+++ b/src/third_party/wiredtiger/src/docs/top/main.dox
@@ -6,12 +6,12 @@ WiredTiger is an high performance, scalable, production quality, NoSQL,
@section releases Releases
<table>
-@row{<b>WiredTiger 3.0.0</b> (current),
+@row{<b>WiredTiger 3.1.0</b> (current),
+ <a href="releases/wiredtiger-3.1.0.tar.bz2"><b>[Release package]</b></a>,
+ <a href="3.1.0/index.html"><b>[Documentation]</b></a>}
+@row{<b>WiredTiger 3.0.0</b> (previous),
<a href="releases/wiredtiger-3.0.0.tar.bz2"><b>[Release package]</b></a>,
<a href="3.0.0/index.html"><b>[Documentation]</b></a>}
-@row{<b>WiredTiger 2.9.3</b> (previous),
- <a href="releases/wiredtiger-2.9.3.tar.bz2"><b>[Release package]</b></a>,
- <a href="2.9.3/index.html"><b>[Documentation]</b></a>}
@row{<b>Development branch</b>,
<a href="https://github.com/wiredtiger/wiredtiger"><b>[Source code]</b></a>,
<a href="develop/index.html"><b>[Documentation]</b></a>}
diff --git a/src/third_party/wiredtiger/src/docs/upgrading.dox b/src/third_party/wiredtiger/src/docs/upgrading.dox
index 2e4990e8a33..0d8e5e1b428 100644
--- a/src/third_party/wiredtiger/src/docs/upgrading.dox
+++ b/src/third_party/wiredtiger/src/docs/upgrading.dox
@@ -1,5 +1,77 @@
/*! @page upgrading Upgrading WiredTiger applications
+</dl><hr>
+@section version_311 Upgrading to Version 3.1.1
+<dl>
+
+<dt>WT_CURSOR::modify transaction requirements</dt>
+<dd>
+In previous releases of WiredTiger, it was possible to use implicit
+transactions in combination with WT_CURSOR::modify operations. This
+requires applications be extraordinarily careful to avoid multiple
+threads which are changing the same values racing with each other. In
+the 3.1.1 release, WT_CURSOR::modify operations must be performed in an
+explicit transaction, and will fail if that's not the case.
+</dd>
+
+</dl><hr>
+@section version_310 Upgrading to Version 3.1.0
+<dl>
+
+<dt>WiredTiger on-disk log file format change</dt>
+<dd>
+The WiredTiger on-disk file format for write-ahead log files has changed
+as the log file version number was incremented. See
+<a href=https://jira.mongodb.org/browse/WT-4029>WT-4029</a> for details.
+</dd>
+
+<dt>::wiredtiger_open compatibility configuration changes</dt>
+<dd>
+The compatibility setting now takes additional options that can define
+the minimum or maximum required version of existing data files. See
+<a href=https://jira.mongodb.org/browse/WT-4056>WT-4056</a> and
+<a href=https://jira.mongodb.org/browse/WT-4098>WT-4098</a> for details.
+</dd>
+
+<dt>::wiredtiger_open cache configuration changes</dt>
+<dd>
+The cache configuration options \c eviction_checkpoint_target, \c
+eviction_dirty_target, \c eviction_dirty_trigger, \c eviction_target and \c
+eviction_trigger have changed. The options can now take an absolute size. It would
+be a percentage of the cache size if the value is within the range of 0 to 100
+or an absolute size when greater than 100. This API change is compatible with
+existing usage. See <a href=https://jira.mongodb.org/browse/WT-3632>WT-3632</a>
+for details.
+</dd>
+
+<dt>Changed transaction semantics around schema operations</dt>
+<dd>
+WiredTiger does not offer fully transactional create and drop operations.
+We have made some changes to how create and drop are implemented
+if done within the scope of an explicit transaction. If an application
+is relying on particular visibility/atomicity guarantees around table
+create or drop, care should be taken when upgrading. See
+<a href=https://jira.mongodb.org/browse/WT-3964>WT-3964</a> for details.
+</dd>
+
+<dt>On-disk format change for metadata</dt>
+<dd>
+There was a change to the content stored in the WiredTiger owned metadata
+files, which means metadata created or updated by this version of WiredTiger
+is not compatible with earlier versions. See
+<a href=https://jira.mongodb.org/browse/WT-3905>WT-3905</a> for details.
+</dd>
+
+<dt>Implement a per-session cursor cache</dt>
+<dd>
+WiredTiger now holds a cache of recently closed cursors in each
+session. This improves performance for applications that open and
+close cursors frequently, but increases memory overhead. The cache
+is enabled by default, but can be disabled. See
+<a href=https://jira.mongodb.org/browse/WT-1228>WT-1228</a> for details.
+</dd>
+
+</dl><hr>
@section version_300 Upgrading to Version 3.0.0
<dl>
@@ -28,16 +100,6 @@ The performance visualization tool \c wtstats has been removed and is
no longer supported.
</dd>
-<dt>::wiredtiger_open cache configuration changes</dt>
-<dd>
-The cache configuration options \c eviction_checkpoint_target, \c
-eviction_dirty_target, \c eviction_dirty_trigger, \c eviction_target and \c
-eviction_trigger have changed. The options can now take absolute size. It would
-be a percentage of the cache size if the value is within the range of 0 to 100
-or an absolute size when greater than 100. This API change is compatible with
-existing usage.
-</dd>
-
</dl><hr>
@section version_292 Upgrading to Version 2.9.2
<dl>
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index b6c6305d190..df88a64daa9 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -111,8 +111,11 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
case WT_SYNC_CLOSE:
/*
* Evict the page.
+ *
+ * Ensure the ref state is restored to the previous
+ * value if eviction fails.
*/
- WT_ERR(__wt_evict(session, ref, true));
+ WT_ERR(__wt_evict(session, ref, true, ref->state));
break;
case WT_SYNC_DISCARD:
/*
@@ -126,7 +129,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
break;
case WT_SYNC_CHECKPOINT:
case WT_SYNC_WRITE_LEAVES:
- WT_ERR(__wt_illegal_value(session, NULL));
+ WT_ERR(__wt_illegal_value(session, syncop));
break;
}
}
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 8396612b7ca..ff3772533ae 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -318,7 +318,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
WT_ERR(__evict_lru_pages(session, false));
if (0) {
-err: WT_PANIC_MSG(session, ret, "cache eviction thread error");
+err: WT_PANIC_RET(session, ret, "cache eviction thread error");
}
return (ret);
}
@@ -357,7 +357,7 @@ __wt_evict_thread_stop(WT_SESSION_IMPL *session, WT_THREAD *thread)
WT_VERB_EVICTSERVER, "%s", "cache eviction thread exiting");
if (0) {
-err: WT_PANIC_MSG(session, ret, "cache eviction thread error");
+err: WT_PANIC_RET(session, ret, "cache eviction thread error");
}
return (ret);
}
@@ -451,7 +451,7 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
"Cache stuck for too long, giving up");
WT_RET(__wt_verbose_dump_txn(session));
WT_RET(__wt_verbose_dump_cache(session));
- return (ETIMEDOUT);
+ return (__wt_set_return(session, ETIMEDOUT));
#else
if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
WT_RET(__wt_verbose_dump_txn(session));
@@ -563,7 +563,7 @@ __evict_update_work(WT_SESSION_IMPL *session)
conn = S2C(session);
cache = conn->cache;
- dirty_target = cache->eviction_dirty_target;
+ dirty_target = __wt_eviction_dirty_target(cache);
dirty_trigger = cache->eviction_dirty_trigger;
target = cache->eviction_target;
trigger = cache->eviction_trigger;
@@ -628,11 +628,10 @@ __evict_update_work(WT_SESSION_IMPL *session)
* (3) the cache is more than half way from the dirty target to the
* dirty trigger.
*/
- if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) &&
- (__wt_cache_stuck(session) ||
+ if (__wt_cache_stuck(session) ||
(__wt_cache_lookaside_score(cache) > 80 &&
dirty_inuse >
- (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200)))
+ (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200))
F_SET(cache, WT_CACHE_EVICT_LOOKASIDE);
/*
@@ -810,10 +809,8 @@ __evict_clear_walk(WT_SESSION_IMPL *session)
cache = S2C(session)->cache;
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_PASS));
- if (session->dhandle == cache->walk_tree) {
+ if (session->dhandle == cache->walk_tree)
cache->walk_tree = NULL;
- cache->walk_target = 0;
- }
if ((ref = btree->evict_ref) == NULL)
return (0);
@@ -1260,7 +1257,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
queue->evict_current = NULL;
entries = queue->evict_entries;
- qsort(queue->evict_queue,
+ __wt_qsort(queue->evict_queue,
entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
/* Trim empty entries from the end. */
@@ -1414,10 +1411,8 @@ retry: while (slot < max_entries) {
*/
if ((dhandle = cache->walk_tree) != NULL)
cache->walk_tree = NULL;
- else {
+ else
dhandle = TAILQ_FIRST(&conn->dhqh);
- cache->walk_target = 0;
- }
} else {
if (incr) {
WT_ASSERT(session, dhandle->session_inuse > 0);
@@ -1427,7 +1422,6 @@ retry: while (slot < max_entries) {
cache->walk_tree = NULL;
}
dhandle = TAILQ_NEXT(dhandle, q);
- cache->walk_target = 0;
}
/* If we reach the end of the list, we're done. */
@@ -1448,7 +1442,7 @@ retry: while (slot < max_entries) {
* Skip files that are checkpointing if we are only looking for
* dirty pages.
*/
- if (btree->checkpointing != WT_CKPT_OFF &&
+ if (WT_BTREE_SYNCING(btree) &&
!F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
continue;
@@ -1601,8 +1595,7 @@ __evict_push_candidate(WT_SESSION_IMPL *session,
* Calculate how many pages to queue for a given tree.
*/
static uint32_t
-__evict_walk_target(
- WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_entries)
+__evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
{
WT_CACHE *cache;
uint64_t btree_inuse, bytes_per_slot, cache_inuse;
@@ -1611,7 +1604,7 @@ __evict_walk_target(
cache = S2C(session)->cache;
target_pages_clean = target_pages_dirty = 0;
- total_slots = max_entries - queue->evict_entries;
+ total_slots = max_entries;
/*
* The number of times we should fill the queue by the end of
@@ -1700,7 +1693,7 @@ __evict_walk_tree(WT_SESSION_IMPL *session,
WT_PAGE *last_parent, *page;
WT_REF *ref;
uint64_t min_pages, pages_seen, pages_queued, refs_walked;
- uint32_t remaining_slots, target_pages, walk_flags;
+ uint32_t read_flags, remaining_slots, target_pages, walk_flags;
int restarts;
bool give_up, modified, urgent_queued;
@@ -1717,14 +1710,13 @@ __evict_walk_tree(WT_SESSION_IMPL *session,
*/
start = queue->evict_queue + *slotp;
remaining_slots = max_entries - *slotp;
- if (cache->walk_target != 0) {
- WT_ASSERT(session, cache->walk_progress <= cache->walk_target);
- target_pages = cache->walk_target - cache->walk_progress;
- } else {
- target_pages = cache->walk_target =
- __evict_walk_target(session, queue, max_entries);
- cache->walk_progress = 0;
+ if (btree->evict_walk_progress >= btree->evict_walk_target) {
+ btree->evict_walk_target =
+ __evict_walk_target(session, max_entries);
+ btree->evict_walk_progress = 0;
}
+ target_pages = WT_MIN(btree->evict_walk_target / QUEUE_FILLS_PER_PASS,
+ btree->evict_walk_target - btree->evict_walk_progress);
if (target_pages > remaining_slots)
target_pages = remaining_slots;
@@ -1797,10 +1789,13 @@ __evict_walk_tree(WT_SESSION_IMPL *session,
FLD_SET(walk_flags, WT_READ_PREV);
/* FALLTHROUGH */
case WT_EVICT_WALK_RAND_NEXT:
+ read_flags = WT_READ_CACHE | WT_READ_NO_EVICT |
+ WT_READ_NO_GEN | WT_READ_NO_WAIT |
+ WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK;
if (btree->evict_ref == NULL) {
/* Ensure internal pages indexes remain valid */
WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(
- session, &btree->evict_ref, true));
+ session, &btree->evict_ref, read_flags));
WT_RET_NOTFOUND_OK(ret);
}
break;
@@ -1915,7 +1910,7 @@ __evict_walk_tree(WT_SESSION_IMPL *session,
continue;
/* Don't queue dirty pages in trees during checkpoints. */
- if (modified && btree->checkpointing != WT_CKPT_OFF)
+ if (modified && WT_BTREE_SYNCING(btree))
continue;
/*
@@ -2017,7 +2012,7 @@ fast: /* If the page can't be evicted, give up. */
continue;
++evict;
++pages_queued;
- ++cache->walk_progress;
+ ++btree->evict_walk_progress;
__wt_verbose(session, WT_VERB_EVICTSERVER,
"select: %p, size %" WT_SIZET_FMT,
@@ -2092,8 +2087,8 @@ fast: /* If the page can't be evicted, give up. */
* Get a page for eviction.
*/
static int
-__evict_get_ref(
- WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_REF **refp)
+__evict_get_ref(WT_SESSION_IMPL *session,
+ bool is_server, WT_BTREE **btreep, WT_REF **refp, uint32_t *previous_statep)
{
WT_CACHE *cache;
WT_EVICT_ENTRY *evict;
@@ -2102,6 +2097,11 @@ __evict_get_ref(
bool is_app, server_only, urgent_ok;
*btreep = NULL;
+ /*
+ * It is polite to initialize output variables, but it isn't safe for
+ * callers to use the previous state if we don't return a locked ref.
+ */
+ *previous_statep = WT_REF_MEM;
*refp = NULL;
cache = S2C(session)->cache;
@@ -2239,6 +2239,7 @@ __evict_get_ref(
*btreep = evict->btree;
*refp = evict->ref;
+ *previous_statep = previous_state;
/*
* Remove the entry so we never try to reconcile the same page
@@ -2273,11 +2274,13 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
WT_REF *ref;
WT_TRACK_OP_DECL;
uint64_t time_start, time_stop;
+ uint32_t previous_state;
bool app_timer;
WT_TRACK_OP_INIT(session);
- WT_RET_TRACK(__evict_get_ref(session, is_server, &btree, &ref));
+ WT_RET_TRACK(__evict_get_ref(
+ session, is_server, &btree, &ref, &previous_state));
WT_ASSERT(session, ref->state == WT_REF_LOCKED);
app_timer = false;
@@ -2316,7 +2319,8 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
*/
__wt_cache_read_gen_bump(session, ref->page);
- WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false));
+ WT_WITH_BTREE(session, btree,
+ ret = __wt_evict(session, ref, false, previous_state));
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
@@ -2345,7 +2349,8 @@ __wt_cache_eviction_worker(
WT_TRACK_OP_DECL;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
- uint64_t initial_progress, max_progress, time_start, time_stop;
+ uint64_t elapsed, time_start, time_stop;
+ uint64_t initial_progress, max_progress;
bool timer;
WT_TRACK_OP_INIT(session);
@@ -2367,8 +2372,7 @@ __wt_cache_eviction_worker(
__wt_evict_server_wake(session);
/* Track how long application threads spend doing eviction. */
- timer =
- WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL);
+ timer = !F_ISSET(session, WT_SESSION_INTERNAL);
if (timer)
time_start = __wt_clock(session);
@@ -2405,22 +2409,10 @@ __wt_cache_eviction_worker(
/* See if eviction is still needed. */
if (!__wt_eviction_needed(session, busy, readonly, &pct_full) ||
- ((pct_full < 100.0 || cache->eviction_scrub_limit > 0.0) &&
- (cache->eviction_progress >
+ (pct_full < 100.0 && (cache->eviction_progress >
initial_progress + max_progress)))
break;
- /*
- * Don't make application threads participate in scrubbing for
- * checkpoints. Just throttle updates instead.
- */
- if (WT_EVICT_HAS_WORKERS(session) &&
- cache->eviction_scrub_limit > 0.0 &&
- !F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD)) {
- __wt_yield();
- continue;
- }
-
/* Evict a page. */
switch (ret = __evict_page(session, false)) {
case 0:
@@ -2438,13 +2430,26 @@ __wt_cache_eviction_worker(
default:
goto err;
}
+ /* Stop if we've exceeded the time out. */
+ if (timer && cache->cache_max_wait_us != 0) {
+ time_stop = __wt_clock(session);
+ if (session->cache_wait_us +
+ WT_CLOCKDIFF_US(time_stop, time_start) >
+ cache->cache_max_wait_us)
+ goto err;
+ }
}
err: if (timer) {
time_stop = __wt_clock(session);
- WT_STAT_CONN_INCRV(session,
- application_cache_time,
- WT_CLOCKDIFF_US(time_stop, time_start));
+ elapsed = WT_CLOCKDIFF_US(time_stop, time_start);
+ WT_STAT_CONN_INCRV(session, application_cache_time, elapsed);
+ session->cache_wait_us += elapsed;
+ if (cache->cache_max_wait_us != 0 &&
+ session->cache_wait_us > cache->cache_max_wait_us) {
+ WT_TRET(WT_CACHE_FULL);
+ WT_STAT_CONN_INCR(session, cache_timed_out_ops);
+ }
}
done: WT_TRACK_OP_END(session);
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 0daccdf5b1c..44c3bbb8f78 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -17,11 +17,12 @@ static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool, bool *);
* Release exclusive access to a page.
*/
static inline void
-__evict_exclusive_clear(WT_SESSION_IMPL *session, WT_REF *ref)
+__evict_exclusive_clear(
+ WT_SESSION_IMPL *session, WT_REF *ref, uint32_t previous_state)
{
WT_ASSERT(session, ref->state == WT_REF_LOCKED && ref->page != NULL);
- ref->state = WT_REF_MEM;
+ ref->state = previous_state;
}
/*
@@ -42,7 +43,7 @@ __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref)
WT_STAT_DATA_INCR(session, cache_eviction_hazard);
WT_STAT_CONN_INCR(session, cache_eviction_hazard);
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
}
/*
@@ -56,21 +57,27 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
WT_DECL_RET;
WT_PAGE *page;
uint64_t time_start, time_stop;
+ uint32_t previous_state;
bool locked, too_big;
btree = S2BT(session);
+ locked = false;
page = ref->page;
time_start = __wt_clock(session);
/*
- * Take some care with order of operations: if we release the hazard
- * reference without first locking the page, it could be evicted in
- * between.
+ * This function always releases the hazard pointer - ensure that's
+ * done regardless of whether we can get exclusive access. Take some
+ * care with order of operations: if we release the hazard pointer
+ * without first locking the page, it could be evicted in between.
*/
- locked = __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED);
+ previous_state = ref->state;
+ if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) &&
+ __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED))
+ locked = true;
if ((ret = __wt_hazard_clear(session, ref)) != 0 || !locked) {
if (locked)
- ref->state = WT_REF_MEM;
+ ref->state = previous_state;
return (ret == 0 ? EBUSY : ret);
}
@@ -82,7 +89,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
* Track how long the call to evict took. If eviction is successful then
* we have one of two pairs of stats to increment.
*/
- ret = __wt_evict(session, ref, false);
+ ret = __wt_evict(session, ref, false, previous_state);
time_stop = __wt_clock(session);
if (ret == 0) {
if (too_big) {
@@ -116,7 +123,8 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
* Evict a page.
*/
int
-__wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
+__wt_evict(WT_SESSION_IMPL *session,
+ WT_REF *ref, bool closing, uint32_t previous_state)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
@@ -223,7 +231,8 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
if (0) {
err: if (!closing)
- __evict_exclusive_clear(session, ref);
+ __evict_exclusive_clear(
+ session, ref, previous_state);
WT_STAT_CONN_INCR(session, cache_eviction_fail);
WT_STAT_DATA_INCR(session, cache_eviction_fail);
@@ -299,6 +308,18 @@ __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
WT_DECL_RET;
/*
+ * Before discarding a page, assert that all updates are globally
+ * visible unless the tree is closing, dead, or we're evicting with
+ * history in lookaside.
+ */
+ WT_ASSERT(session,
+ closing || ref->page->modify == NULL ||
+ F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
+ (ref->page_las != NULL && ref->page_las->eviction_to_lookaside) ||
+ __wt_txn_visible_all(session, ref->page->modify->rec_max_txn,
+ WT_TIMESTAMP_NULL(&ref->page->modify->rec_max_timestamp)));
+
+ /*
* Discard the page and update the reference structure. If evicting a
* WT_REF_LIMBO page with active history, transition back to
* WT_REF_LOOKASIDE. Otherwise, a page with a disk address is an
@@ -424,7 +445,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
}
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, mod->rec_result);
}
return (0);
@@ -438,6 +459,7 @@ static int
__evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
{
WT_REF *child;
+ bool active;
WT_INTL_FOREACH_BEGIN(session, parent->page, child) {
switch (child->state) {
@@ -445,15 +467,27 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
break;
case WT_REF_DELETED: /* Deleted */
/*
- * If the page was part of a truncate, transaction
- * rollback might switch this page into its previous
- * state at any time, so the delete must be resolved.
- * We don't have to lock the page, as no thread of
- * control can be running below our locked internal
- * page.
+ * If the child page was part of a truncate,
+ * transaction rollback might switch this page into its
+ * previous state at any time, so the delete must be
+ * resolved before the parent can be evicted.
+ *
+ * We have the internal page locked, which prevents a
+ * search from descending into it. However, a walk
+ * from an adjacent leaf page could attempt to hazard
+ * couple into a child page and free the page_del
+ * structure as we are examining it. Flip the state to
+ * locked to make this check safe: if that fails, we
+ * have raced with a read and should give up on
+ * evicting the parent.
*/
- if (__wt_page_del_active(session, child, true))
- return (EBUSY);
+ if (!__wt_atomic_casv32(
+ &child->state, WT_REF_DELETED, WT_REF_LOCKED))
+ return (__wt_set_return(session, EBUSY));
+ active = __wt_page_del_active(session, child, true);
+ child->state = WT_REF_DELETED;
+ if (active)
+ return (__wt_set_return(session, EBUSY));
break;
case WT_REF_LOOKASIDE:
/*
@@ -461,10 +495,10 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
* can be ignored.
*/
if (__wt_page_las_active(session, child))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
break;
default:
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
}
} WT_INTL_FOREACH_END;
@@ -492,7 +526,7 @@ __evict_review(
conn = S2C(session);
page = ref->page;
flags = WT_REC_EVICT;
- if (!WT_SESSION_IS_CHECKPOINT(session))
+ if (!WT_SESSION_BTREE_SYNC(session))
LF_SET(WT_REC_VISIBLE_ALL);
/*
@@ -528,7 +562,7 @@ __evict_review(
* should be uncommon - we don't add clean pages to the queue.
*/
if (F_ISSET(conn, WT_CONN_IN_MEMORY) && !modified && !closing)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/* Check if the page can be evicted. */
if (!closing) {
@@ -541,14 +575,14 @@ __evict_review(
session, WT_TXN_OLDEST_STRICT));
if (!__wt_page_can_evict(session, ref, inmem_splitp))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
- * Check for an append-only workload needing an in-memory
- * split; we can't do this earlier because in-memory splits
- * require exclusive access. If an in-memory split completes,
- * the page stays in memory and the tree is left in the desired
- * state: avoid the usual cleanup.
+ * Check for an append-only workload needing an in-memory split;
+ * we can't do this earlier because in-memory splits require
+ * exclusive access. If an in-memory split completes, the page
+ * stays in memory and the tree is left in the desired state:
+ * avoid the usual cleanup.
*/
if (*inmem_splitp)
return (__wt_split_insert(session, ref));
@@ -563,7 +597,7 @@ __evict_review(
* eviction that writes to lookaside), give up.
*/
if (F_ISSET(session, WT_SESSION_NO_RECONCILE))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* If the page is dirty, reconcile it to decide if we can evict it.
@@ -606,7 +640,7 @@ __evict_review(
if (F_ISSET(conn, WT_CONN_IN_MEMORY))
LF_SET(WT_REC_IN_MEMORY |
WT_REC_SCRUB | WT_REC_UPDATE_RESTORE);
- else if (WT_SESSION_IS_CHECKPOINT(session))
+ else if (WT_SESSION_BTREE_SYNC(session))
LF_SET(WT_REC_LOOKASIDE);
else if (!WT_IS_METADATA(session->dhandle)) {
LF_SET(WT_REC_UPDATE_RESTORE);
@@ -619,7 +653,8 @@ __evict_review(
* that can't be evicted, check if reconciliation
* suggests trying the lookaside table.
*/
- if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE))
+ if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE) &&
+ !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE))
lookaside_retryp = &lookaside_retry;
}
}
@@ -633,10 +668,10 @@ __evict_review(
* to evict. Give up evicting in that case: checkpoint will include
* the reconciled page when it visits the parent.
*/
- if (WT_SESSION_IS_CHECKPOINT(session) && !__wt_page_is_modified(page) &&
+ if (WT_SESSION_BTREE_SYNC(session) && !__wt_page_is_modified(page) &&
!__wt_txn_visible_all(session, page->modify->rec_max_txn,
WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp)))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* If reconciliation fails but reports it might succeed if we use the
@@ -661,23 +696,16 @@ __evict_review(
* very unlikely. However, since checkpoint is partway through
* reconciling the parent page, a split can corrupt the checkpoint.
*/
- if (WT_SESSION_IS_CHECKPOINT(session) &&
+ if (WT_SESSION_BTREE_SYNC(session) &&
page->modify->rec_result == WT_PM_REC_MULTIBLOCK)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
- * Success: assert the page is clean or reconciliation was configured
- * for update/restore. If the page is clean, assert that reconciliation
- * was configured for a lookaside table, or it's not a durable object
- * (currently the lookaside table), or all page updates were globally
- * visible.
+ * Success: assert that the page is clean or reconciliation was
+ * configured to save updates.
*/
WT_ASSERT(session, !__wt_page_is_modified(page) ||
LF_ISSET(WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE));
- WT_ASSERT(session,
- __wt_page_is_modified(page) ||
- __wt_txn_visible_all(session, page->modify->rec_max_txn,
- WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp)));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index ca2176fcf0e..1c22c99a11c 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -48,6 +48,9 @@
WT_TRACK_OP_INIT(s); \
WT_SINGLE_THREAD_CHECK_START(s); \
WT_ERR(WT_SESSION_CHECK_PANIC(s)); \
+ /* Reset wait time if this isn't an API reentry. */ \
+ if (__oldname == NULL) \
+ (s)->cache_wait_us = 0; \
__wt_verbose((s), WT_VERB_API, "%s", "CALL: " #h ":" #n)
#define API_CALL_NOCONF(s, h, n, dh) do { \
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 33e382feba2..f4a6871e9a6 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -205,8 +205,8 @@ struct __wt_ovfl_reuse {
* this way so that overall the lookaside table is append-mostly), a counter
* (used to ensure the update records remain in the original order), and the
* record's key (byte-string for row-store, record number for column-store).
- * The value is the WT_UPDATE structure's transaction ID, timestamp, update
- * type and value.
+ * The value is the WT_UPDATE structure's transaction ID, timestamp, update's
+ * prepare state, update type and value.
*
* As the key for the lookaside table is different for row- and column-store, we
* store both key types in a WT_ITEM, building/parsing them in the code, because
@@ -223,7 +223,7 @@ struct __wt_ovfl_reuse {
#endif
#define WT_LAS_CONFIG \
"key_format=" WT_UNCHECKED_STRING(QIQu) \
- ",value_format=" WT_UNCHECKED_STRING(QuBu) \
+ ",value_format=" WT_UNCHECKED_STRING(QuBBu) \
",block_compressor=" WT_LOOKASIDE_COMPRESSOR \
",leaf_value_max=64MB" \
",prefix_compression=true"
@@ -234,14 +234,12 @@ struct __wt_ovfl_reuse {
*/
struct __wt_page_lookaside {
uint64_t las_pageid; /* Page ID in lookaside */
- uint64_t las_max_txn; /* Max transaction ID in lookaside */
- uint64_t las_min_txn; /* Min transaction ID in lookaside */
- WT_DECL_TIMESTAMP(min_timestamp)/* Min timestamp in lookaside */
- /* Max timestamp on page */
- WT_DECL_TIMESTAMP(onpage_timestamp)
+ uint64_t max_txn; /* Maximum transaction ID */
+ uint64_t unstable_txn; /* First transaction ID not on page */
+ WT_DECL_TIMESTAMP(max_timestamp)/* Maximum timestamp */
+ WT_DECL_TIMESTAMP(unstable_timestamp)/* First timestamp not on page */
bool eviction_to_lookaside; /* Revert to lookaside on eviction */
- bool las_skew_newest; /* On-page skewed to newest */
- bool invalid; /* History is required correct reads */
+ bool skew_newest; /* Page image has newest versions */
};
/*
@@ -270,6 +268,9 @@ struct __wt_page_modify {
uint64_t rec_max_txn;
WT_DECL_TIMESTAMP(rec_max_timestamp)
+ /* Stable timestamp at last reconciliation. */
+ WT_DECL_TIMESTAMP(last_stable_timestamp)
+
/* The largest update transaction ID (approximate). */
uint64_t update_txn;
@@ -481,7 +482,9 @@ struct __wt_page_modify {
#define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */
uint8_t rec_result; /* Reconciliation state */
- uint8_t update_restored; /* Page created by restoring updates */
+#define WT_PAGE_RS_LOOKASIDE 0x1
+#define WT_PAGE_RS_RESTORED 0x2
+ uint8_t restore_state; /* Created by restoring updates */
};
/*
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index 96f6309aba4..593745cc315 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -95,6 +95,7 @@ struct __wt_btree {
uint32_t maxleafkey; /* Leaf page max key size */
uint32_t maxleafvalue; /* Leaf page max value size */
uint64_t maxmempage; /* In-memory page max size */
+ uint32_t maxmempage_image; /* In-memory page image max size */
uint64_t splitmempage; /* In-memory split trigger size */
/* AUTOMATIC FLAG VALUE GENERATION START */
@@ -129,6 +130,16 @@ struct __wt_btree {
int split_pct; /* Split page percent */
WT_COMPRESSOR *compressor; /* Page compressor */
+ /*
+ * When doing compression, the pre-compression in-memory byte size is
+ * optionally adjusted based on previous compression results.
+ * It's an 8B value because it's updated without a lock.
+ */
+ bool leafpage_compadjust; /* Run-time compression adjustment */
+ uint64_t maxleafpage_precomp; /* Leaf page pre-compression size */
+ bool intlpage_compadjust; /* Run-time compression adjustment */
+ uint64_t maxintlpage_precomp; /* Internal page pre-compression size */
+
WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */
WT_RWLOCK ovfl_lock; /* Overflow lock */
@@ -154,11 +165,30 @@ struct __wt_btree {
WT_DECL_TIMESTAMP(rec_max_timestamp)
uint64_t checkpoint_gen; /* Checkpoint generation */
+ WT_SESSION_IMPL *sync_session; /* Syncing session */
volatile enum {
- WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING
- } checkpointing; /* Checkpoint in progress */
+ WT_BTREE_SYNC_OFF, WT_BTREE_SYNC_WAIT, WT_BTREE_SYNC_RUNNING
+ } syncing; /* Sync status */
- uint64_t bytes_inmem; /* Cache bytes in memory. */
+ /*
+ * Helper macros:
+ * WT_BTREE_SYNCING indicates if a sync is active (either waiting to
+ * start or already running), so no new operations should start that
+ * would conflict with the sync.
+ * WT_SESSION_BTREE_SYNC indicates if the session is performing a sync
+ * on its current tree.
+ * WT_SESSION_BTREE_SYNC_SAFE checks whether it is safe to perform an
+ * operation that would conflict with a sync.
+ */
+#define WT_BTREE_SYNCING(btree) \
+ (btree->syncing != WT_BTREE_SYNC_OFF)
+#define WT_SESSION_BTREE_SYNC(session) \
+ (S2BT(session)->sync_session == session)
+#define WT_SESSION_BTREE_SYNC_SAFE(session, btree) \
+ ((btree)->syncing != WT_BTREE_SYNC_RUNNING || \
+ (btree)->sync_session == session)
+
+ uint64_t bytes_inmem; /* Cache bytes in memory. */
uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */
uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */
@@ -181,6 +211,8 @@ struct __wt_btree {
*/
WT_REF *evict_ref; /* Eviction thread's location */
uint64_t evict_priority; /* Relative priority of cached pages */
+ uint32_t evict_walk_progress;/* Eviction walk progress */
+ uint32_t evict_walk_target; /* Eviction walk target */
u_int evict_walk_period; /* Skip this many LRU walks */
u_int evict_walk_saved; /* Saved walk skips for checkpoints */
u_int evict_walk_skips; /* Number of walks skipped */
@@ -207,11 +239,12 @@ struct __wt_btree {
#define WT_BTREE_LOOKASIDE 0x002000u /* Look-aside table */
#define WT_BTREE_NO_CHECKPOINT 0x004000u /* Disable checkpoints */
#define WT_BTREE_NO_LOGGING 0x008000u /* Disable logging */
-#define WT_BTREE_REBALANCE 0x010000u /* Handle is for rebalance */
-#define WT_BTREE_SALVAGE 0x020000u /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x040000u /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x080000u /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x100000u /* Handle is for verify */
+#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */
+#define WT_BTREE_REBALANCE 0x020000u /* Handle is for rebalance */
+#define WT_BTREE_SALVAGE 0x040000u /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x080000u /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x100000u /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x200000u /* Handle is for verify */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 81c166eb0e4..41d843790e8 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1158,8 +1158,7 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
* Return if a truncate operation is active.
*/
static inline bool
-__wt_page_del_active(
- WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
+__wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
WT_PAGE_DELETED *page_del;
uint8_t prepare_state;
@@ -1190,10 +1189,10 @@ __wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref)
if ((page_las = ref->page_las) == NULL)
return (false);
- if (page_las->invalid || !ref->page_las->las_skew_newest)
+ if (!page_las->skew_newest)
return (true);
- if (__wt_txn_visible_all(session, page_las->las_max_txn,
- WT_TIMESTAMP_NULL(&page_las->onpage_timestamp)))
+ if (__wt_txn_visible_all(session, page_las->max_txn,
+ WT_TIMESTAMP_NULL(&page_las->max_timestamp)))
return (false);
return (true);
@@ -1217,9 +1216,8 @@ __wt_btree_can_evict_dirty(WT_SESSION_IMPL *session)
WT_BTREE *btree;
btree = S2BT(session);
- return ((btree->checkpointing == WT_CKPT_OFF &&
- !F_ISSET(S2C(session), WT_CONN_CLOSING_TIMESTAMP)) ||
- WT_SESSION_IS_CHECKPOINT(session));
+ return ((!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) &&
+ !F_ISSET(S2C(session), WT_CONN_CLOSING_TIMESTAMP));
}
/*
@@ -1238,6 +1236,14 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
btree = S2BT(session);
/*
+ * Checkpoints can't do in-memory splits in the tree they are walking:
+ * that can lead to corruption when the parent internal page is
+ * updated.
+ */
+ if (WT_SESSION_BTREE_SYNC(session))
+ return (false);
+
+ /*
* Only split a page once, otherwise workloads that update in the middle
* of the page could continually split without benefit.
*/
@@ -1329,6 +1335,7 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
static inline bool
__wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page)
{
+ WT_DECL_TIMESTAMP(pinned_ts)
WT_PAGE_MODIFY *mod;
WT_TXN_GLOBAL *txn_global;
@@ -1338,7 +1345,8 @@ __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page)
* If the page hasn't been through one round of update/restore, give it
* a try.
*/
- if ((mod = page->modify) == NULL || !mod->update_restored)
+ if ((mod = page->modify) == NULL ||
+ !FLD_ISSET(mod->restore_state, WT_PAGE_RS_RESTORED))
return (true);
/*
@@ -1356,18 +1364,12 @@ __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page)
return (true);
#ifdef HAVE_TIMESTAMPS
- {
- bool same_timestamp;
-
- same_timestamp = false;
- if (!__wt_timestamp_iszero(&mod->last_eviction_timestamp))
- WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
- same_timestamp = __wt_timestamp_cmp(
- &mod->last_eviction_timestamp,
- &txn_global->pinned_timestamp) == 0);
- if (!same_timestamp)
+ if (__wt_timestamp_iszero(&mod->last_eviction_timestamp))
+ return (true);
+
+ __wt_txn_pinned_timestamp(session, &pinned_ts);
+ if (__wt_timestamp_cmp(&pinned_ts, &mod->last_eviction_timestamp) > 0)
return (true);
- }
#endif
return (false);
@@ -1509,7 +1511,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
if (!__wt_page_evict_clean(page) &&
(LF_ISSET(WT_READ_NO_SPLIT) || (!inmem_split &&
F_ISSET(session, WT_SESSION_NO_RECONCILE)))) {
- if (!WT_SESSION_IS_CHECKPOINT(session))
+ if (!WT_SESSION_BTREE_SYNC(session))
WT_IGNORE_RET(
__wt_page_evict_urgent(session, ref));
} else {
@@ -1605,6 +1607,8 @@ __wt_split_descent_race(
* update. A thread can read the parent page's original page index and
* then read the split page's replacement index.
*
+ * For example, imagine a search descending the tree.
+ *
* Because internal page splits work by truncating the original page to
* the initial part of the original page, the result of this race is we
* will have a search key that points past the end of the current page.
@@ -1649,73 +1653,17 @@ __wt_split_descent_race(
* work by truncating the split page, so the split page search is for
* content the split page retains after the split, and we ignore this
* race.
- */
- WT_INTL_INDEX_GET(session, ref->home, pindex);
- return (pindex != saved_pindex);
-}
-
-/*
- * __wt_split_prev_race --
- * Return if we raced with an internal page split when moving backwards
- * through the tree.
- */
-static inline bool
-__wt_split_prev_race(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_PAGE_INDEX *pindex;
-
- /*
- * There's a split race when a cursor moving backwards through the tree
- * descends the tree. If we're splitting an internal page into its
- * parent, we move the WT_REF structures and update the parent's page
- * index before updating the split page's page index, and it's not an
- * atomic update. A thread can read the parent and split page's original
- * indexes during a split, or read the parent page's replacement page
- * index and then read the split page's original index, either of which
- * can lead to skipping pages.
*
- * For example, imagine an internal page with 3 child pages, with the
- * namespaces a-f, g-h and i-j; the first child page splits. The parent
- * starts out with the following page-index:
- *
- * | ... | a | g | i | ... |
- *
- * The split page starts out with the following page-index:
- *
- * | a | b | c | d | e | f |
- *
- * The first step is to move the c-f ranges into a new subtree, so, for
- * example we might have two new internal pages 'c' and 'e', where the
- * new 'c' page references the c-d namespace and the new 'e' page
- * references the e-f namespace. The top of the subtree references the
- * parent page, but until the parent's page index is updated, threads in
- * the subtree won't be able to ascend out of the subtree. However, once
- * the parent page's page index is updated to this:
- *
- * | ... | a | c | e | g | i | ... |
- *
- * threads in the subtree can ascend into the parent. Imagine a cursor
- * in the c-d part of the namespace that ascends to the parent's 'c'
- * slot. It would then decrement to the slot before the 'c' slot, the
- * 'a' slot.
- *
- * The previous-cursor movement selects the last slot in the 'a' page;
- * if the split page's page-index hasn't been updated yet, it selects
- * the 'f' slot, which is incorrect. Once the split page's page index is
- * updated to this:
+ * This code is a general purpose check for a descent race and we call
+ * it in other cases, for example, a cursor traversing backwards through
+ * the tree.
*
- * | a | b |
- *
- * the previous-cursor movement will select the 'b' slot, which is
- * correct.
- *
- * This function takes an argument which is the internal page into which
- * we're coupling. If the last slot on the page no longer points to
- * the current page as its "home", the page is being split and part of
- * its namespace moved, we have to restart.
+ * Presumably we acquired a page index on the child page before calling
+ * this code, don't re-order that acquisition with this check.
*/
- WT_INTL_INDEX_GET(session, ref->page, pindex);
- return (pindex->index[pindex->entries - 1]->home != ref->page);
+ WT_BARRIER();
+ WT_INTL_INDEX_GET(session, ref->home, pindex);
+ return (pindex != saved_pindex);
}
/*
@@ -1724,10 +1672,10 @@ __wt_split_prev_race(WT_SESSION_IMPL *session, WT_REF *ref)
* coupling up/down the tree.
*/
static inline int
-__wt_page_swap_func(WT_SESSION_IMPL *session,
- WT_REF *held, WT_REF *want, bool prev_race, uint32_t flags
+__wt_page_swap_func(
+ WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
+ , const char *func, int line
#endif
)
{
@@ -1750,23 +1698,11 @@ __wt_page_swap_func(WT_SESSION_IMPL *session,
/* Get the wanted page. */
ret = __wt_page_in_func(session, want, flags
#ifdef HAVE_DIAGNOSTIC
- , file, line
+ , func, line
#endif
);
/*
- * We can race when descending into an internal page as part of moving
- * backwards through the tree, and we have to detect that race before
- * releasing the page from which we are coupling, else we can't restart
- * the movement.
- */
- if (ret == 0 && prev_race && WT_PAGE_IS_INTERNAL(want->page) &&
- __wt_split_prev_race(session, want)) {
- ret = WT_RESTART;
- WT_TRET(__wt_page_release(session, want, flags));
- }
-
- /*
* Expected failures: page not found or restart. Our callers list the
* errors they're expecting to handle.
*/
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index 1299d3e90e3..8afedb30832 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -120,11 +120,11 @@ struct __wt_cache {
double eviction_checkpoint_target;/* Percent to reduce dirty
to during checkpoint scrubs */
- double eviction_scrub_limit; /* Percent of cache to trigger
- dirty eviction during checkpoint
- scrubs */
+ double eviction_scrub_target; /* Current scrub target */
u_int overhead_pct; /* Cache percent adjustment */
+ uint64_t cache_max_wait_us; /* Maximum time an operation waits for
+ * space in cache */
/*
* Eviction thread tuning information.
@@ -149,7 +149,6 @@ struct __wt_cache {
WT_SPINLOCK evict_pass_lock; /* Eviction pass lock */
WT_SESSION_IMPL *walk_session; /* Eviction pass session */
WT_DATA_HANDLE *walk_tree; /* LRU walk current tree */
- uint32_t walk_progress, walk_target;/* Progress in current tree */
WT_SPINLOCK evict_queue_lock; /* Eviction current queue lock */
WT_EVICT_QUEUE evict_queues[WT_EVICT_QUEUE_MAX];
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
index fc127942d02..2e3700f6287 100644
--- a/src/third_party/wiredtiger/src/include/cache.i
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -220,9 +220,9 @@ __wt_cache_update_lookaside_score(
global_score = cache->evict_lookaside_score;
if (score > global_score && global_score < 100)
- __wt_atomic_addi32(&cache->evict_lookaside_score, 1);
+ (void)__wt_atomic_addi32(&cache->evict_lookaside_score, 1);
else if (score < global_score && global_score > 0)
- __wt_atomic_subi32(&cache->evict_lookaside_score, 1);
+ (void)__wt_atomic_subi32(&cache->evict_lookaside_score, 1);
}
/*
@@ -276,6 +276,22 @@ __wt_eviction_clean_needed(WT_SESSION_IMPL *session, double *pct_fullp)
}
/*
+ * __wt_eviction_dirty_target --
+ * Return the effective dirty target (including checkpoint scrubbing).
+ */
+static inline double
+__wt_eviction_dirty_target(WT_CACHE *cache)
+{
+ double dirty_target, scrub_target;
+
+ dirty_target = cache->eviction_dirty_target;
+ scrub_target = cache->eviction_scrub_target;
+
+ return (scrub_target > 0 && scrub_target < dirty_target ?
+ scrub_target : dirty_target);
+}
+
+/*
* __wt_eviction_dirty_needed --
* Return if an application thread should do eviction due to the total
* volume of dirty data in cache.
@@ -284,7 +300,6 @@ static inline bool
__wt_eviction_dirty_needed(WT_SESSION_IMPL *session, double *pct_fullp)
{
WT_CACHE *cache;
- double dirty_trigger;
uint64_t dirty_inuse, bytes_max;
cache = S2C(session)->cache;
@@ -299,10 +314,8 @@ __wt_eviction_dirty_needed(WT_SESSION_IMPL *session, double *pct_fullp)
if (pct_fullp != NULL)
*pct_fullp = ((100.0 * dirty_inuse) / bytes_max);
- if ((dirty_trigger = cache->eviction_scrub_limit) < 1.0)
- dirty_trigger = cache->eviction_dirty_trigger;
-
- return (dirty_inuse > (uint64_t)(dirty_trigger * bytes_max) / 100);
+ return (dirty_inuse > (uint64_t)(
+ cache->eviction_dirty_trigger * bytes_max) / 100);
}
/*
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
index c160f84b870..214d13b0206 100644
--- a/src/third_party/wiredtiger/src/include/cell.i
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -569,7 +569,7 @@ __wt_cell_unpack_safe(
if (start != NULL && \
((uint8_t *)(t) < (uint8_t *)start || \
(((uint8_t *)(t)) + (len)) > (uint8_t *)end)) \
- return (WT_ERROR); \
+ return (WT_ERROR); \
} while (0)
restart:
@@ -692,7 +692,7 @@ restart:
unpack->__len = WT_PTRDIFF32(p, cell);
break;
default:
- return (WT_ERROR); /* Unknown cell type. */
+ return (WT_ERROR); /* Unknown cell type. */
}
/*
@@ -778,7 +778,7 @@ __cell_data_ref(WT_SESSION_IMPL *session,
return (0);
huffman = btree->huffman_value;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, unpack->type);
}
return (huffman == NULL || store->size == 0 ? 0 :
diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h
index 06847117b7d..8feefd4201d 100644
--- a/src/third_party/wiredtiger/src/include/config.h
+++ b/src/third_party/wiredtiger/src/include/config.h
@@ -50,6 +50,7 @@ struct __wt_config_parser_impl {
"", 0, 0, WT_CONFIG_ITEM_NUM \
}
+#define WT_CONFIG_UNSET -1
/*
* DO NOT EDIT: automatically built by dist/api_config.py.
* configuration section: BEGIN
@@ -84,29 +85,30 @@ struct __wt_config_parser_impl {
#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 27
#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 28
#define WT_CONFIG_ENTRY_WT_SESSION_prepare_transaction 29
-#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 30
-#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 31
-#define WT_CONFIG_ENTRY_WT_SESSION_rename 32
-#define WT_CONFIG_ENTRY_WT_SESSION_reset 33
-#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 34
-#define WT_CONFIG_ENTRY_WT_SESSION_salvage 35
-#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 36
-#define WT_CONFIG_ENTRY_WT_SESSION_strerror 37
-#define WT_CONFIG_ENTRY_WT_SESSION_timestamp_transaction 38
-#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 39
-#define WT_CONFIG_ENTRY_WT_SESSION_truncate 40
-#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 41
-#define WT_CONFIG_ENTRY_WT_SESSION_verify 42
-#define WT_CONFIG_ENTRY_colgroup_meta 43
-#define WT_CONFIG_ENTRY_file_config 44
-#define WT_CONFIG_ENTRY_file_meta 45
-#define WT_CONFIG_ENTRY_index_meta 46
-#define WT_CONFIG_ENTRY_lsm_meta 47
-#define WT_CONFIG_ENTRY_table_meta 48
-#define WT_CONFIG_ENTRY_wiredtiger_open 49
-#define WT_CONFIG_ENTRY_wiredtiger_open_all 50
-#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 51
-#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 52
+#define WT_CONFIG_ENTRY_WT_SESSION_query_timestamp 30
+#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 31
+#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 32
+#define WT_CONFIG_ENTRY_WT_SESSION_rename 33
+#define WT_CONFIG_ENTRY_WT_SESSION_reset 34
+#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 35
+#define WT_CONFIG_ENTRY_WT_SESSION_salvage 36
+#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 37
+#define WT_CONFIG_ENTRY_WT_SESSION_strerror 38
+#define WT_CONFIG_ENTRY_WT_SESSION_timestamp_transaction 39
+#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 40
+#define WT_CONFIG_ENTRY_WT_SESSION_truncate 41
+#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 42
+#define WT_CONFIG_ENTRY_WT_SESSION_verify 43
+#define WT_CONFIG_ENTRY_colgroup_meta 44
+#define WT_CONFIG_ENTRY_file_config 45
+#define WT_CONFIG_ENTRY_file_meta 46
+#define WT_CONFIG_ENTRY_index_meta 47
+#define WT_CONFIG_ENTRY_lsm_meta 48
+#define WT_CONFIG_ENTRY_table_meta 49
+#define WT_CONFIG_ENTRY_wiredtiger_open 50
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 51
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 52
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 53
/*
* configuration section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index d0bebe8da5d..c3af948f928 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -413,34 +413,35 @@ struct __wt_connection_impl {
#define WT_VERB_CHECKPOINT 0x000000004u
#define WT_VERB_CHECKPOINT_PROGRESS 0x000000008u
#define WT_VERB_COMPACT 0x000000010u
-#define WT_VERB_EVICT 0x000000020u
-#define WT_VERB_EVICTSERVER 0x000000040u
-#define WT_VERB_EVICT_STUCK 0x000000080u
-#define WT_VERB_FILEOPS 0x000000100u
-#define WT_VERB_HANDLEOPS 0x000000200u
-#define WT_VERB_LOG 0x000000400u
-#define WT_VERB_LOOKASIDE 0x000000800u
-#define WT_VERB_LOOKASIDE_ACTIVITY 0x000001000u
-#define WT_VERB_LSM 0x000002000u
-#define WT_VERB_LSM_MANAGER 0x000004000u
-#define WT_VERB_METADATA 0x000008000u
-#define WT_VERB_MUTEX 0x000010000u
-#define WT_VERB_OVERFLOW 0x000020000u
-#define WT_VERB_READ 0x000040000u
-#define WT_VERB_REBALANCE 0x000080000u
-#define WT_VERB_RECONCILE 0x000100000u
-#define WT_VERB_RECOVERY 0x000200000u
-#define WT_VERB_RECOVERY_PROGRESS 0x000400000u
-#define WT_VERB_SALVAGE 0x000800000u
-#define WT_VERB_SHARED_CACHE 0x001000000u
-#define WT_VERB_SPLIT 0x002000000u
-#define WT_VERB_TEMPORARY 0x004000000u
-#define WT_VERB_THREAD_GROUP 0x008000000u
-#define WT_VERB_TIMESTAMP 0x010000000u
-#define WT_VERB_TRANSACTION 0x020000000u
-#define WT_VERB_VERIFY 0x040000000u
-#define WT_VERB_VERSION 0x080000000u
-#define WT_VERB_WRITE 0x100000000u
+#define WT_VERB_ERROR_RETURNS 0x000000020u
+#define WT_VERB_EVICT 0x000000040u
+#define WT_VERB_EVICTSERVER 0x000000080u
+#define WT_VERB_EVICT_STUCK 0x000000100u
+#define WT_VERB_FILEOPS 0x000000200u
+#define WT_VERB_HANDLEOPS 0x000000400u
+#define WT_VERB_LOG 0x000000800u
+#define WT_VERB_LOOKASIDE 0x000001000u
+#define WT_VERB_LOOKASIDE_ACTIVITY 0x000002000u
+#define WT_VERB_LSM 0x000004000u
+#define WT_VERB_LSM_MANAGER 0x000008000u
+#define WT_VERB_METADATA 0x000010000u
+#define WT_VERB_MUTEX 0x000020000u
+#define WT_VERB_OVERFLOW 0x000040000u
+#define WT_VERB_READ 0x000080000u
+#define WT_VERB_REBALANCE 0x000100000u
+#define WT_VERB_RECONCILE 0x000200000u
+#define WT_VERB_RECOVERY 0x000400000u
+#define WT_VERB_RECOVERY_PROGRESS 0x000800000u
+#define WT_VERB_SALVAGE 0x001000000u
+#define WT_VERB_SHARED_CACHE 0x002000000u
+#define WT_VERB_SPLIT 0x004000000u
+#define WT_VERB_TEMPORARY 0x008000000u
+#define WT_VERB_THREAD_GROUP 0x010000000u
+#define WT_VERB_TIMESTAMP 0x020000000u
+#define WT_VERB_TRANSACTION 0x040000000u
+#define WT_VERB_VERIFY 0x080000000u
+#define WT_VERB_VERSION 0x100000000u
+#define WT_VERB_WRITE 0x200000000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint64_t verbose;
@@ -458,6 +459,7 @@ struct __wt_connection_impl {
#define WT_TIMING_STRESS_SPLIT_5 0x040u
#define WT_TIMING_STRESS_SPLIT_6 0x080u
#define WT_TIMING_STRESS_SPLIT_7 0x100u
+#define WT_TIMING_STRESS_SPLIT_8 0x200u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint64_t timing_stress_flags;
@@ -472,30 +474,32 @@ struct __wt_connection_impl {
WT_FILE_SYSTEM *file_system;
/* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_CONN_CACHE_CURSORS 0x000001u
-#define WT_CONN_CACHE_POOL 0x000002u
-#define WT_CONN_CKPT_SYNC 0x000004u
-#define WT_CONN_CLOSING 0x000008u
-#define WT_CONN_CLOSING_NO_MORE_OPENS 0x000010u
-#define WT_CONN_CLOSING_TIMESTAMP 0x000020u
-#define WT_CONN_COMPATIBILITY 0x000040u
-#define WT_CONN_EVICTION_NO_LOOKASIDE 0x000080u
-#define WT_CONN_EVICTION_RUN 0x000100u
-#define WT_CONN_IN_MEMORY 0x000200u
-#define WT_CONN_LEAK_MEMORY 0x000400u
-#define WT_CONN_LOOKASIDE_OPEN 0x000800u
-#define WT_CONN_LSM_MERGE 0x001000u
-#define WT_CONN_OPTRACK 0x002000u
-#define WT_CONN_PANIC 0x004000u
-#define WT_CONN_READONLY 0x008000u
-#define WT_CONN_RECOVERING 0x010000u
-#define WT_CONN_SERVER_ASYNC 0x020000u
-#define WT_CONN_SERVER_CHECKPOINT 0x040000u
-#define WT_CONN_SERVER_LOG 0x080000u
-#define WT_CONN_SERVER_LSM 0x100000u
-#define WT_CONN_SERVER_STATISTICS 0x200000u
-#define WT_CONN_SERVER_SWEEP 0x400000u
-#define WT_CONN_WAS_BACKUP 0x800000u
+#define WT_CONN_CACHE_CURSORS 0x0000001u
+#define WT_CONN_CACHE_POOL 0x0000002u
+#define WT_CONN_CKPT_SYNC 0x0000004u
+#define WT_CONN_CLOSING 0x0000008u
+#define WT_CONN_CLOSING_NO_MORE_OPENS 0x0000010u
+#define WT_CONN_CLOSING_TIMESTAMP 0x0000020u
+#define WT_CONN_COMPATIBILITY 0x0000040u
+#define WT_CONN_DATA_CORRUPTION 0x0000080u
+#define WT_CONN_EVICTION_NO_LOOKASIDE 0x0000100u
+#define WT_CONN_EVICTION_RUN 0x0000200u
+#define WT_CONN_IN_MEMORY 0x0000400u
+#define WT_CONN_LEAK_MEMORY 0x0000800u
+#define WT_CONN_LOOKASIDE_OPEN 0x0001000u
+#define WT_CONN_LSM_MERGE 0x0002000u
+#define WT_CONN_OPTRACK 0x0004000u
+#define WT_CONN_PANIC 0x0008000u
+#define WT_CONN_READONLY 0x0010000u
+#define WT_CONN_RECOVERING 0x0020000u
+#define WT_CONN_SALVAGE 0x0040000u
+#define WT_CONN_SERVER_ASYNC 0x0080000u
+#define WT_CONN_SERVER_CHECKPOINT 0x0100000u
+#define WT_CONN_SERVER_LOG 0x0200000u
+#define WT_CONN_SERVER_LSM 0x0400000u
+#define WT_CONN_SERVER_STATISTICS 0x0800000u
+#define WT_CONN_SERVER_SWEEP 0x1000000u
+#define WT_CONN_WAS_BACKUP 0x2000000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index e84921ad035..36fb2d92ee1 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -228,10 +228,11 @@ struct __wt_cursor_btree {
#define WT_CBT_ITERATE_NEXT 0x004u /* Next iteration configuration */
#define WT_CBT_ITERATE_PREV 0x008u /* Prev iteration configuration */
#define WT_CBT_NO_TXN 0x010u /* Non-txn cursor (e.g. a checkpoint) */
-#define WT_CBT_RETRY_NEXT 0x020u /* Next, resulted in prepare conflict */
-#define WT_CBT_RETRY_PREV 0x040u /* Prev, resulted in prepare conflict */
-#define WT_CBT_SEARCH_SMALLEST 0x080u /* Row-store: small-key insert list */
-#define WT_CBT_VAR_ONPAGE_MATCH 0x100u /* Var-store: on-page recno match */
+#define WT_CBT_READ_ONCE 0x020u /* Page in with WT_READ_WONT_NEED */
+#define WT_CBT_RETRY_NEXT 0x040u /* Next, resulted in prepare conflict */
+#define WT_CBT_RETRY_PREV 0x080u /* Prev, resulted in prepare conflict */
+#define WT_CBT_SEARCH_SMALLEST 0x100u /* Row-store: small-key insert list */
+#define WT_CBT_VAR_ONPAGE_MATCH 0x200u /* Var-store: on-page recno match */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
#define WT_CBT_POSITION_MASK /* Flags associated with position */ \
diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h
index f94c3d7b880..1d2d21617a6 100644
--- a/src/third_party/wiredtiger/src/include/error.h
+++ b/src/third_party/wiredtiger/src/include/error.h
@@ -19,6 +19,13 @@
#define WT_DIAGNOSTIC_YIELD
#endif
+#define __wt_err(session, error, ...) \
+ __wt_err_func(session, error, __func__, __LINE__, __VA_ARGS__)
+#define __wt_errx(session, ...) \
+ __wt_errx_func(session, __func__, __LINE__, __VA_ARGS__)
+#define __wt_set_return(session, error) \
+ __wt_set_return_func(session, __func__, __LINE__, error)
+
/* Set "ret" and branch-to-err-label tests. */
#define WT_ERR(a) do { \
if ((ret = (a)) != 0) \
@@ -89,18 +96,18 @@
#define WT_TRET_BUSY_OK(a) WT_TRET_ERROR_OK(a, EBUSY)
#define WT_TRET_NOTFOUND_OK(a) WT_TRET_ERROR_OK(a, WT_NOTFOUND)
+/* Called on unexpected code path: locate the failure. */
+#define __wt_illegal_value(session, v) \
+ __wt_illegal_value_func(session, (uintmax_t)(v), __func__, __LINE__)
+
/* Return and branch-to-err-label cases for switch statements. */
-#define WT_ILLEGAL_VALUE(session) \
+#define WT_ILLEGAL_VALUE(session, v) \
default: \
- return (__wt_illegal_value(session, NULL))
-#define WT_ILLEGAL_VALUE_ERR(session) \
+ return (__wt_illegal_value(session, v))
+#define WT_ILLEGAL_VALUE_ERR(session, v) \
default: \
- ret = __wt_illegal_value(session, NULL); \
+ ret = __wt_illegal_value(session, v); \
goto err
-#define WT_ILLEGAL_VALUE_SET(session) \
- default: \
- ret = __wt_illegal_value(session, NULL); \
- break
#define WT_PANIC_MSG(session, v, ...) do { \
__wt_err(session, v, __VA_ARGS__); \
@@ -125,8 +132,10 @@
*/
#ifdef HAVE_DIAGNOSTIC
#define WT_ASSERT(session, exp) do { \
- if (!(exp)) \
- __wt_assert(session, 0, __func__, __LINE__, "%s", #exp);\
+ if (!(exp)) { \
+ __wt_errx(session, "%s", #exp); \
+ __wt_abort(session); \
+ } \
} while (0)
#else
#define WT_ASSERT(session, exp) \
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index cfb9a0e6f22..0e3bc68fcc6 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -58,7 +58,7 @@ extern int __wt_block_manager_named_size(WT_SESSION_IMPL *session, const char *n
extern int __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bm_corrupt(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_block_read_off_blind(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_block_read_off_blind(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, uint32_t *sizep, uint32_t *checksump) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t checksum) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext);
@@ -146,7 +146,7 @@ extern bool __wt_btree_immediately_durable(WT_SESSION_IMPL *session) WT_GCC_FUNC
extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool checkpoint_io, bool compressed) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, size_t *compressed_sizep, bool checkpoint, bool checkpoint_io, bool compressed) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern const char *__wt_page_type_string(u_int type) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern const char *__wt_cell_type_string(uint8_t type);
extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf);
@@ -158,12 +158,12 @@ extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUN
extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
+ , const char *func, int line
#endif
);
extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -210,13 +210,12 @@ extern void __wt_las_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint3
extern int __wt_las_cursor_close(WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_insert_block(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_remove_block(WT_SESSION_IMPL *session, uint64_t pageid, bool lock_wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_save_dropped(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
-extern void __wt_checksum_init(void) WT_GCC_FUNC_DECL_ATTRIBUTE((cold));
+extern uint32_t __wt_checksum_sw(const void *chunk, size_t len);
extern void __wt_config_initn(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
extern void __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str);
extern void __wt_config_subinit(WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item);
@@ -355,7 +354,7 @@ extern int __wt_cursor_cache(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle) WT_GCC_
extern void __wt_cursor_reopen(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle);
extern int __wt_cursor_cache_release(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool *released) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *to_dup, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_cursor_close(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_cursor_close(WT_CURSOR *cursor);
extern int __wt_cursor_equals(WT_CURSOR *cursor, WT_CURSOR *other, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -383,7 +382,7 @@ extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v);
extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session);
extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing, uint32_t previous_state) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session);
extern int __wt_log_printf(WT_SESSION_IMPL *session, const char *format, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
@@ -392,7 +391,7 @@ extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
-extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, bool active_only) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_log_get_backup_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, bool active_only) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_extract_lognum(WT_SESSION_IMPL *session, const char *name, uint32_t *id) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_reset(WT_SESSION_IMPL *session, uint32_t lognum) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_fill(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool force, WT_ITEM *record, WT_LSN *lsnp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -498,6 +497,7 @@ extern int __wt_lsm_work_switch(WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **ent
extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_lsm_chunk_visible_all(WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_lsm_work_enable_evict(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_lsm_worker_stop(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -524,6 +524,7 @@ extern int __wt_metadata_insert(WT_SESSION_IMPL *session, const char *key, const
extern int __wt_metadata_update(WT_SESSION_IMPL *session, const char *key, const char *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char **valuep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_metadata_salvage(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_meta_track_discard(WT_SESSION_IMPL *session);
extern int __wt_meta_track_on(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -548,7 +549,7 @@ extern int __wt_nfilename(WT_SESSION_IMPL *session, const char *name, size_t nam
extern int __wt_filename_construct(WT_SESSION_IMPL *session, const char *path, const char *file_prefix, uintmax_t id_1, uint32_t id_2, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name, bool durable) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_copy_and_sync(WT_SESSION *wt_session, const char *from, const char *to) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -598,7 +599,7 @@ extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
+extern uint32_t __wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize);
extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -675,25 +676,17 @@ extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_
extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep);
extern void __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler);
-extern void __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
-extern void __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3)));
+extern void __wt_err_func(WT_SESSION_IMPL *session, int error, const char *func, int line, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 5, 6))) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
+extern void __wt_errx_func(WT_SESSION_IMPL *session, const char *func, int line, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 4, 5))) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
+extern int __wt_set_return_func(WT_SESSION_IMPL *session, const char*func, int line, int err) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ext_err_printf(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_verbose_worker(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((cold));
extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ext_msg_printf(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern const char *__wt_ext_strerror(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, int error);
extern int __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void
-__wt_assert(WT_SESSION_IMPL *session,
- int error, const char *file_name, int line_number, const char *fmt, ...)
- WT_GCC_FUNC_DECL_ATTRIBUTE((cold))
- WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 5, 6)))
-#ifdef HAVE_DIAGNOSTIC
- WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn))
-#endif
- WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern int __wt_panic(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_illegal_value_func(WT_SESSION_IMPL *session, const char *tag, const char *file, int line) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_illegal_value_func(WT_SESSION_IMPL *session, uintmax_t v, const char *func, int line) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_inmem_unsupported_op(WT_SESSION_IMPL *session, const char *tag) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -716,13 +709,14 @@ extern uint64_t __wt_hash_fnv64(const void *string, size_t len);
extern int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
+ , const char *func, int line
#endif
);
extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_hazard_close(WT_SESSION_IMPL *session);
extern WT_HAZARD *__wt_hazard_check(WT_SESSION_IMPL *session, WT_REF *ref);
extern u_int __wt_hazard_count(WT_SESSION_IMPL *session, WT_REF *ref);
+extern bool __wt_hazard_check_assert(WT_SESSION_IMPL *session, void *ref, bool waitfor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_fill_hex(const uint8_t *src, size_t src_max, uint8_t *dest, size_t dest_max, size_t *lenp);
extern int __wt_raw_to_hex(WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_raw_to_esc_hex(WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -764,7 +758,7 @@ extern const char *__wt_buf_set_size(WT_SESSION_IMPL *session, uint64_t size, bo
extern int
__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
+ , const char *func, int line
#endif
)
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
@@ -849,7 +843,7 @@ extern int __wt_timestamp_to_hex_string(WT_SESSION_IMPL *session, char *hex_time
extern void __wt_verbose_timestamp(WT_SESSION_IMPL *session, const wt_timestamp_t *ts, const char *msg);
extern int __wt_txn_parse_timestamp_raw(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_txn_global_query_timestamp(WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_query_timestamp(WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[], bool global_txn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *ts, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -860,3 +854,4 @@ extern void __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session);
extern void __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session);
extern void __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session);
extern void __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session);
+extern void __wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session);
diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h
index 8b92d99d4f1..94bc1e78597 100644
--- a/src/third_party/wiredtiger/src/include/extern_posix.h
+++ b/src/third_party/wiredtiger/src/include/extern_posix.h
@@ -27,9 +27,10 @@ extern void __wt_stream_set_no_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visi
extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern int __wt_vsnprintf_len_incr(char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t *tid) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_thread_id(uintmax_t *id) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern int __wt_thread_str(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uintmax_t __wt_process_id(void);
extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp);
+extern int __wt_localtime(WT_SESSION_IMPL *session, const time_t *timep, struct tm *result) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_yield(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
diff --git a/src/third_party/wiredtiger/src/include/extern_win.h b/src/third_party/wiredtiger/src/include/extern_win.h
index 50808750c56..9356b844be0 100644
--- a/src/third_party/wiredtiger/src/include/extern_win.h
+++ b/src/third_party/wiredtiger/src/include/extern_win.h
@@ -25,11 +25,12 @@ extern void __wt_stream_set_no_buffer(FILE *fp);
extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds);
extern int __wt_vsnprintf_len_incr(char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t *tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_thread_id(uintmax_t *id);
extern int __wt_thread_str(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uintmax_t __wt_process_id(void);
extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp);
+extern int __wt_localtime(WT_SESSION_IMPL *session, const time_t *timep, struct tm *result) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_to_utf16_string(WT_SESSION_IMPL *session, const char *utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_to_utf8_string(WT_SESSION_IMPL *session, const wchar_t *wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern DWORD __wt_getlasterror(void);
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
index 0605c458673..d3e36d37da9 100644
--- a/src/third_party/wiredtiger/src/include/log.h
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -87,6 +87,12 @@ union __wt_lsn {
#define WT_LOGC_KEY_FORMAT WT_UNCHECKED_STRING(III)
#define WT_LOGC_VALUE_FORMAT WT_UNCHECKED_STRING(qIIIuu)
+/*
+ * Size range for the log files.
+ */
+#define WT_LOG_FILE_MAX ((int64_t)2 * WT_GIGABYTE)
+#define WT_LOG_FILE_MIN (100 * WT_KILOBYTE)
+
#define WT_LOG_SKIP_HEADER(data) \
((const uint8_t *)(data) + offsetof(WT_LOG_RECORD, record))
#define WT_LOG_REC_SIZE(size) \
@@ -317,9 +323,15 @@ struct __wt_log_record {
/*
* No automatic generation: flag values cannot change, they're written
* to disk.
+ *
+ * Unused bits in the flags, as well as the 'unused' padding,
+ * are expected to be zeroed; we check that to help detect file
+ * corruption.
*/
#define WT_LOG_RECORD_COMPRESSED 0x01u /* Compressed except hdr */
#define WT_LOG_RECORD_ENCRYPTED 0x02u /* Encrypted except hdr */
+#define WT_LOG_RECORD_ALL_FLAGS \
+ (WT_LOG_RECORD_COMPRESSED | WT_LOG_RECORD_ENCRYPTED)
uint16_t flags; /* 08-09: Flags */
uint8_t unused[2]; /* 10-11: Padding */
uint32_t mem_len; /* 12-15: Uncompressed len if needed */
diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h
index f515e03519a..67ef28757ef 100644
--- a/src/third_party/wiredtiger/src/include/lsm.h
+++ b/src/third_party/wiredtiger/src/include/lsm.h
@@ -107,7 +107,8 @@ struct __wt_lsm_chunk {
uint32_t id; /* ID used to generate URIs */
uint32_t generation; /* Merge generation */
uint32_t refcnt; /* Number of worker thread references */
- uint32_t bloom_busy; /* Number of worker thread references */
+ uint32_t bloom_busy; /* Currently creating bloom filter */
+ uint32_t evict_enabled; /* Eviction allowed on the chunk */
int8_t empty; /* 1/0: checkpoint missing */
int8_t evicted; /* 1/0: in-memory chunk was evicted */
@@ -129,13 +130,19 @@ struct __wt_lsm_chunk {
* is required.
*/
/* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_LSM_WORK_BLOOM 0x01u /* Create a bloom filter */
-#define WT_LSM_WORK_DROP 0x02u /* Drop unused chunks */
-#define WT_LSM_WORK_FLUSH 0x04u /* Flush a chunk to disk */
-#define WT_LSM_WORK_MERGE 0x08u /* Look for a tree merge */
-#define WT_LSM_WORK_SWITCH 0x10u /* Switch to new in-memory chunk */
+#define WT_LSM_WORK_BLOOM 0x01u /* Create a bloom filter */
+#define WT_LSM_WORK_DROP 0x02u /* Drop unused chunks */
+#define WT_LSM_WORK_ENABLE_EVICT 0x04u /* Create a bloom filter */
+#define WT_LSM_WORK_FLUSH 0x08u /* Flush a chunk to disk */
+#define WT_LSM_WORK_MERGE 0x10u /* Look for a tree merge */
+#define WT_LSM_WORK_SWITCH 0x20u /* Switch the in-memory chunk */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
+/* Work units that are serviced by general worker threads. */
+#define WT_LSM_WORK_GENERAL_OPS \
+ (WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_ENABLE_EVICT |\
+ WT_LSM_WORK_FLUSH | WT_LSM_WORK_SWITCH)
+
/*
* WT_LSM_WORK_UNIT --
* A definition of maintenance that an LSM tree needs done.
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
index 6ca2bb27832..3c028d1484b 100644
--- a/src/third_party/wiredtiger/src/include/meta.h
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -24,6 +24,7 @@
#define WT_METADATA_URI "metadata:" /* Metadata alias */
#define WT_METAFILE "WiredTiger.wt" /* Metadata table */
+#define WT_METAFILE_SLVG "WiredTiger.wt.orig" /* Metadata copy */
#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */
#define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI*/
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index 1d2a5075018..8c6af3ca14c 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -187,6 +187,15 @@
} while (0)
/*
+ * Some C compiler address sanitizers complain if qsort is passed a NULL base
+ * reference, even if there are no elements to compare (note zero elements is
+ * allowed by the IEEE Std 1003.1-2017 standard). Avoid the complaint.
+ */
+#define __wt_qsort(base, nmemb, size, compar) \
+ if ((nmemb) != 0) \
+ qsort(base, nmemb, size, compar)
+
+/*
* Binary search for an integer key.
*/
#define WT_BINARY_SEARCH(key, arrayp, n, found) do { \
@@ -227,9 +236,12 @@
} while (0)
/*
- * Check if a variable string equals a constant string. Inline the common
- * case for WiredTiger of a single byte string. This is required because not
- * all compilers optimize this case in strcmp (e.g., clang).
+ * Check if a variable string equals a constant string. Inline the common case
+ * for WiredTiger of a single byte string. This is required because not all
+ * compilers optimize this case in strcmp (e.g., clang). While this macro works
+ * in the case of comparing two pointers (a sizeof operator on a pointer won't
+ * equal 2 and the extra code will be discarded at compile time), that's not its
+ * purpose.
*/
#define WT_STREQ(s, cs) \
(sizeof(cs) == 2 ? (s)[0] == (cs)[0] && (s)[1] == '\0' : \
@@ -294,22 +306,17 @@ typedef void wt_timestamp_t;
__wt_scr_alloc_func(session, size, scratchp, __func__, __LINE__)
#define __wt_page_in(session, ref, flags) \
__wt_page_in_func(session, ref, flags, __func__, __LINE__)
-#define __wt_page_swap(session, held, want, prev_race, flags) \
- __wt_page_swap_func( \
- session, held, want, prev_race, flags, __func__, __LINE__)
+#define __wt_page_swap(session, held, want, flags) \
+ __wt_page_swap_func(session, held, want, flags, __func__, __LINE__)
#else
#define __wt_scr_alloc(session, size, scratchp) \
__wt_scr_alloc_func(session, size, scratchp)
#define __wt_page_in(session, ref, flags) \
__wt_page_in_func(session, ref, flags)
-#define __wt_page_swap(session, held, want, prev_race, flags) \
- __wt_page_swap_func(session, held, want, prev_race, flags)
+#define __wt_page_swap(session, held, want, flags) \
+ __wt_page_swap_func(session, held, want, flags)
#endif
-/* Called on unexpected code path: locate the failure. */
-#define __wt_illegal_value(session, msg) \
- __wt_illegal_value_func(session, msg, __func__, __LINE__)
-
/* Random number generator state. */
union __wt_rand_state {
uint64_t v;
@@ -337,5 +344,56 @@ union __wt_rand_state {
}
#define WT_TAILQ_SAFE_REMOVE_END }
-/* Sleep time to uncover race conditions during timing stress test. */
-#define TIMING_STRESS_TEST_SLEEP (100 * WT_THOUSAND)
+/*
+ * WT_VA_ARGS_BUF_FORMAT --
+ * Format into a scratch buffer, extending it as necessary. This is a
+ * macro because we need to repeatedly call va_start/va_end and there's no
+ * way to do that inside a function call.
+ */
+#define WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, concatenate) do { \
+ size_t __len, __space; \
+ va_list __ap; \
+ int __ret_xx; /* __ret already used by WT_RET */ \
+ char *__p; \
+ \
+ /* \
+ * This macro is used to both initialize and concatenate into a \
+ * buffer. If not concatenating, clear the size so we don't use \
+ * any existing contents. \
+ */ \
+ if (!(concatenate)) \
+ (buf)->size = 0; \
+ for (;;) { \
+ WT_ASSERT(session, (buf)->memsize >= (buf)->size); \
+ __p = (char *)((uint8_t *)(buf)->mem + (buf)->size); \
+ __space = (buf)->memsize - (buf)->size; \
+ \
+ /* Format into the buffer. */ \
+ va_start(__ap, fmt); \
+ __ret_xx = __wt_vsnprintf_len_set( \
+ __p, __space, &__len, fmt, __ap); \
+ va_end(__ap); \
+ WT_RET(__ret_xx); \
+ \
+ /* Check if there was enough space. */ \
+ if (__len < __space) { \
+ (buf)->data = (buf)->mem; \
+ (buf)->size += __len; \
+ break; \
+ } \
+ \
+ /* \
+ * If not, double the size of the buffer: we're dealing \
+ * with strings, we don't expect the size to get huge. \
+ */ \
+ WT_RET(__wt_buf_extend( \
+ session, buf, (buf)->size + __len + 1)); \
+ } \
+} while (0)
+
+/*
+ * HAVE_LONG_RUNNING_PREPARE
+ * To enable functionality of evicting prepared transactions using
+ * cache overflow mechanism.
+ */
+#undef HAVE_LONG_RUNNING_PREPARE
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
index 0250479af4a..5c9f95bc08a 100644
--- a/src/third_party/wiredtiger/src/include/misc.i
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -248,3 +248,37 @@ __wt_spin_backoff(uint64_t *yield_count, uint64_t *sleep_usecs)
(*sleep_usecs) = WT_MIN((*sleep_usecs) + 100, WT_THOUSAND);
__wt_sleep(0, (*sleep_usecs));
}
+
+ /* Maximum stress delay is 1/10 of a second. */
+#define WT_TIMING_STRESS_MAX_DELAY (100000)
+
+/*
+ * __wt_timing_stress --
+ * Optionally add delay to stress code paths.
+ */
+static inline void
+__wt_timing_stress(WT_SESSION_IMPL *session, u_int flag)
+{
+ uint64_t i;
+
+ /* Optionally only sleep when a specified configuration flag is set. */
+ if (flag != 0 && !FLD_ISSET(S2C(session)->timing_stress_flags, flag))
+ return;
+
+ /*
+ * We need a fast way to choose a sleep time. We want to sleep a short
+ * period most of the time, but occasionally wait longer. Divide the
+ * maximum period of time into 10 buckets (where bucket 0 doesn't sleep
+ * at all), and roll dice, advancing to the next bucket 50% of the time.
+ * That means we'll hit the maximum roughly every 1K calls.
+ */
+ for (i = 0;;)
+ if (__wt_random(&session->rnd) & 0x1 || ++i > 9)
+ break;
+
+ if (i == 0)
+ __wt_yield();
+ else
+ /* The default maximum delay is 1/10th of a second. */
+ __wt_sleep(0, i * (WT_TIMING_STRESS_MAX_DELAY / 10));
+}
diff --git a/src/third_party/wiredtiger/src/include/os_fhandle.i b/src/third_party/wiredtiger/src/include/os_fhandle.i
index 7c09a83132c..78d01abca4b 100644
--- a/src/third_party/wiredtiger/src/include/os_fhandle.i
+++ b/src/third_party/wiredtiger/src/include/os_fhandle.i
@@ -72,7 +72,7 @@ __wt_fextend(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset)
if (handle->fh_extend != NULL)
return (handle->fh_extend(
handle, (WT_SESSION *)session, offset));
- return (ENOTSUP);
+ return (__wt_set_return(session, ENOTSUP));
}
/*
@@ -157,7 +157,7 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset)
if (handle->fh_truncate != NULL)
return (handle->fh_truncate(
handle, (WT_SESSION *)session, offset));
- return (ENOTSUP);
+ return (__wt_set_return(session, ENOTSUP));
}
/*
diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i
index 34a1bb62edb..6e5ea92b54d 100644
--- a/src/third_party/wiredtiger/src/include/packing.i
+++ b/src/third_party/wiredtiger/src/include/packing.i
@@ -245,7 +245,7 @@ next: if (pack->cur == pack->end)
(pv).u.u = va_arg(ap, uint64_t); \
break; \
/* User format strings have already been validated. */ \
- WT_ILLEGAL_VALUE(session); \
+ WT_ILLEGAL_VALUE(session, (pv).type); \
} \
} while (0)
@@ -612,7 +612,7 @@ __unpack_read(WT_SESSION_IMPL *session,
*va_arg(ap, uint64_t *) = (pv).u.u; \
break; \
/* User format strings have already been validated. */ \
- WT_ILLEGAL_VALUE(session); \
+ WT_ILLEGAL_VALUE(session, (pv).type); \
} \
} while (0)
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index e102d7f5057..10ff7bd48dc 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -25,7 +25,7 @@ struct __wt_data_handle_cache {
struct __wt_hazard {
WT_REF *ref; /* Page reference */
#ifdef HAVE_DIAGNOSTIC
- const char *file; /* File/line where hazard acquired */
+ const char *func; /* Function/line hazard acquired */
int line;
#endif
};
@@ -120,7 +120,7 @@ struct __wt_session_impl {
* to applications, create a parallel structure instead.
*/
struct __wt_scratch_track {
- const char *file; /* Allocating file, line */
+ const char *func; /* Allocating function, line */
int line;
} *scratch_track;
#endif
@@ -141,6 +141,8 @@ struct __wt_session_impl {
u_int ckpt_handle_next; /* Next empty slot */
size_t ckpt_handle_allocated; /* Bytes allocated */
+ uint64_t cache_wait_us; /* Wait time for cache for current operation */
+
/*
* Operations acting on handles.
*
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 77e0fa85b0f..1ae4e56be03 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -354,6 +354,12 @@ struct __wt_connection_stats {
int64_t cache_bytes_other;
int64_t cache_bytes_read;
int64_t cache_bytes_write;
+ int64_t cache_lookaside_cursor_wait_application;
+ int64_t cache_lookaside_cursor_wait_internal;
+ int64_t cache_lookaside_score;
+ int64_t cache_lookaside_entries;
+ int64_t cache_lookaside_insert;
+ int64_t cache_lookaside_remove;
int64_t cache_eviction_checkpoint;
int64_t cache_eviction_get_ref;
int64_t cache_eviction_get_ref_empty;
@@ -398,14 +404,11 @@ struct __wt_connection_stats {
int64_t cache_eviction_internal;
int64_t cache_eviction_split_internal;
int64_t cache_eviction_split_leaf;
- int64_t cache_lookaside_score;
- int64_t cache_lookaside_entries;
- int64_t cache_lookaside_insert;
- int64_t cache_lookaside_remove;
int64_t cache_bytes_max;
int64_t cache_eviction_maximum_page_size;
int64_t cache_eviction_dirty;
int64_t cache_eviction_app_dirty;
+ int64_t cache_timed_out_ops;
int64_t cache_read_overflow;
int64_t cache_eviction_deepen;
int64_t cache_write_lookaside;
@@ -455,11 +458,11 @@ struct __wt_connection_stats {
int64_t cursor_insert;
int64_t cursor_modify;
int64_t cursor_next;
+ int64_t cursor_restart;
int64_t cursor_prev;
int64_t cursor_remove;
int64_t cursor_reserve;
int64_t cursor_reset;
- int64_t cursor_restart;
int64_t cursor_search;
int64_t cursor_search_near;
int64_t cursor_sweep_buckets;
@@ -583,6 +586,7 @@ struct __wt_connection_stats {
int64_t rec_split_stashed_objects;
int64_t session_cursor_open;
int64_t session_open;
+ int64_t session_query_ts;
int64_t session_table_alter_fail;
int64_t session_table_alter_success;
int64_t session_table_alter_skip;
@@ -620,8 +624,9 @@ struct __wt_connection_stats {
int64_t page_sleep;
int64_t page_del_rollback_blocked;
int64_t child_modify_blocked_page;
+ int64_t txn_commit_queue_walked;
int64_t txn_commit_queue_empty;
- int64_t txn_commit_queue_tail;
+ int64_t txn_commit_queue_head;
int64_t txn_commit_queue_inserts;
int64_t txn_commit_queue_len;
int64_t txn_snapshots_created;
@@ -631,6 +636,7 @@ struct __wt_connection_stats {
int64_t txn_prepare_active;
int64_t txn_prepare_rollback;
int64_t txn_query_ts;
+ int64_t txn_read_queue_walked;
int64_t txn_read_queue_empty;
int64_t txn_read_queue_head;
int64_t txn_read_queue_inserts;
@@ -663,6 +669,7 @@ struct __wt_connection_stats {
int64_t txn_pinned_checkpoint_range;
int64_t txn_pinned_snapshot_range;
int64_t txn_pinned_timestamp;
+ int64_t txn_pinned_timestamp_checkpoint;
int64_t txn_pinned_timestamp_oldest;
int64_t txn_sync;
int64_t txn_commit;
@@ -783,6 +790,7 @@ struct __wt_dsrc_stats {
int64_t compress_raw_ok;
int64_t cursor_insert_bulk;
int64_t cursor_create;
+ int64_t cursor_restart;
int64_t cursor_insert_bytes;
int64_t cursor_remove_bytes;
int64_t cursor_update_bytes;
@@ -795,7 +803,6 @@ struct __wt_dsrc_stats {
int64_t cursor_remove;
int64_t cursor_reserve;
int64_t cursor_reset;
- int64_t cursor_restart;
int64_t cursor_search;
int64_t cursor_search_near;
int64_t cursor_truncate;
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 32234dca23e..6a0c3edaa13 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -147,7 +147,7 @@ struct __wt_txn_global {
volatile bool checkpoint_running; /* Checkpoint running */
volatile uint32_t checkpoint_id; /* Checkpoint's session ID */
WT_TXN_STATE checkpoint_state; /* Checkpoint's txn state */
- WT_TXN *checkpoint_txn; /* Checkpoint's txn structure */
+ WT_DECL_TIMESTAMP(checkpoint_timestamp) /* Checkpoint's timestamp */
volatile uint64_t metadata_pinned; /* Oldest ID for metadata */
@@ -172,18 +172,36 @@ typedef enum __wt_txn_isolation {
* records during commit or undo the operations during rollback.
*/
struct __wt_txn_op {
- uint32_t fileid;
+ WT_BTREE *btree;
enum {
- WT_TXN_OP_NONE,
- WT_TXN_OP_BASIC,
- WT_TXN_OP_INMEM,
+ WT_TXN_OP_NONE=0,
+ WT_TXN_OP_BASIC_COL,
+ WT_TXN_OP_BASIC_ROW,
+ WT_TXN_OP_INMEM_COL,
+ WT_TXN_OP_INMEM_ROW,
WT_TXN_OP_REF_DELETE,
WT_TXN_OP_TRUNCATE_COL,
WT_TXN_OP_TRUNCATE_ROW
} type;
union {
- /* WT_TXN_OP_BASIC, WT_TXN_OP_INMEM */
- WT_UPDATE *upd;
+ /* WT_TXN_OP_BASIC_ROW, WT_TXN_OP_INMEM_ROW */
+ struct {
+ WT_UPDATE *upd;
+ WT_ITEM key;
+ } op_row;
+
+ /* WT_TXN_OP_BASIC_COL, WT_TXN_OP_INMEM_COL */
+ struct {
+ WT_UPDATE *upd;
+ uint64_t recno;
+ } op_col;
+/*
+ * upd is pointing to same memory in both op_row and op_col, so for simplicity
+ * just chose op_row upd
+ */
+#undef op_upd
+#define op_upd op_row.upd
+
/* WT_TXN_OP_REF_DELETE */
WT_REF *ref;
/* WT_TXN_OP_TRUNCATE_COL */
@@ -250,7 +268,8 @@ struct __wt_txn {
TAILQ_ENTRY(__wt_txn) commit_timestampq;
TAILQ_ENTRY(__wt_txn) read_timestampq;
- bool clear_ts_queue; /* Set if we need to clear from the queue */
+ bool clear_commit_q; /* Set if need to clear from the commit queue */
+ bool clear_read_q; /* Set if need to clear from the read queue */
/* Array of modifications by this transaction. */
WT_TXN_OP *mod;
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 9276ca62903..7010af975c1 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -201,6 +201,7 @@ static inline int
__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
{
WT_TXN *txn;
+ WT_TXN_OP *op;
*opp = NULL;
@@ -216,9 +217,11 @@ __txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
WT_RET(__wt_realloc_def(session, &txn->mod_alloc,
txn->mod_count + 1, &txn->mod));
- *opp = &txn->mod[txn->mod_count++];
- WT_CLEAR(**opp);
- (*opp)->fileid = S2BT(session)->id;
+ op = &txn->mod[txn->mod_count++];
+ WT_CLEAR(*op);
+ op->btree = S2BT(session);
+ (void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1);
+ *opp = op;
return (0);
}
@@ -232,48 +235,122 @@ static inline void
__wt_txn_unmodify(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
+ WT_TXN_OP *op;
txn = &session->txn;
if (F_ISSET(txn, WT_TXN_HAS_ID)) {
WT_ASSERT(session, txn->mod_count > 0);
- txn->mod_count--;
+ --txn->mod_count;
+ op = txn->mod + txn->mod_count;
+ __wt_txn_op_free(session, op);
}
}
#ifdef HAVE_TIMESTAMPS
/*
- * __wt_txn_update_needs_timestamp --
+ * __wt_txn_op_commit_page_del --
+ * Make the transaction ID and timestamp updates necessary to a ref that
+ * was created by a fast delete truncate operation.
+ */
+static inline void
+__wt_txn_op_commit_page_del(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_TXN *txn;
+ WT_UPDATE **updp;
+ uint32_t previous_state;
+
+ txn = &session->txn;
+
+ /* Avoid locking the page if a previous eviction already cleaned up. */
+ if (ref->page_del->update_list == NULL)
+ return;
+
+ /*
+ * Lock the ref to ensure we don't race with eviction freeing the
+ * page deleted update list.
+ */
+ for (;; __wt_yield()) {
+ previous_state = ref->state;
+ if (previous_state != WT_REF_LOCKED &&
+ __wt_atomic_casv32(
+ &ref->state, previous_state, WT_REF_LOCKED))
+ break;
+ }
+
+ for (updp = ref->page_del->update_list;
+ updp != NULL && *updp != NULL; ++updp) {
+ __wt_timestamp_set(&(*updp)->timestamp, &txn->commit_timestamp);
+ if (F_ISSET(txn, WT_TXN_PREPARE))
+ /*
+ * Holding the ref locked means we have exclusive
+ * access, so don't need to use the prepare locked
+ * transition state.
+ */
+ (*updp)->prepare_state = WT_PREPARE_RESOLVED;
+ }
+
+ /*
+ * Publish to ensure we don't let the page be evicted and the updates
+ * discarded before being written.
+ */
+ WT_PUBLISH(ref->state, previous_state);
+}
+
+/*
+ * __wt_txn_op_set_timestamp --
* Decide whether to copy a commit timestamp into an update. If the op
* structure doesn't have a populated update or ref field or in prepared
* state there won't be any check for an existing timestamp.
*/
-static inline bool
-__wt_txn_update_needs_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
+static inline void
+__wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
{
WT_TXN *txn;
+ WT_UPDATE *upd;
wt_timestamp_t *timestamp;
txn = &session->txn;
/*
- * The timestamp is in the page deleted structure for truncates, or
- * in the update for other operations.
- */
- if (op->type == WT_TXN_OP_REF_DELETE)
- timestamp = op->u.ref == NULL || op->u.ref->page_del == NULL ?
- NULL : &op->u.ref->page_del->timestamp;
- else
- timestamp = op->u.upd == NULL ? NULL : &op->u.upd->timestamp;
-
- /*
* Updates in the metadata never get timestamps (either now or at
* commit): metadata cannot be read at a point in time, only the most
* recently committed data matches files on disk.
*/
- return (op->fileid != WT_METAFILE_ID &&
- F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
- (timestamp == NULL || __wt_timestamp_iszero(timestamp) ||
- F_ISSET(txn, WT_TXN_PREPARE)));
+ if (WT_IS_METADATA(op->btree->dhandle) ||
+ !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+ return;
+
+ if (F_ISSET(txn, WT_TXN_PREPARE)) {
+ if (op->type == WT_TXN_OP_REF_DELETE)
+ __wt_txn_op_commit_page_del(session, op->u.ref);
+ else {
+ /*
+ * In case of a prepared transaction, the order of
+ * modification of the prepare timestamp to the commit
+ * timestamp in the update chain will not affect the
+ * data visibility, a reader will encounter a prepared
+ * update resulting in prepare conflict.
+ *
+ * As updating timestamp might not be an atomic
+ * operation, we will manage using state.
+ */
+ upd = op->u.op_upd;
+ upd->prepare_state = WT_PREPARE_LOCKED;
+ WT_WRITE_BARRIER();
+ __wt_timestamp_set(
+ &upd->timestamp, &txn->commit_timestamp);
+ WT_PUBLISH(upd->prepare_state, WT_PREPARE_RESOLVED);
+ }
+ } else {
+ /*
+ * The timestamp is in the page deleted structure for
+ * truncates, or in the update for other operations.
+ */
+ timestamp = op->type == WT_TXN_OP_REF_DELETE ?
+ &op->u.ref->page_del->timestamp : &op->u.op_upd->timestamp;
+ if (__wt_timestamp_iszero(timestamp))
+ __wt_timestamp_set(timestamp, &txn->commit_timestamp);
+ }
}
#endif
@@ -282,11 +359,14 @@ __wt_txn_update_needs_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
* Mark a WT_UPDATE object modified by the current transaction.
*/
static inline int
-__wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+__wt_txn_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
+ WT_BTREE *btree;
+ WT_ITEM key;
WT_TXN *txn;
WT_TXN_OP *op;
+ btree = S2BT(session);
txn = &session->txn;
if (F_ISSET(txn, WT_TXN_READONLY))
@@ -294,14 +374,59 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
"Attempt to update in a read-only transaction");
WT_RET(__txn_next_op(session, &op));
- op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ?
- WT_TXN_OP_INMEM : WT_TXN_OP_BASIC;
+ if (F_ISSET(session, WT_SESSION_LOGGING_INMEM)) {
+ if (btree->type == BTREE_ROW)
+ op->type = WT_TXN_OP_INMEM_ROW;
+ else
+ op->type = WT_TXN_OP_INMEM_COL;
+ } else {
+ if (btree->type == BTREE_ROW)
+ op->type = WT_TXN_OP_BASIC_ROW;
+ else
+ op->type = WT_TXN_OP_BASIC_COL;
+ }
+ op->u.op_upd = upd;
+ upd->txnid = session->txn.id;
+
#ifdef HAVE_TIMESTAMPS
- if (__wt_txn_update_needs_timestamp(session, op))
- __wt_timestamp_set(&upd->timestamp, &txn->commit_timestamp);
+ __wt_txn_op_set_timestamp(session, op);
+
+ /*
+ * TODO:
+ * Following code block is under #ifdef temporarily, to avoid
+ * performance penalty. This block will be enabled, once an alternative
+ * is figured out, or we have to live with this penalty.
+ */
+#ifdef HAVE_LONG_RUNNING_PREPARE
+ /*
+ * Transaction operation with timestamp cannot be prepared.
+ * Copy the key into the transaction op structure, so the update
+ * can be evicted to lookaside, and we have a chance of finding it
+ * again. This is only possible for transactions that are in the
+ * prepared state, but we don't know at this stage if a transaction
+ * will be prepared or not.
+ */
+ if (!WT_SESSION_IS_CHECKPOINT(session) &&
+ !F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
+ !WT_IS_METADATA(op->btree->dhandle)) {
+ /*
+ * Store the key, to search the prepared update in case of
+ * prepared transaction.
+ */
+ if (btree->type == BTREE_ROW) {
+ WT_RET(__wt_cursor_get_raw_key(&cbt->iface, &key));
+ WT_RET(__wt_buf_set(session,
+ &op->u.op_row.key, key.data, key.size));
+ } else
+ op->u.op_col.recno = cbt->recno;
+ }
#endif
- op->u.upd = upd;
- upd->txnid = session->txn.id;
+
+#endif
+ WT_UNUSED(btree);
+ WT_UNUSED(cbt);
+ WT_UNUSED(key);
+
return (0);
}
@@ -321,13 +446,11 @@ __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref)
WT_RET(__txn_next_op(session, &op));
op->type = WT_TXN_OP_REF_DELETE;
-#ifdef HAVE_TIMESTAMPS
- if (__wt_txn_update_needs_timestamp(session, op))
- __wt_timestamp_set(
- &ref->page_del->timestamp, &txn->commit_timestamp);
-#endif
op->u.ref = ref;
ref->page_del->txnid = txn->id;
+#ifdef HAVE_TIMESTAMPS
+ __wt_txn_op_set_timestamp(session, op);
+#endif
WT_ERR(__wt_txn_log_op(session, NULL));
return (0);
@@ -396,6 +519,60 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
return (checkpoint_pinned);
}
+#ifdef HAVE_TIMESTAMPS
+/*
+ * __wt_txn_pinned_timestamp --
+ * Get the first timestamp that has to be kept for the current tree.
+ */
+static inline void
+__wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
+{
+ WT_BTREE *btree;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t checkpoint_ts, pinned_ts;
+ bool include_checkpoint_txn;
+
+ btree = S2BT_SAFE(session);
+ txn_global = &S2C(session)->txn_global;
+
+ WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
+ __wt_timestamp_set(&pinned_ts, &txn_global->pinned_timestamp));
+ __wt_timestamp_set(pinned_tsp, &pinned_ts);
+
+ /*
+ * Checkpoint transactions often fall behind ordinary application
+ * threads. Take special effort to not keep changes pinned in cache if
+ * they are only required for the checkpoint and it has already seen
+ * them.
+ *
+ * If there is no active checkpoint or this handle is up to date with
+ * the active checkpoint then it's safe to ignore the checkpoint ID in
+ * the visibility check.
+ */
+ include_checkpoint_txn = btree == NULL ||
+ (!F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
+ btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
+ if (!include_checkpoint_txn)
+ return;
+
+ /*
+ * The read of the timestamp pinned by a checkpoint needs to be
+ * carefully ordered: if a checkpoint is starting and we have to use
+ * the checkpoint timestamp, we take the minimum of it with the oldest
+ * timestamp, which is what we want.
+ */
+ WT_READ_BARRIER();
+
+ WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
+ __wt_timestamp_set(&checkpoint_ts,
+ &txn_global->checkpoint_timestamp));
+
+ if (!__wt_timestamp_iszero(&checkpoint_ts) &&
+ __wt_timestamp_cmp(&checkpoint_ts, &pinned_ts) < 0)
+ __wt_timestamp_set(pinned_tsp, &checkpoint_ts);
+}
+#endif
+
/*
* __txn_visible_all_id --
* Check if a given transaction ID is "globally visible". This is, if
@@ -427,8 +604,7 @@ __wt_txn_visible_all(
#ifdef HAVE_TIMESTAMPS
{
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
- int cmp;
+ wt_timestamp_t pinned_ts;
/* Timestamp check. */
if (timestamp == NULL || __wt_timestamp_iszero(timestamp))
@@ -438,20 +614,11 @@ __wt_txn_visible_all(
* If no oldest timestamp has been supplied, updates have to stay in
* cache until we are shutting down.
*/
- if (!txn_global->has_pinned_timestamp)
+ if (!S2C(session)->txn_global.has_pinned_timestamp)
return (F_ISSET(S2C(session), WT_CONN_CLOSING));
- WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
- cmp = __wt_timestamp_cmp(timestamp, &txn_global->pinned_timestamp));
-
- /*
- * We can discard updates with timestamps less than or equal to the
- * pinned timestamp. This is different to the situation for
- * transaction IDs, because we know that updates with timestamps are
- * definitely committed (and in this case, that the transaction ID is
- * globally visible).
- */
- return (cmp <= 0);
+ __wt_txn_pinned_timestamp(session, &pinned_ts);
+ return (__wt_timestamp_cmp(timestamp, &pinned_ts) <= 0);
}
#else
WT_UNUSED(timestamp);
@@ -466,6 +633,10 @@ __wt_txn_visible_all(
static inline bool
__wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
+ if (upd->prepare_state == WT_PREPARE_LOCKED ||
+ upd->prepare_state == WT_PREPARE_INPROGRESS)
+ return (false);
+
return (__wt_txn_visible_all(
session, upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp)));
}
@@ -853,12 +1024,13 @@ __wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
bool ignore_prepare_set;
txn = &session->txn;
+
if (txn->isolation != WT_ISO_SNAPSHOT)
return (0);
/*
- * Clear the ignore prepare setting of txn, as it is not supposed, to
- * affect the visibility for update operations.
+ * Always include prepared transactions in this check: they are not
+ * supposed to affect visibility for update operations.
*/
ignore_prepare_set = F_ISSET(txn, WT_TXN_IGNORE_PREPARE);
F_CLR(txn, WT_TXN_IGNORE_PREPARE);
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 2991d6f74e3..333c74b5e80 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -495,10 +495,22 @@ struct __wt_cursor {
* \c S), or raw byte arrays accessed using a WT_ITEM structure (value
* format type \c u).
*
- * Calling the WT_CURSOR::modify method outside of snapshot isolation
- * can lead to unexpected results. While \c read-committed isolation
- * is supported with the WT_CURSOR::modify method, \c read-uncommitted
- * isolation is not.
+ * The WT_CURSOR::modify method can only be called from within an
+ * explicit transaction configured at a higher isolation level than
+ * \c read-uncommitted. Using \c read-committed isolation is allowed,
+ * but requires caution: reading a value, re-positioning the cursor
+ * and then modifying the value based on the initial read could lead
+ * to unexpected results. Using \c snapshot isolation is recommended.
+ *
+ * The WT_CURSOR::modify method stores a change record in cache and
+ * writes a change record to the log instead of the usual complete
+ * values. Note that WT_CURSOR::modify is generally slower than the
+ * WT_CURSOR::update method, and can result in slower reads because
+ * the complete value must be assembled during retrieval. The
+ * WT_CURSOR::modify method is intended for applications modifying
+ * large records where there is cache or I/O pressure, that is,
+ * applications that will benefit when data updates require less cache
+ * and they write less logging information.
*
* @snippet ex_all.c Modify an existing record
*
@@ -510,12 +522,6 @@ struct __wt_cursor {
* (as it partially depends on the underlying file configuration), but
* is always a small number of bytes less than 4GB.
*
- * The WT_CURSOR::modify method stores a change record in cache and
- * writes a change record to the log, instead of the usual complete
- * value. This can reduce cache and logging requirements, but may result
- * in slower reads because the complete value must be assembled during
- * retrieval.
- *
* @param cursor the cursor handle
* @param entries an array of modification data structures
* @param nentries the number of modification data structures
@@ -602,6 +608,14 @@ struct __wt_cursor {
*
* The key must first be set and the record must already exist.
*
+ * Note that reserve works by doing a special update operation that is
+ * not logged and does not change the value of the record. This update
+ * is aborted when the enclosing transaction ends regardless of whether
+ * it commits or rolls back. Given that, reserve can only be used to
+ * detect conflicts between transactions that execute concurrently. It
+ * cannot detect all logical conflicts between transactions. For that,
+ * some update to the record must be committed.
+ *
* @snippet ex_all.c Reserve a record
*
* On success, the cursor ends positioned at the specified record; to
@@ -1080,6 +1094,9 @@ struct __wt_session {
* @config{raw, ignore the encodings for the key and value\, manage data
* as if the formats were \c "u". See @ref cursor_raw for details., a
* boolean flag; default \c false.}
+ * @config{read_once, results that are brought into cache from disk by
+ * this cursor will be given less priority in the cache., a boolean
+ * flag; default \c false.}
* @config{readonly, only query operations are supported by this cursor.
* An error is returned if a modification is attempted using the cursor.
* The default is false for all cursor types except for log and metadata
@@ -1367,6 +1384,14 @@ struct __wt_session {
* value of merge_max is used., an integer no more than 100; default \c
* 0.}
* @config{ ),,}
+ * @config{memory_page_image_max, the maximum in-memory page image
+ * represented by a single storage block. Depending on compression
+ * efficiency\, compression can create storage blocks which require
+ * significant resources to re-instantiate in the cache\, penalizing the
+ * performance of future point updates. The value limits the maximum
+ * in-memory page image a storage block will need. If set to 0\, a
+ * default of 4 times \c leaf_page_max is used., an integer greater than
+ * or equal to 0; default \c 0.}
* @config{memory_page_max, the maximum size a page can grow to in
* memory before being reconciled to disk. The specified size will be
* adjusted to a lower bound of <code>leaf_page_max</code>\, and an
@@ -1639,6 +1664,12 @@ struct __wt_session {
* the WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the
* beginning (end) of the table.
*
+ * When a range truncate is in progress, and another transaction inserts
+ * a key into that range, the behavior is not well defined - a conflict
+ * may be detected or both transactions may be permitted to commit. If
+ * they do commit, and if there is a crash and recovery runs, the result
+ * may be different than what was in cache before the crash.
+ *
* @param session the session handle
* @param name the URI of the table or file to truncate
* @param start optional cursor marking the first record discarded;
@@ -1866,6 +1897,30 @@ struct __wt_session {
int __F(timestamp_transaction)(WT_SESSION *session, const char *config);
/*!
+ * Query the session's transaction timestamp state.
+ *
+ * @param session the session handle
+ * @param[out] hex_timestamp a buffer that will be set to the
+ * hexadecimal encoding of the timestamp being queried. Must be large
+ * enough to hold a hex-encoded timestamp (i.e., double the timestamp
+ * size plus one byte for NUL termination).
+ * @configstart{WT_SESSION.query_timestamp, see dist/api_data.py}
+ * @config{get, specify which timestamp to query: \c commit returns the
+ * most recently set commit_timestamp. \c first_commit returns the
+ * first set commit_timestamp. \c prepare returns the timestamp used in
+ * preparing a transaction. \c read returns the timestamp at which the
+ * transaction is reading at. See @ref transaction_timestamps., a
+ * string\, chosen from the following options: \c "commit"\, \c
+ * "first_commit"\, \c "prepare"\, \c "read"; default \c read.}
+ * @configend
+ * @errors
+ * If the session is not in a transaction ::WT_NOTFOUND will be
+ * returned.
+ */
+ int __F(query_timestamp)(
+ WT_SESSION *session, char *hex_timestamp, const char *config);
+
+ /*!
* Write a transactionally consistent snapshot of a database or set of
* objects. In the absence of transaction timestamps, the checkpoint
* includes all transactions committed before the checkpoint starts.
@@ -2128,6 +2183,10 @@ struct __wt_connection {
* thread uses a session from the configured session_max., an integer
* between 1 and 20; default \c 2.}
* @config{ ),,}
+ * @config{cache_max_wait_ms, the maximum number of milliseconds an
+ * application thread will wait for space to be available in cache
+ * before giving up. Default will wait forever., an integer greater
+ * than or equal to 0; default \c 0.}
* @config{cache_overhead, assume the heap allocator overhead is the
* specified percentage\, and adjust the cache usage by that amount (for
* example\, if there is 10GB of data in cache\, a percentage of 10
@@ -2179,7 +2238,7 @@ struct __wt_connection {
* is a percentage of the cache size if the value is within the range of
* 0 to 100 or an absolute size when greater than 100. The value is not
* allowed to exceed the \c cache_size. Ignored if set to zero or \c
- * in_memory is \c true., an integer between 0 and 10TB; default \c 5.}
+ * in_memory is \c true., an integer between 0 and 10TB; default \c 1.}
* @config{eviction_dirty_target, perform eviction in worker threads
* when the cache contains at least this much dirty content. It is a
* percentage of the cache size if the value is within the range of 1 to
@@ -2316,15 +2375,15 @@ struct __wt_connection {
* given as a list\, such as
* <code>"verbose=[evictserver\,read]"</code>., a list\, with values
* chosen from the following options: \c "api"\, \c "block"\, \c
- * "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, \c "evict"\,
- * \c "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\,
- * \c "log"\, \c "lookaside"\, \c "lookaside_activity"\, \c "lsm"\, \c
- * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
- * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
- * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\,
- * \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c
- * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default
- * empty.}
+ * "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, \c
+ * "error_returns"\, \c "evict"\, \c "evict_stuck"\, \c "evictserver"\,
+ * \c "fileops"\, \c "handleops"\, \c "log"\, \c "lookaside"\, \c
+ * "lookaside_activity"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\,
+ * \c "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c
+ * "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c "salvage"\,
+ * \c "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\,
+ * \c "timestamp"\, \c "transaction"\, \c "verify"\, \c "version"\, \c
+ * "write"; default empty.}
* @configend
* @errors
*/
@@ -2428,14 +2487,15 @@ struct __wt_connection {
* @config{get, specify which timestamp to query: \c all_committed
* returns the largest timestamp such that all timestamps up to that
* value have committed\, \c oldest returns the most recent \c
- * oldest_timestamp set with WT_CONNECTION::set_timestamp\, \c pinned
- * returns the minimum of the \c oldest_timestamp and the read
- * timestamps of all active readers\, and \c stable returns the most
- * recent \c stable_timestamp set with WT_CONNECTION::set_timestamp.
- * See @ref transaction_timestamps., a string\, chosen from the
- * following options: \c "all_committed"\, \c "last_checkpoint"\, \c
- * "oldest"\, \c "pinned"\, \c "recovery"\, \c "stable"; default \c
- * all_committed.}
+ * oldest_timestamp set with WT_CONNECTION::set_timestamp\, \c
+ * oldest_reader returns the minimum of the read timestamps of all
+ * active readers \c pinned returns the minimum of the\c
+ * oldest_timestamp and the read timestamps of all active readers\, and
+ * \c stable returns the most recent \c stable_timestamp set with
+ * WT_CONNECTION::set_timestamp. See @ref transaction_timestamps., a
+ * string\, chosen from the following options: \c "all_committed"\, \c
+ * "last_checkpoint"\, \c "oldest"\, \c "oldest_reader"\, \c "pinned"\,
+ * \c "recovery"\, \c "stable"; default \c all_committed.}
* @configend
* @errors
* If there is no matching timestamp (e.g., if this method is called
@@ -2708,6 +2768,10 @@ struct __wt_connection {
* default value for any sessions created\, and can be overridden in configuring
* \c cache_cursors in WT_CONNECTION.open_session., a boolean flag; default \c
* true.}
+ * @config{cache_max_wait_ms, the maximum number of milliseconds an application
+ * thread will wait for space to be available in cache before giving up.
+ * Default will wait forever., an integer greater than or equal to 0; default \c
+ * 0.}
* @config{cache_overhead, assume the heap allocator overhead is the specified
* percentage\, and adjust the cache usage by that amount (for example\, if
* there is 10GB of data in cache\, a percentage of 10 means WiredTiger treats
@@ -2808,7 +2872,7 @@ struct __wt_connection {
* percentage of the cache size if the value is within the range of 0 to 100 or
* an absolute size when greater than 100. The value is not allowed to exceed
* the \c cache_size. Ignored if set to zero or \c in_memory is \c true., an
- * integer between 0 and 10TB; default \c 5.}
+ * integer between 0 and 10TB; default \c 1.}
* @config{eviction_dirty_target, perform eviction in worker threads when the
* cache contains at least this much dirty content. It is a percentage of the
* cache size if the value is within the range of 1 to 100 or an absolute size
@@ -2840,8 +2904,12 @@ struct __wt_connection {
* @config{file_extend, file extension configuration. If set\, extend files of
* the set type in allocations of the set size\, instead of a block at a time as
* each new block is written. For example\,
- * <code>file_extend=(data=16MB)</code>., a list\, with values chosen from the
- * following options: \c "data"\, \c "log"; default empty.}
+ * <code>file_extend=(data=16MB)</code>. If set to 0\, disable the file
+ * extension for the set type. For log files\, the allowed range is between
+ * 100KB and 2GB; values larger than the configured maximum log size and the
+ * default config would extend log files in allocations of the maximum log file
+ * size., a list\, with values chosen from the following options: \c "data"\, \c
+ * "log"; default empty.}
* @config{file_manager = (, control how file handles are managed., a set of
* related configuration options defined below.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;close_handle_minimum, number of handles open
@@ -2914,6 +2982,12 @@ struct __wt_connection {
* @config{readonly, open connection in read-only mode. The database must
* exist. All methods that may modify a database are disabled. See @ref
* readonly for more information., a boolean flag; default \c false.}
+ * @config{salvage, open connection and salvage any WiredTiger-owned database
+ * and log files that it detects as corrupted. This API should only be used
+ * after getting an error return of WT_TRY_SALVAGE. Salvage rebuilds files in
+ * place\, overwriting existing files. We recommend making a backup copy of all
+ * files with the WiredTiger prefix prior to passing this flag., a boolean flag;
+ * default \c false.}
* @config{session_max, maximum expected number of sessions (including server
* threads)., an integer greater than or equal to 1; default \c 100.}
* @config{shared_cache = (, shared cache configuration options. A database
@@ -2995,14 +3069,14 @@ struct __wt_connection {
* @config{verbose, enable messages for various events. Options are given as a
* list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with
* values chosen from the following options: \c "api"\, \c "block"\, \c
- * "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, \c "evict"\, \c
- * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\,
- * \c "lookaside"\, \c "lookaside_activity"\, \c "lsm"\, \c "lsm_manager"\, \c
- * "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c
- * "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c
- * "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\, \c
- * "timestamp"\, \c "transaction"\, \c "verify"\, \c "version"\, \c "write";
- * default empty.}
+ * "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, \c "error_returns"\,
+ * \c "evict"\, \c "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c
+ * "handleops"\, \c "log"\, \c "lookaside"\, \c "lookaside_activity"\, \c
+ * "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
+ * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
+ * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c
+ * "temporary"\, \c "thread_group"\, \c "timestamp"\, \c "transaction"\, \c
+ * "verify"\, \c "version"\, \c "write"; default empty.}
* @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to
* files. Ignored on non-Windows systems. Options are given as a list\, such
* as <code>"write_through=[data]"</code>. Configuring \c write_through requires
@@ -3527,18 +3601,17 @@ struct __wt_config_parser {
*/
/*!
- * Return a buffer's CRC32C checksum.
+ * Return a pointer to a function that calculates a CRC32C checksum.
*
* The WiredTiger library CRC32C checksum function uses hardware support where
* available, else it falls back to a software implementation.
*
* @snippet ex_all.c Checksum a buffer
*
- * @param buffer a pointer to a buffer
- * @param len the number of valid bytes in the buffer
- * @returns the buffer's CRC32C checksum
+ * @returns a pointer to a function that takes a buffer and length and returns
+ * the CRC32C checksum
*/
-uint32_t wiredtiger_checksum_crc32c(const void *buffer, size_t len)
+uint32_t (*wiredtiger_crc32c_func(void))(const void *, size_t)
WT_ATTRIBUTE_LIBRARY_VISIBLE;
/*! @} */
@@ -3640,6 +3713,13 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp)
* state.
*/
#define WT_PREPARE_CONFLICT (-31808)
+/*!
+ * Database corruption detected.
+ * This error is generated when corruption is detected in an on-disk file. The
+ * application may choose to salvage the file or retry wiredtiger_open with the
+ * 'salvage=true' configuration setting.
+ */
+#define WT_TRY_SALVAGE (-31809)
/*
* Error return section: END
* DO NOT EDIT: automatically built by dist/api_err.py.
@@ -4974,7 +5054,7 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_WRITE_APP_TIME 1034
/*! cache: bytes belonging to page images in the cache */
#define WT_STAT_CONN_CACHE_BYTES_IMAGE 1035
-/*! cache: bytes belonging to the lookaside table in the cache */
+/*! cache: bytes belonging to the cache overflow table in the cache */
#define WT_STAT_CONN_CACHE_BYTES_LOOKASIDE 1036
/*! cache: bytes currently in the cache */
#define WT_STAT_CONN_CACHE_BYTES_INUSE 1037
@@ -4984,715 +5064,711 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_BYTES_READ 1039
/*! cache: bytes written from cache */
#define WT_STAT_CONN_CACHE_BYTES_WRITE 1040
+/*! cache: cache overflow cursor application thread wait time (usecs) */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_CURSOR_WAIT_APPLICATION 1041
+/*! cache: cache overflow cursor internal thread wait time (usecs) */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_CURSOR_WAIT_INTERNAL 1042
+/*! cache: cache overflow score */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_SCORE 1043
+/*! cache: cache overflow table entries */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_ENTRIES 1044
+/*! cache: cache overflow table insert calls */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1045
+/*! cache: cache overflow table remove calls */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1046
/*! cache: checkpoint blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1041
+#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1047
/*! cache: eviction calls to get a page */
-#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1042
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1048
/*! cache: eviction calls to get a page found queue empty */
-#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1043
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1049
/*! cache: eviction calls to get a page found queue empty after locking */
-#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1044
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1050
/*! cache: eviction currently operating in aggressive mode */
-#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1045
+#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1051
/*! cache: eviction empty score */
-#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1046
+#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1052
/*! cache: eviction passes of a file */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_PASSES 1047
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_PASSES 1053
/*! cache: eviction server candidate queue empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1048
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1054
/*! cache: eviction server candidate queue not empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1049
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1055
/*! cache: eviction server evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1050
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1056
/*!
* cache: eviction server slept, because we did not make progress with
* eviction
*/
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1051
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1057
/*! cache: eviction server unable to reach eviction goal */
-#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1052
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1058
/*! cache: eviction state */
-#define WT_STAT_CONN_CACHE_EVICTION_STATE 1053
+#define WT_STAT_CONN_CACHE_EVICTION_STATE 1059
/*! cache: eviction walk target pages histogram - 0-9 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1054
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1060
/*! cache: eviction walk target pages histogram - 10-31 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1055
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1061
/*! cache: eviction walk target pages histogram - 128 and higher */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1056
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1062
/*! cache: eviction walk target pages histogram - 32-63 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1057
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1063
/*! cache: eviction walk target pages histogram - 64-128 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1058
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1064
/*! cache: eviction walks abandoned */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1059
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1065
/*! cache: eviction walks gave up because they restarted their walk twice */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1060
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1066
/*!
* cache: eviction walks gave up because they saw too many pages and
* found no candidates
*/
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1061
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1067
/*!
* cache: eviction walks gave up because they saw too many pages and
* found too few candidates
*/
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1062
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1068
/*! cache: eviction walks reached end of tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1063
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1069
/*! cache: eviction walks started from root of tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1064
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1070
/*! cache: eviction walks started from saved location in tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1065
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1071
/*! cache: eviction worker thread active */
-#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1066
+#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1072
/*! cache: eviction worker thread created */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1067
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1073
/*! cache: eviction worker thread evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1068
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1074
/*! cache: eviction worker thread removed */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1069
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1075
/*! cache: eviction worker thread stable number */
-#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1070
+#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1076
/*!
* cache: failed eviction of pages that exceeded the in-memory maximum
* count
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1071
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1077
/*!
* cache: failed eviction of pages that exceeded the in-memory maximum
* time (usecs)
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL_TIME 1072
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL_TIME 1078
/*! cache: files with active eviction walks */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1073
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1079
/*! cache: files with new eviction walks started */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1074
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1080
/*! cache: force re-tuning of eviction workers once in a while */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_RETUNE 1075
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_RETUNE 1081
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1076
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1082
/*! cache: hazard pointer check calls */
-#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1077
+#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1083
/*! cache: hazard pointer check entries walked */
-#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1078
+#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1084
/*! cache: hazard pointer maximum array length */
-#define WT_STAT_CONN_CACHE_HAZARD_MAX 1079
+#define WT_STAT_CONN_CACHE_HAZARD_MAX 1085
/*! cache: in-memory page passed criteria to be split */
-#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1080
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1086
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1081
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1087
/*! cache: internal pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1082
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1088
/*! cache: internal pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1083
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1089
/*! cache: leaf pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1084
-/*! cache: lookaside score */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_SCORE 1085
-/*! cache: lookaside table entries */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_ENTRIES 1086
-/*! cache: lookaside table insert calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1087
-/*! cache: lookaside table remove calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1088
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1090
/*! cache: maximum bytes configured */
-#define WT_STAT_CONN_CACHE_BYTES_MAX 1089
+#define WT_STAT_CONN_CACHE_BYTES_MAX 1091
/*! cache: maximum page size at eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1090
+#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1092
/*! cache: modified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1091
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1093
/*! cache: modified pages evicted by application threads */
-#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1092
+#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1094
+/*! cache: operations timed out waiting for space in cache */
+#define WT_STAT_CONN_CACHE_TIMED_OUT_OPS 1095
/*! cache: overflow pages read into cache */
-#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1093
+#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1096
/*! cache: page split during eviction deepened the tree */
-#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1094
-/*! cache: page written requiring lookaside records */
-#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1095
+#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1097
+/*! cache: page written requiring cache overflow records */
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1098
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1096
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1099
/*! cache: pages evicted because they exceeded the in-memory maximum count */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1097
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1100
/*!
* cache: pages evicted because they exceeded the in-memory maximum time
* (usecs)
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1098
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1101
/*! cache: pages evicted because they had chains of deleted items count */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1099
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1102
/*!
* cache: pages evicted because they had chains of deleted items time
* (usecs)
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1100
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1103
/*! cache: pages evicted by application threads */
-#define WT_STAT_CONN_CACHE_EVICTION_APP 1101
+#define WT_STAT_CONN_CACHE_EVICTION_APP 1104
/*! cache: pages queued for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1102
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1105
/*! cache: pages queued for urgent eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1103
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1106
/*! cache: pages queued for urgent eviction during walk */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1104
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1107
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1105
+#define WT_STAT_CONN_CACHE_READ 1108
/*! cache: pages read into cache after truncate */
-#define WT_STAT_CONN_CACHE_READ_DELETED 1106
+#define WT_STAT_CONN_CACHE_READ_DELETED 1109
/*! cache: pages read into cache after truncate in prepare state */
-#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1107
-/*! cache: pages read into cache requiring lookaside entries */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1108
-/*! cache: pages read into cache requiring lookaside for checkpoint */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_CHECKPOINT 1109
-/*! cache: pages read into cache skipping older lookaside entries */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1110
+#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1110
+/*! cache: pages read into cache requiring cache overflow entries */
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1111
+/*! cache: pages read into cache requiring cache overflow for checkpoint */
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_CHECKPOINT 1112
+/*! cache: pages read into cache skipping older cache overflow entries */
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1113
/*!
- * cache: pages read into cache with skipped lookaside entries needed
- * later
+ * cache: pages read into cache with skipped cache overflow entries
+ * needed later
*/
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1111
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1114
/*!
- * cache: pages read into cache with skipped lookaside entries needed
- * later by checkpoint
+ * cache: pages read into cache with skipped cache overflow entries
+ * needed later by checkpoint
*/
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY_CHECKPOINT 1112
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY_CHECKPOINT 1115
/*! cache: pages requested from the cache */
-#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1113
+#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1116
/*! cache: pages seen by eviction walk */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1114
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1117
/*! cache: pages selected for eviction unable to be evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1115
+#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1118
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1116
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1119
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1117
+#define WT_STAT_CONN_CACHE_WRITE 1120
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1118
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1121
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1119
+#define WT_STAT_CONN_CACHE_OVERHEAD 1122
/*! cache: tracked bytes belonging to internal pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1120
+#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1123
/*! cache: tracked bytes belonging to leaf pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_LEAF 1121
+#define WT_STAT_CONN_CACHE_BYTES_LEAF 1124
/*! cache: tracked dirty bytes in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1122
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1125
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1123
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1126
/*! cache: unmodified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1124
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1127
/*! connection: auto adjusting condition resets */
-#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1125
+#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1128
/*! connection: auto adjusting condition wait calls */
-#define WT_STAT_CONN_COND_AUTO_WAIT 1126
+#define WT_STAT_CONN_COND_AUTO_WAIT 1129
/*! connection: detected system time went backwards */
-#define WT_STAT_CONN_TIME_TRAVEL 1127
+#define WT_STAT_CONN_TIME_TRAVEL 1130
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1128
+#define WT_STAT_CONN_FILE_OPEN 1131
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1129
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1132
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1130
+#define WT_STAT_CONN_MEMORY_FREE 1133
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1131
+#define WT_STAT_CONN_MEMORY_GROW 1134
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1132
+#define WT_STAT_CONN_COND_WAIT 1135
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1133
+#define WT_STAT_CONN_RWLOCK_READ 1136
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1134
+#define WT_STAT_CONN_RWLOCK_WRITE 1137
/*! connection: total fsync I/Os */
-#define WT_STAT_CONN_FSYNC_IO 1135
+#define WT_STAT_CONN_FSYNC_IO 1138
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1136
+#define WT_STAT_CONN_READ_IO 1139
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1137
+#define WT_STAT_CONN_WRITE_IO 1140
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1138
+#define WT_STAT_CONN_CURSOR_CREATE 1141
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1139
+#define WT_STAT_CONN_CURSOR_INSERT 1142
/*! cursor: cursor modify calls */
-#define WT_STAT_CONN_CURSOR_MODIFY 1140
+#define WT_STAT_CONN_CURSOR_MODIFY 1143
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1141
+#define WT_STAT_CONN_CURSOR_NEXT 1144
+/*! cursor: cursor operation restarted */
+#define WT_STAT_CONN_CURSOR_RESTART 1145
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1142
+#define WT_STAT_CONN_CURSOR_PREV 1146
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1143
+#define WT_STAT_CONN_CURSOR_REMOVE 1147
/*! cursor: cursor reserve calls */
-#define WT_STAT_CONN_CURSOR_RESERVE 1144
+#define WT_STAT_CONN_CURSOR_RESERVE 1148
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1145
-/*! cursor: cursor restarted searches */
-#define WT_STAT_CONN_CURSOR_RESTART 1146
+#define WT_STAT_CONN_CURSOR_RESET 1149
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1147
+#define WT_STAT_CONN_CURSOR_SEARCH 1150
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1148
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1151
/*! cursor: cursor sweep buckets */
-#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1149
+#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1152
/*! cursor: cursor sweep cursors closed */
-#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1150
+#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1153
/*! cursor: cursor sweep cursors examined */
-#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1151
+#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1154
/*! cursor: cursor sweeps */
-#define WT_STAT_CONN_CURSOR_SWEEP 1152
+#define WT_STAT_CONN_CURSOR_SWEEP 1155
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1153
+#define WT_STAT_CONN_CURSOR_UPDATE 1156
/*! cursor: cursors cached on close */
-#define WT_STAT_CONN_CURSOR_CACHE 1154
+#define WT_STAT_CONN_CURSOR_CACHE 1157
/*! cursor: cursors reused from cache */
-#define WT_STAT_CONN_CURSOR_REOPEN 1155
+#define WT_STAT_CONN_CURSOR_REOPEN 1158
/*! cursor: truncate calls */
-#define WT_STAT_CONN_CURSOR_TRUNCATE 1156
+#define WT_STAT_CONN_CURSOR_TRUNCATE 1159
/*! data-handle: connection data handles currently active */
-#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1157
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1160
/*! data-handle: connection sweep candidate became referenced */
-#define WT_STAT_CONN_DH_SWEEP_REF 1158
+#define WT_STAT_CONN_DH_SWEEP_REF 1161
/*! data-handle: connection sweep dhandles closed */
-#define WT_STAT_CONN_DH_SWEEP_CLOSE 1159
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1162
/*! data-handle: connection sweep dhandles removed from hash list */
-#define WT_STAT_CONN_DH_SWEEP_REMOVE 1160
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1163
/*! data-handle: connection sweep time-of-death sets */
-#define WT_STAT_CONN_DH_SWEEP_TOD 1161
+#define WT_STAT_CONN_DH_SWEEP_TOD 1164
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_SWEEPS 1162
+#define WT_STAT_CONN_DH_SWEEPS 1165
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1163
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1166
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1164
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1167
/*! lock: checkpoint lock acquisitions */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1165
+#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1168
/*! lock: checkpoint lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1166
+#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1169
/*! lock: checkpoint lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1167
+#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1170
/*!
- * lock: commit timestamp queue lock application thread time waiting for
- * the dhandle lock (usecs)
- */
-#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1168
-/*!
- * lock: commit timestamp queue lock internal thread time waiting for the
- * dhandle lock (usecs)
+ * lock: commit timestamp queue lock application thread time waiting
+ * (usecs)
*/
-#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1169
+#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1171
+/*! lock: commit timestamp queue lock internal thread time waiting (usecs) */
+#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1172
/*! lock: commit timestamp queue read lock acquisitions */
-#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1170
+#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1173
/*! lock: commit timestamp queue write lock acquisitions */
-#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1171
-/*!
- * lock: dhandle lock application thread time waiting for the dhandle
- * lock (usecs)
- */
-#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1172
-/*!
- * lock: dhandle lock internal thread time waiting for the dhandle lock
- * (usecs)
- */
-#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1173
+#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1174
+/*! lock: dhandle lock application thread time waiting (usecs) */
+#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1175
+/*! lock: dhandle lock internal thread time waiting (usecs) */
+#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1176
/*! lock: dhandle read lock acquisitions */
-#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1174
+#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1177
/*! lock: dhandle write lock acquisitions */
-#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1175
+#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1178
/*! lock: metadata lock acquisitions */
-#define WT_STAT_CONN_LOCK_METADATA_COUNT 1176
+#define WT_STAT_CONN_LOCK_METADATA_COUNT 1179
/*! lock: metadata lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1177
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1180
/*! lock: metadata lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1178
-/*!
- * lock: read timestamp queue lock application thread time waiting for
- * the dhandle lock (usecs)
- */
-#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1179
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1181
/*!
- * lock: read timestamp queue lock internal thread time waiting for the
- * dhandle lock (usecs)
+ * lock: read timestamp queue lock application thread time waiting
+ * (usecs)
*/
-#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1180
+#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1182
+/*! lock: read timestamp queue lock internal thread time waiting (usecs) */
+#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1183
/*! lock: read timestamp queue read lock acquisitions */
-#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1181
+#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1184
/*! lock: read timestamp queue write lock acquisitions */
-#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1182
+#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1185
/*! lock: schema lock acquisitions */
-#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1183
+#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1186
/*! lock: schema lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1184
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1187
/*! lock: schema lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1185
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1188
/*!
* lock: table lock application thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1186
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1189
/*!
* lock: table lock internal thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1187
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1190
/*! lock: table read lock acquisitions */
-#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1188
+#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1191
/*! lock: table write lock acquisitions */
-#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1189
-/*!
- * lock: txn global lock application thread time waiting for the dhandle
- * lock (usecs)
- */
-#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1190
-/*!
- * lock: txn global lock internal thread time waiting for the dhandle
- * lock (usecs)
- */
-#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1191
+#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1192
+/*! lock: txn global lock application thread time waiting (usecs) */
+#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1193
+/*! lock: txn global lock internal thread time waiting (usecs) */
+#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1194
/*! lock: txn global read lock acquisitions */
-#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1192
+#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1195
/*! lock: txn global write lock acquisitions */
-#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1193
+#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1196
/*! log: busy returns attempting to switch slots */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1194
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1197
/*! log: force archive time sleeping (usecs) */
-#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1195
+#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1198
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1196
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1199
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1197
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1200
/*! log: log files manually zero-filled */
-#define WT_STAT_CONN_LOG_ZERO_FILLS 1198
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1201
/*! log: log flush operations */
-#define WT_STAT_CONN_LOG_FLUSH 1199
+#define WT_STAT_CONN_LOG_FLUSH 1202
/*! log: log force write operations */
-#define WT_STAT_CONN_LOG_FORCE_WRITE 1200
+#define WT_STAT_CONN_LOG_FORCE_WRITE 1203
/*! log: log force write operations skipped */
-#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1201
+#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1204
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1202
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1205
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1203
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1206
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1204
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1207
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1205
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1208
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1206
+#define WT_STAT_CONN_LOG_SCANS 1209
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1207
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1210
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1208
+#define WT_STAT_CONN_LOG_WRITE_LSN 1211
/*! log: log server thread write LSN walk skipped */
-#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1209
+#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1212
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1210
+#define WT_STAT_CONN_LOG_SYNC 1213
/*! log: log sync time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DURATION 1211
+#define WT_STAT_CONN_LOG_SYNC_DURATION 1214
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1212
+#define WT_STAT_CONN_LOG_SYNC_DIR 1215
/*! log: log sync_dir time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1213
+#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1216
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1214
+#define WT_STAT_CONN_LOG_WRITES 1217
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1215
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1218
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1216
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1219
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1217
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1220
/*! log: pre-allocated log files not ready and missed */
-#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1218
+#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1221
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1219
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1222
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1220
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1223
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1221
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1224
/*! log: slot close lost race */
-#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1222
+#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1225
/*! log: slot close unbuffered waits */
-#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1223
+#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1226
/*! log: slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1224
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1227
/*! log: slot join atomic update races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1225
+#define WT_STAT_CONN_LOG_SLOT_RACES 1228
/*! log: slot join calls atomic updates raced */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1226
+#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1229
/*! log: slot join calls did not yield */
-#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1227
+#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1230
/*! log: slot join calls found active slot closed */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1228
+#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1231
/*! log: slot join calls slept */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1229
+#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1232
/*! log: slot join calls yielded */
-#define WT_STAT_CONN_LOG_SLOT_YIELD 1230
+#define WT_STAT_CONN_LOG_SLOT_YIELD 1233
/*! log: slot join found active slot closed */
-#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1231
+#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1234
/*! log: slot joins yield time (usecs) */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1232
+#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1235
/*! log: slot transitions unable to find free slot */
-#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1233
+#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1236
/*! log: slot unbuffered writes */
-#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1234
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1237
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1235
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1238
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1236
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1239
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1237
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1240
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1238
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1241
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1239
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1242
/*! perf: file system read latency histogram (bucket 1) - 10-49ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1240
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1243
/*! perf: file system read latency histogram (bucket 2) - 50-99ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1241
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1244
/*! perf: file system read latency histogram (bucket 3) - 100-249ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1242
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1245
/*! perf: file system read latency histogram (bucket 4) - 250-499ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1243
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1246
/*! perf: file system read latency histogram (bucket 5) - 500-999ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1244
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1247
/*! perf: file system read latency histogram (bucket 6) - 1000ms+ */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1245
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1248
/*! perf: file system write latency histogram (bucket 1) - 10-49ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1246
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1249
/*! perf: file system write latency histogram (bucket 2) - 50-99ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1247
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1250
/*! perf: file system write latency histogram (bucket 3) - 100-249ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1248
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1251
/*! perf: file system write latency histogram (bucket 4) - 250-499ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1249
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1252
/*! perf: file system write latency histogram (bucket 5) - 500-999ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1250
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1253
/*! perf: file system write latency histogram (bucket 6) - 1000ms+ */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1251
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1254
/*! perf: operation read latency histogram (bucket 1) - 100-249us */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1252
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1255
/*! perf: operation read latency histogram (bucket 2) - 250-499us */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1253
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1256
/*! perf: operation read latency histogram (bucket 3) - 500-999us */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1254
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1257
/*! perf: operation read latency histogram (bucket 4) - 1000-9999us */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1255
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1258
/*! perf: operation read latency histogram (bucket 5) - 10000us+ */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1256
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1259
/*! perf: operation write latency histogram (bucket 1) - 100-249us */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1257
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1260
/*! perf: operation write latency histogram (bucket 2) - 250-499us */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1258
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1261
/*! perf: operation write latency histogram (bucket 3) - 500-999us */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1259
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1262
/*! perf: operation write latency histogram (bucket 4) - 1000-9999us */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1260
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1263
/*! perf: operation write latency histogram (bucket 5) - 10000us+ */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1261
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1264
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1262
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1265
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1263
+#define WT_STAT_CONN_REC_PAGES 1266
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1264
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1267
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1265
+#define WT_STAT_CONN_REC_PAGE_DELETE 1268
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1266
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1269
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1267
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1270
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1268
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1271
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1269
+#define WT_STAT_CONN_SESSION_OPEN 1272
+/*! session: session query timestamp calls */
+#define WT_STAT_CONN_SESSION_QUERY_TS 1273
/*! session: table alter failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1270
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1274
/*! session: table alter successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1271
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1275
/*! session: table alter unchanged and skipped */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1272
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1276
/*! session: table compact failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1273
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1277
/*! session: table compact successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1274
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1278
/*! session: table create failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1275
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1279
/*! session: table create successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1276
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1280
/*! session: table drop failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1277
+#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1281
/*! session: table drop successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1278
+#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1282
/*! session: table rebalance failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1279
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1283
/*! session: table rebalance successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1280
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1284
/*! session: table rename failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1281
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1285
/*! session: table rename successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1282
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1286
/*! session: table salvage failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1283
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1287
/*! session: table salvage successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1284
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1288
/*! session: table truncate failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1285
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1289
/*! session: table truncate successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1286
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1290
/*! session: table verify failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1287
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1291
/*! session: table verify successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1288
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1292
/*! thread-state: active filesystem fsync calls */
-#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1289
+#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1293
/*! thread-state: active filesystem read calls */
-#define WT_STAT_CONN_THREAD_READ_ACTIVE 1290
+#define WT_STAT_CONN_THREAD_READ_ACTIVE 1294
/*! thread-state: active filesystem write calls */
-#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1291
+#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1295
/*! thread-yield: application thread time evicting (usecs) */
-#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1292
+#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1296
/*! thread-yield: application thread time waiting for cache (usecs) */
-#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1293
+#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1297
/*!
* thread-yield: connection close blocked waiting for transaction state
* stabilization
*/
-#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1294
+#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1298
/*! thread-yield: connection close yielded for lsm manager shutdown */
-#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1295
+#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1299
/*! thread-yield: data handle lock yielded */
-#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1296
+#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1300
/*!
* thread-yield: get reference for page index and slot time sleeping
* (usecs)
*/
-#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1297
+#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1301
/*! thread-yield: log server sync yielded for log write */
-#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1298
+#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1302
/*! thread-yield: page access yielded due to prepare state change */
-#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1299
+#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1303
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1300
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1304
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1301
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1305
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1302
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1306
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1303
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1307
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1304
+#define WT_STAT_CONN_PAGE_SLEEP 1308
/*!
* thread-yield: page delete rollback time sleeping for state change
* (usecs)
*/
-#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1305
+#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1309
/*! thread-yield: page reconciliation yielded due to child modification */
-#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1306
+#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1310
+/*! transaction: commit timestamp queue entries walked */
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_WALKED 1311
/*! transaction: commit timestamp queue insert to empty */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1307
-/*! transaction: commit timestamp queue inserts to tail */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_TAIL 1308
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1312
+/*! transaction: commit timestamp queue inserts to head */
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1313
/*! transaction: commit timestamp queue inserts total */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1309
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1314
/*! transaction: commit timestamp queue length */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1310
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1315
/*! transaction: number of named snapshots created */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1311
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1316
/*! transaction: number of named snapshots dropped */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1312
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1317
/*! transaction: prepared transactions */
-#define WT_STAT_CONN_TXN_PREPARE 1313
+#define WT_STAT_CONN_TXN_PREPARE 1318
/*! transaction: prepared transactions committed */
-#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1314
+#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1319
/*! transaction: prepared transactions currently active */
-#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1315
+#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1320
/*! transaction: prepared transactions rolled back */
-#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1316
+#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1321
/*! transaction: query timestamp calls */
-#define WT_STAT_CONN_TXN_QUERY_TS 1317
+#define WT_STAT_CONN_TXN_QUERY_TS 1322
+/*! transaction: read timestamp queue entries walked */
+#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1323
/*! transaction: read timestamp queue insert to empty */
-#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1318
+#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1324
/*! transaction: read timestamp queue inserts to head */
-#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1319
+#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1325
/*! transaction: read timestamp queue inserts total */
-#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1320
+#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1326
/*! transaction: read timestamp queue length */
-#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1321
+#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1327
/*! transaction: rollback to stable calls */
-#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1322
+#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1328
/*! transaction: rollback to stable updates aborted */
-#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1323
-/*! transaction: rollback to stable updates removed from lookaside */
-#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1324
+#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1329
+/*! transaction: rollback to stable updates removed from cache overflow */
+#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1330
/*! transaction: set timestamp calls */
-#define WT_STAT_CONN_TXN_SET_TS 1325
+#define WT_STAT_CONN_TXN_SET_TS 1331
/*! transaction: set timestamp commit calls */
-#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1326
+#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1332
/*! transaction: set timestamp commit updates */
-#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1327
+#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1333
/*! transaction: set timestamp oldest calls */
-#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1328
+#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1334
/*! transaction: set timestamp oldest updates */
-#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1329
+#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1335
/*! transaction: set timestamp stable calls */
-#define WT_STAT_CONN_TXN_SET_TS_STABLE 1330
+#define WT_STAT_CONN_TXN_SET_TS_STABLE 1336
/*! transaction: set timestamp stable updates */
-#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1331
+#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1337
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1332
+#define WT_STAT_CONN_TXN_BEGIN 1338
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1333
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1339
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1334
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1340
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1335
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1341
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1336
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1342
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1337
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1343
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1338
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1344
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1339
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1345
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1340
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1346
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1341
+#define WT_STAT_CONN_TXN_CHECKPOINT 1347
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1342
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1348
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1343
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1349
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1344
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1350
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1345
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1351
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1346
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1352
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1347
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1353
/*!
* transaction: transaction range of IDs currently pinned by named
* snapshots
*/
-#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1348
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1354
/*! transaction: transaction range of timestamps currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1349
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1355
+/*! transaction: transaction range of timestamps pinned by a checkpoint */
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1356
/*!
* transaction: transaction range of timestamps pinned by the oldest
* timestamp
*/
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1350
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1357
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1351
+#define WT_STAT_CONN_TXN_SYNC 1358
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1352
+#define WT_STAT_CONN_TXN_COMMIT 1359
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1353
+#define WT_STAT_CONN_TXN_ROLLBACK 1360
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1354
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1361
/*!
* @}
@@ -5870,7 +5946,7 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2065
/*! cache: page split during eviction deepened the tree */
#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2066
-/*! cache: page written requiring lookaside records */
+/*! cache: page written requiring cache overflow records */
#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2067
/*! cache: pages read into cache */
#define WT_STAT_DSRC_CACHE_READ 2068
@@ -5878,7 +5954,7 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_CACHE_READ_DELETED 2069
/*! cache: pages read into cache after truncate in prepare state */
#define WT_STAT_DSRC_CACHE_READ_DELETED_PREPARED 2070
-/*! cache: pages read into cache requiring lookaside entries */
+/*! cache: pages read into cache requiring cache overflow entries */
#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2071
/*! cache: pages requested from the cache */
#define WT_STAT_DSRC_CACHE_PAGES_REQUESTED 2072
@@ -6019,32 +6095,32 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2106
/*! cursor: create calls */
#define WT_STAT_DSRC_CURSOR_CREATE 2107
+/*! cursor: cursor operation restarted */
+#define WT_STAT_DSRC_CURSOR_RESTART 2108
/*! cursor: cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2108
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2109
/*! cursor: cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2109
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2110
/*! cursor: cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2110
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2111
/*! cursor: cursors cached on close */
-#define WT_STAT_DSRC_CURSOR_CACHE 2111
+#define WT_STAT_DSRC_CURSOR_CACHE 2112
/*! cursor: cursors reused from cache */
-#define WT_STAT_DSRC_CURSOR_REOPEN 2112
+#define WT_STAT_DSRC_CURSOR_REOPEN 2113
/*! cursor: insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 2113
+#define WT_STAT_DSRC_CURSOR_INSERT 2114
/*! cursor: modify calls */
-#define WT_STAT_DSRC_CURSOR_MODIFY 2114
+#define WT_STAT_DSRC_CURSOR_MODIFY 2115
/*! cursor: next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 2115
+#define WT_STAT_DSRC_CURSOR_NEXT 2116
/*! cursor: prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 2116
+#define WT_STAT_DSRC_CURSOR_PREV 2117
/*! cursor: remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 2117
+#define WT_STAT_DSRC_CURSOR_REMOVE 2118
/*! cursor: reserve calls */
-#define WT_STAT_DSRC_CURSOR_RESERVE 2118
+#define WT_STAT_DSRC_CURSOR_RESERVE 2119
/*! cursor: reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 2119
-/*! cursor: restarted searches */
-#define WT_STAT_DSRC_CURSOR_RESTART 2120
+#define WT_STAT_DSRC_CURSOR_RESET 2120
/*! cursor: search calls */
#define WT_STAT_DSRC_CURSOR_SEARCH 2121
/*! cursor: search near calls */
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 6ce26e03c5d..9a5854c8195 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -10,6 +10,7 @@
static int __log_newfile(WT_SESSION_IMPL *, bool, bool *);
static int __log_openfile(WT_SESSION_IMPL *, uint32_t, uint32_t, WT_FH **);
+static int __log_truncate(WT_SESSION_IMPL *, WT_LSN *, bool, bool);
static int __log_write_internal(
WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t);
@@ -207,7 +208,7 @@ __log_fs_write(WT_SESSION_IMPL *session,
WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
}
if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0)
- WT_PANIC_MSG(session, ret,
+ WT_PANIC_RET(session, ret,
"%s: fatal log failure", slot->slot_fh->name);
return (ret);
}
@@ -445,18 +446,18 @@ __wt_log_written_reset(WT_SESSION_IMPL *session)
}
/*
- * __wt_log_get_all_files --
- * Retrieve the list of log files, either all of them or only the active
- * ones (those that are not candidates for archiving). The caller is
- * responsible for freeing the directory list returned.
+ * __wt_log_get_backup_files --
+ * Retrieve the list of log files for taking a backup, either all of them
+ * or only the active ones (those that are not candidates for archiving).
+ * The caller is responsible for freeing the directory list returned.
*/
int
-__wt_log_get_all_files(WT_SESSION_IMPL *session,
+__wt_log_get_backup_files(WT_SESSION_IMPL *session,
char ***filesp, u_int *countp, uint32_t *maxid, bool active_only)
{
WT_DECL_RET;
WT_LOG *log;
- uint32_t id, max;
+ uint32_t id, max, max_file, min_file;
u_int count, i;
char **files;
@@ -468,16 +469,36 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session,
log = S2C(session)->log;
/*
- * These may be files needed by backup. Force the current slot
- * to get written to the file.
+ * Capture the next file utilized for writing to the log, before forcing
+ * a new log file. This represents the latest journal file that needs to
+ * be copied. Note the checkpoint selected for backup may be writing to
+ * an even later log file. In that case, copying the journal files is
+ * correct, but wasteful.
*/
+ max_file = log->alloc_lsn.l.file;
+
+ /*
+ * Capture the journal file the current checkpoint started in. The
+ * current checkpoint or a later one may be selected for backing up,
+ * requiring log files as early as this file. Together with max_file,
+ * this defines the range of journal files to include.
+ */
+ min_file = log->ckpt_lsn.l.file;
+
+ /*
+ * Force the current slot to get written to the file. Also switch to
+ * using a new log file. That log file will be removed from the list of
+ * files returned. New writes will not be included in the backup.
+ */
+ if (active_only)
+ F_SET(log, WT_LOG_FORCE_NEWFILE);
WT_RET(__wt_log_force_write(session, 1, NULL));
WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count));
- /* Filter out any files that are below the checkpoint LSN. */
for (max = 0, i = 0; i < count; ) {
WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
- if (active_only && id < log->ckpt_lsn.l.file) {
+ if (active_only &&
+ (id < min_file || id > max_file)) {
/*
* Any files not being returned are individually freed
* and the array adjusted.
@@ -531,7 +552,7 @@ __wt_log_extract_lognum(
if (id == NULL || name == NULL)
WT_RET_MSG(session, EINVAL,
- "%s: unexpected usage: no id or no name", __func__);
+ "unexpected usage: no id or no name");
if ((p = strrchr(name, '.')) == NULL ||
sscanf(++p, "%" SCNu32, id) != 1)
WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
@@ -671,13 +692,17 @@ __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh)
*/
if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL))
return (__log_zero(session, fh,
- WT_LOG_END_HEADER, conn->log_file_max));
+ log->first_record, conn->log_file_max));
+
+ /* If configured to not extend the file, we're done. */
+ if (conn->log_extend_len == 0)
+ return (0);
/*
* We have exclusive access to the log file and there are no other
* writes happening concurrently, so there are no locking issues.
*/
- ret = __wt_fextend(session, fh, conn->log_file_max);
+ ret = __wt_fextend(session, fh, conn->log_extend_len);
return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}
@@ -716,8 +741,7 @@ __log_decompress(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
compressor = conn->log_compressor;
if (compressor == NULL || compressor->decompress == NULL)
WT_RET_MSG(session, WT_ERROR,
- "%s: Compressed record with no configured compressor",
- __func__);
+ "Compressed record with no configured compressor");
uncompressed_size = logrec->mem_len;
WT_RET(__wt_buf_initsize(session, out, uncompressed_size));
memcpy(out->mem, in->mem, skip);
@@ -734,7 +758,7 @@ __log_decompress(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
*/
if (result_len != uncompressed_size - WT_LOG_COMPRESS_SKIP)
WT_RET_MSG(session, WT_ERROR,
- "%s: decompression failed with incorrect size", __func__);
+ "decompression failed with incorrect size");
return (0);
}
@@ -756,8 +780,7 @@ __log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
(encryptor = kencryptor->encryptor) == NULL ||
encryptor->decrypt == NULL)
WT_RET_MSG(session, WT_ERROR,
- "%s: Encrypted record with no configured decrypt method",
- __func__);
+ "Encrypted record with no configured decrypt method");
return (__wt_decrypt(session, encryptor, WT_LOG_ENCRYPT_SKIP, in, out));
}
@@ -926,7 +949,7 @@ err: __wt_scr_free(session, &buf);
*/
static int
__log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp,
- WT_LSN *lsnp, uint16_t *versionp)
+ WT_LSN *lsnp, uint16_t *versionp, bool *need_salvagep)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_ITEM(buf);
@@ -937,11 +960,15 @@ __log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp,
WT_LOG_RECORD *logrec;
uint32_t allocsize, rectype;
const uint8_t *end, *p;
+ bool need_salvage, salvage_mode;
conn = S2C(session);
+ fh = NULL;
log = conn->log;
+ need_salvage = false;
WT_RET(__wt_scr_alloc(session, 0, &buf));
- WT_ERR(__log_openfile(session, id, 0, &fh));
+ salvage_mode = (need_salvagep != NULL &&
+ F_ISSET(conn, WT_CONN_SALVAGE));
if (log == NULL)
allocsize = WT_LOG_ALIGN;
@@ -953,17 +980,30 @@ __log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp,
memset(buf->mem, 0, allocsize);
/*
+ * Any operation that fails from here on out indicates corruption
+ * that could be salvaged.
+ */
+ need_salvage = true;
+
+ /*
* Read in the log file header and verify it.
*/
+ WT_ERR(__log_openfile(session, id, 0, &fh));
WT_ERR(__wt_read(session, fh, 0, allocsize, buf->mem));
logrec = (WT_LOG_RECORD *)buf->mem;
__wt_log_record_byteswap(logrec);
desc = (WT_LOG_DESC *)logrec->record;
__wt_log_desc_byteswap(desc);
- if (desc->log_magic != WT_LOG_MAGIC)
- WT_PANIC_RET(session, WT_ERROR,
- "log file %s corrupted: Bad magic number %" PRIu32,
- fh->name, desc->log_magic);
+ if (desc->log_magic != WT_LOG_MAGIC) {
+ if (salvage_mode)
+ WT_ERR_MSG(session, WT_ERROR,
+ "log file %s corrupted: Bad magic number %" PRIu32,
+ fh->name, desc->log_magic);
+ else
+ WT_PANIC_RET(session, WT_ERROR,
+ "log file %s corrupted: Bad magic number %" PRIu32,
+ fh->name, desc->log_magic);
+ }
/*
* We cannot read future log file formats.
*/
@@ -1043,13 +1083,74 @@ err: __wt_scr_free(session, &buf);
*/
if (fhp != NULL && ret == 0)
*fhp = fh;
- else
+ else if (ret != 0 && need_salvage && salvage_mode) {
+ /* Let the caller know this file must be salvaged. */
+ ret = 0;
+ WT_TRET(__wt_close(session, &fh));
+ if (fhp != NULL)
+ *fhp = NULL;
+ *need_salvagep = true;
+ } else
WT_TRET(__wt_close(session, &fh));
return (ret);
}
/*
+ * __log_record_verify --
+ * Check that values of the log record header are valid.
+ * No byteswap of the header has been done at this point.
+ */
+static int
+__log_record_verify(WT_SESSION_IMPL *session, WT_FH *log_fh, uint32_t offset,
+ WT_LOG_RECORD *logrecp, bool *corrupt)
+{
+ WT_LOG_RECORD logrec;
+ size_t i;
+
+ *corrupt = false;
+
+ /*
+ * Make our own copy of the header so we can get the bytes in the
+ * proper order.
+ */
+ logrec = *logrecp;
+ __wt_log_record_byteswap(&logrec);
+
+ if (F_ISSET(&logrec, ~(WT_LOG_RECORD_ALL_FLAGS))) {
+ WT_RET(__wt_msg(session,
+ "%s: log record at position %" PRIu32
+ " has flag corruption 0x%" PRIx16, log_fh->name, offset,
+ logrec.flags));
+ *corrupt = true;
+ }
+ for (i = 0; i < sizeof(logrec.unused); i++)
+ if (logrec.unused[i] != 0) {
+ WT_RET(__wt_msg(session,
+ "%s: log record at position %" PRIu32
+ " has unused[%" WT_SIZET_FMT "] corruption 0x%"
+ PRIx8, log_fh->name, offset, i, logrec.unused[i]));
+ *corrupt = true;
+ }
+ if (logrec.mem_len != 0 && !F_ISSET(&logrec,
+ WT_LOG_RECORD_COMPRESSED | WT_LOG_RECORD_ENCRYPTED)) {
+ WT_RET(__wt_msg(session,
+ "%s: log record at position %" PRIu32
+ " has memory len corruption 0x%" PRIx32, log_fh->name,
+ offset, logrec.mem_len));
+ *corrupt = true;
+ }
+ if (logrec.len <= offsetof(WT_LOG_RECORD, record)) {
+ WT_RET(__wt_msg(session,
+ "%s: log record at position %" PRIu32
+ " has record len corruption 0x%" PRIx32, log_fh->name,
+ offset, logrec.len));
+ *corrupt = true;
+ }
+ return (0);
+}
+
+/*
* __log_alloc_prealloc --
* Look for a pre-allocated log file and rename it to use as the next
* real log file. Called locked.
@@ -1131,7 +1232,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
WT_STAT_CONN_INCR(session, log_close_yields);
__wt_log_wrlsn(session, NULL);
if (++yield_cnt > 10000)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
__wt_yield();
}
/*
@@ -1200,7 +1301,8 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
* we must pass in a local file handle. Otherwise there is a wide
* window where another thread could see a NULL log file handle.
*/
- WT_RET(__log_open_verify(session, log->fileid, &log_fh, NULL, NULL));
+ WT_RET(__log_open_verify(session, log->fileid, &log_fh, NULL, NULL,
+ NULL));
/*
* Write the LSN at the end of the last record in the previous log file
* as the first record in this log file.
@@ -1412,25 +1514,31 @@ __log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset)
* it will truncate between the given LSN and the trunc_lsn. That is,
* since we pre-allocate log files, it will free that space and allow the
* log to be traversed. We use the trunc_lsn because logging has already
- * opened the new/next log file before recovery ran. This function assumes
- * we are in recovery or other dedicated time and not during live running.
+ * opened the new/next log file before recovery ran. If salvage_mode is
+ * set, we verify headers of log files visited and recreate them if they
+ * are damaged. This function assumes we are in recovery or other
+ * dedicated time and not during live running.
*/
static int
-__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log)
+__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log,
+ bool salvage_mode)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_FH *log_fh;
WT_LOG *log;
- uint32_t lognum;
+ uint32_t lognum, salvage_first, salvage_last;
u_int i, logcount;
char **logfiles;
+ bool need_salvage, opened;
conn = S2C(session);
log = conn->log;
log_fh = NULL;
logcount = 0;
logfiles = NULL;
+ salvage_first = salvage_last = 0;
+ need_salvage = false;
/*
* Truncate the log file to the given LSN.
@@ -1446,6 +1554,10 @@ __log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log)
WT_ERR(__wt_fsync(session, log_fh, true));
WT_ERR(__wt_close(session, &log_fh));
+ if (salvage_mode)
+ WT_ERR(__wt_msg(session,
+ "salvage: log file %" PRIu32 " truncated", lsn->l.file));
+
/*
* If we just want to truncate the current log, return and skip
* looking for intervening logs.
@@ -1456,7 +1568,32 @@ __log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log)
for (i = 0; i < logcount; i++) {
WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
if (lognum > lsn->l.file && lognum < log->trunc_lsn.l.file) {
- WT_ERR(__log_openfile(session, lognum, 0, &log_fh));
+ opened = false;
+ if (salvage_mode) {
+ /*
+ * When salvaging, we verify that the
+ * header of the log file is valid.
+ * If not, create a new, empty one.
+ */
+ need_salvage = false;
+ WT_ERR(__log_open_verify(session, lognum,
+ &log_fh, NULL, NULL, &need_salvage));
+ if (need_salvage) {
+ WT_ASSERT(session, log_fh == NULL);
+ WT_ERR(__wt_log_remove(session,
+ WT_LOG_FILENAME, lognum));
+ WT_ERR(__wt_log_allocfile(session,
+ lognum, WT_LOG_FILENAME));
+ } else
+ opened = true;
+
+ if (salvage_first == 0)
+ salvage_first = lognum;
+ salvage_last = lognum;
+ }
+ if (!opened)
+ WT_ERR(__log_openfile(session, lognum, 0,
+ &log_fh));
/*
* If there are intervening files pre-allocated,
* truncate them to the end of the log file header.
@@ -1469,6 +1606,17 @@ __log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log)
}
err: WT_TRET(__wt_close(session, &log_fh));
WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ if (salvage_first != 0) {
+ if (salvage_last > salvage_first)
+ WT_TRET(__wt_msg(session,
+ "salvage: log files %" PRIu32 "-%" PRIu32
+ " truncated at beginning", salvage_first,
+ salvage_last));
+ else
+ WT_TRET(__wt_msg(session,
+ "salvage: log file %" PRIu32
+ " truncated at beginning", salvage_first));
+ }
return (ret);
}
@@ -1566,13 +1714,12 @@ __wt_log_open(WT_SESSION_IMPL *session)
uint16_t version;
u_int i, logcount;
char **logfiles;
+ bool need_salvage;
conn = S2C(session);
log = conn->log;
logfiles = NULL;
logcount = 0;
- lastlog = 0;
- firstlog = UINT32_MAX;
/*
* Open up a file handle to the log directory if we haven't.
@@ -1587,9 +1734,14 @@ __wt_log_open(WT_SESSION_IMPL *session)
if (!F_ISSET(conn, WT_CONN_READONLY))
WT_ERR(__log_prealloc_remove(session));
+again:
/*
* Now look at the log files and set our LSNs.
*/
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+ need_salvage = false;
+
WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
for (i = 0; i < logcount; i++) {
WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
@@ -1610,8 +1762,23 @@ __wt_log_open(WT_SESSION_IMPL *session)
* we create a new log file so that we can detect an unsupported
* version before modifying the file space.
*/
- WT_ERR(__log_open_verify(session,
- lastlog, NULL, NULL, &version));
+ WT_ERR(__log_open_verify(session, lastlog, NULL, NULL,
+ &version, &need_salvage));
+
+ /*
+ * If we were asked to salvage and the last log file was
+ * indeed corrupt, remove it and try all over again.
+ */
+ if (need_salvage) {
+ WT_ERR(__wt_log_remove(
+ session, WT_LOG_FILENAME, lastlog));
+ WT_ERR(__wt_msg(session,
+ "salvage: log file %" PRIu32 " removed", lastlog));
+ WT_ERR(__wt_fs_directory_list_free(session, &logfiles,
+ logcount));
+ logfiles = NULL;
+ goto again;
+ }
}
/*
@@ -1641,7 +1808,7 @@ __wt_log_open(WT_SESSION_IMPL *session)
* have to close the file.
*/
WT_ERR(__log_open_verify(session,
- lognum, NULL, NULL, &version));
+ lognum, NULL, NULL, &version, NULL));
/*
* If we find any log file at the wrong version
* set the flag and we're done.
@@ -1711,17 +1878,20 @@ __wt_log_close(WT_SESSION_IMPL *session)
* file is zeroes.
*/
static int
-__log_has_hole(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t log_size, wt_off_t offset, bool *hole)
+__log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size,
+ wt_off_t offset, wt_off_t *error_offset, bool *hole)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_LOG *log;
+ WT_LOG_RECORD *logrec;
wt_off_t off, remainder;
- size_t bufsz, rdlen;
- char *buf, *zerobuf;
+ size_t allocsize, buf_left, bufsz, rdlen;
+ char *buf, *p, *zerobuf;
+ bool corrupt;
- *hole = false;
+ *error_offset = 0;
+ corrupt = *hole = false;
conn = S2C(session);
log = conn->log;
@@ -1752,7 +1922,40 @@ __log_has_hole(WT_SESSION_IMPL *session,
remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) {
rdlen = WT_MIN(bufsz, (size_t)remainder);
WT_ERR(__wt_read(session, fh, off, rdlen, buf));
+ allocsize = (log == NULL ? WT_LOG_ALIGN : log->allocsize);
if (memcmp(buf, zerobuf, rdlen) != 0) {
+ /*
+ * Find where the next log record starts after the
+ * hole.
+ */
+ for (p = buf, buf_left = rdlen; buf_left > 0;
+ buf_left -= rdlen, p += rdlen) {
+ rdlen = WT_MIN(allocsize, buf_left);
+ if (memcmp(p, zerobuf, rdlen) != 0)
+ break;
+ }
+ /*
+ * A presumed log record begins here where the buffer
+ * becomes non-zero. If we have enough of a log record
+ * present in the buffer, we either have a valid header
+ * or corruption. Verify the header of this record to
+ * determine whether it is just a hole or corruption.
+ *
+ * We don't bother making this check for backup copies,
+ * as records may have their beginning zeroed, hence
+ * the part after a hole may in fact be the middle of
+ * the record.
+ */
+ if (!F_ISSET(conn, WT_CONN_WAS_BACKUP)) {
+ logrec = (WT_LOG_RECORD *)p;
+ if (buf_left >= sizeof(WT_LOG_RECORD)) {
+ off += p - buf;
+ WT_ERR(__log_record_verify(session, fh,
+ (uint32_t)off, logrec, &corrupt));
+ if (corrupt)
+ *error_offset = off;
+ }
+ }
*hole = true;
break;
}
@@ -1764,6 +1967,36 @@ err: __wt_free(session, buf);
}
/*
+ * __log_check_partial_write --
+ * Determine if the log record may be a partial write. If that's
+ * possible, return true, otherwise false.
+ *
+ * Since the log file is initially zeroed up to a predetermined size,
+ * any record that falls within that boundary that ends in one or
+ * more zeroes may be partial (or the initial record may have been
+ * padded with zeroes before writing). The only way we have any certainty
+ * is if the last byte is non-zero, when that happens, we know that
+ * the write cannot be partial.
+ */
+static bool
+__log_check_partial_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
+ uint32_t reclen)
+{
+ uint8_t *rec;
+
+ WT_UNUSED(session);
+
+ /*
+ * We only check the final byte since that's the only way have any
+ * certainty. Even if the second to last byte is non-zero and the
+ * last byte is zero, that could still technically be the result of
+ * a partial write, however unlikely it may be.
+ */
+ rec = buf->mem;
+ return (reclen > 0 && rec[reclen - 1] == 0);
+}
+
+/*
* __wt_log_release --
* Release a log slot.
*/
@@ -1932,6 +2165,21 @@ err: if (locked)
}
/*
+ * __log_salvage_message --
+ * Show messages consistently for a salvageable error.
+ */
+static int
+__log_salvage_message(WT_SESSION_IMPL *session, const char *log_name,
+ const char *extra_msg, wt_off_t offset)
+{
+ WT_RET(__wt_msg(session,
+ "log file %s corrupted%s at position %" PRIuMAX
+ ", truncated", log_name, extra_msg, (uintmax_t)offset));
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
+ return (WT_ERROR);
+}
+
+/*
* __wt_log_scan --
* Scan the logs, calling a function on each record found.
*/
@@ -1951,21 +2199,22 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
WT_LOG *log;
WT_LOG_RECORD *logrec;
WT_LSN end_lsn, next_lsn, prev_eof, prev_lsn, rd_lsn, start_lsn;
- wt_off_t log_size;
+ wt_off_t bad_offset, log_size;
uint32_t allocsize, firstlog, lastlog, lognum, rdup_len, reclen;
uint16_t version;
u_int i, logcount;
int firstrecord;
char **logfiles;
- bool eol, partial_record;
+ bool corrupt, eol, need_salvage, partial_record;
conn = S2C(session);
log = conn->log;
log_fh = NULL;
logcount = 0;
logfiles = NULL;
- eol = false;
+ corrupt = eol = false;
firstrecord = 1;
+ need_salvage = false;
/*
* If the caller did not give us a callback function there is nothing
@@ -1992,7 +2241,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
start_lsn = log->ckpt_lsn;
else if (!LF_ISSET(WT_LOGSCAN_FIRST))
WT_RET_MSG(session, WT_ERROR,
- "%s: WT_LOGSCAN_FIRST not set", __func__);
+ "WT_LOGSCAN_FIRST not set");
}
lastlog = log->fileid;
} else {
@@ -2064,8 +2313,10 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
if (!WT_IS_INIT_LSN(lsnp))
start_lsn = *lsnp;
}
- WT_ERR(__log_open_verify(session,
- start_lsn.l.file, &log_fh, &prev_lsn, NULL));
+ WT_ERR(__log_open_verify(session, start_lsn.l.file, &log_fh, &prev_lsn,
+ NULL, &need_salvage));
+ if (need_salvage)
+ WT_ERR_MSG(session, WT_ERROR, "log file requires salvage");
WT_ERR(__wt_filesize(session, log_fh, &log_size));
rd_lsn = start_lsn;
if (LF_ISSET(WT_LOGSCAN_RECOVER))
@@ -2081,14 +2332,21 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
advance:
if (rd_lsn.l.offset == log_size)
partial_record = false;
- else
+ else {
/*
* See if there is anything non-zero at the
* end of this log file.
*/
WT_ERR(__log_has_hole(
session, log_fh, log_size,
- rd_lsn.l.offset, &partial_record));
+ rd_lsn.l.offset, &bad_offset,
+ &partial_record));
+ if (bad_offset != 0) {
+ need_salvage = true;
+ WT_ERR(__log_salvage_message(session,
+ log_fh->name, "", bad_offset));
+ }
+ }
/*
* If we read the last record, go to the next file.
*/
@@ -2103,7 +2361,8 @@ advance:
__wt_verbose(session, WT_VERB_LOG,
"Truncate end of log %" PRIu32 "/%" PRIu32,
rd_lsn.l.file, rd_lsn.l.offset);
- WT_ERR(__log_truncate(session, &rd_lsn, true));
+ WT_ERR(__log_truncate(session, &rd_lsn, true,
+ false));
}
/*
* If we had a partial record, we'll want to break
@@ -2128,7 +2387,11 @@ advance:
" through %" PRIu32,
rd_lsn.l.file, end_lsn.l.file);
WT_ERR(__log_open_verify(session,
- rd_lsn.l.file, &log_fh, &prev_lsn, &version));
+ rd_lsn.l.file, &log_fh, &prev_lsn, &version,
+ &need_salvage));
+ if (need_salvage)
+ WT_ERR_MSG(session, WT_ERROR,
+ "log file requires salvage");
/*
* Opening the log file reads with verify sets up the
* previous LSN from the first record. This detects
@@ -2161,10 +2424,14 @@ advance:
}
/*
* Read the minimum allocation size a record could be.
+ * Conditionally set the need_salvage flag so that if the
+ * read fails, we know this is an situation we can salvage.
*/
WT_ASSERT(session, buf->memsize >= allocsize);
+ need_salvage = F_ISSET(conn, WT_CONN_SALVAGE);
WT_ERR(__wt_read(session,
log_fh, rd_lsn.l.offset, (size_t)allocsize, buf->mem));
+ need_salvage = false;
/*
* See if we need to read more than the allocation size. We
* expect that we rarely will have to read more. Most log
@@ -2187,7 +2454,13 @@ advance:
*/
if (reclen == 0) {
WT_ERR(__log_has_hole(
- session, log_fh, log_size, rd_lsn.l.offset, &eol));
+ session, log_fh, log_size, rd_lsn.l.offset,
+ &bad_offset, &eol));
+ if (bad_offset != 0) {
+ need_salvage = true;
+ WT_ERR(__log_salvage_message(session,
+ log_fh->name, "", bad_offset));
+ }
if (eol)
/* Found a hole. This LSN is the end. */
break;
@@ -2215,14 +2488,9 @@ advance:
WT_STAT_CONN_INCR(session, log_scan_rereads);
}
/*
- * We read in the record, verify checksum.
- *
- * Handle little- and big-endian objects. Objects are written
- * in little-endian format: save the header checksum, and
- * calculate the checksum for the header in its little-endian
- * form. Then, restore the header's checksum, and byte-swap
- * the whole thing as necessary, leaving us with a calculated
- * checksum that should match the checksum in the header.
+ * We read in the record, now verify the checksum. A failed
+ * checksum does not imply corruption, it may be the result
+ * of a partial write.
*/
buf->size = reclen;
logrec = (WT_LOG_RECORD *)buf->mem;
@@ -2242,6 +2510,50 @@ advance:
*/
if (LF_ISSET(WT_LOGSCAN_ONE))
ret = WT_NOTFOUND;
+
+ /*
+ * When we have a checksum mismatch, we would like
+ * to determine whether it may be the result of:
+ * 1) some expected corruption that can occur during
+ * backups
+ * 2) a partial write that can naturally occur when
+ * an application crashes
+ * 3) some other corruption
+ * so that we can (in case 3) flag cases of file system
+ * or hardware failures. Unfortunately, we have found
+ * on some systems that file system writes may in fact
+ * be lost, and this can readily be triggered with
+ * normal operations. Rather than force users to
+ * salvage in these situations, we merely truncate the
+ * log at this point and issue a message.
+ */
+ if (F_ISSET(conn, WT_CONN_WAS_BACKUP))
+ break;
+
+ if (!__log_check_partial_write(session, buf, reclen)) {
+ /*
+ * It's not a partial write, and we have a bad
+ * checksum. We treat it as a corruption that
+ * must be salvaged.
+ */
+ need_salvage = true;
+ WT_ERR(__log_salvage_message(session,
+ log_fh->name, ", bad checksum",
+ rd_lsn.l.offset));
+ } else {
+ /*
+ * It may be a partial write, or it's possible
+ * that the header is corrupt. Make a sanity
+ * check of the log record header.
+ */
+ WT_ERR(__log_record_verify(session, log_fh,
+ rd_lsn.l.offset, logrec, &corrupt));
+ if (corrupt) {
+ need_salvage = true;
+ WT_ERR(__log_salvage_message(session,
+ log_fh->name, "", rd_lsn.l.offset));
+ }
+ }
break;
}
__wt_log_record_byteswap(logrec);
@@ -2291,11 +2603,22 @@ advance:
__wt_verbose(session, WT_VERB_LOG,
"End of recovery truncate end of log %" PRIu32 "/%" PRIu32,
rd_lsn.l.file, rd_lsn.l.offset);
- WT_ERR(__log_truncate(session, &rd_lsn, false));
+ WT_ERR(__log_truncate(session, &rd_lsn, false, false));
}
err: WT_STAT_CONN_INCR(session, log_scans);
/*
+ * If we are salvaging and failed a salvageable operation, then
+ * truncate the log at the fail point.
+ */
+ if (ret != 0 && ret != WT_PANIC && need_salvage) {
+ WT_TRET(__wt_close(session, &log_fh));
+ log_fh = NULL;
+ WT_TRET(__log_truncate(session, &rd_lsn, false, true));
+ ret = 0;
+ }
+
+ /*
* If the first attempt to read a log record results in
* an error recovery is likely going to fail. Try to provide
* a helpful failure message.
@@ -2491,7 +2814,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_LSN lsn;
WT_MYSLOT myslot;
int64_t release_size;
- uint32_t force, rdup_len;
+ uint32_t fill_size, force, rdup_len;
bool free_slot;
conn = S2C(session);
@@ -2520,10 +2843,39 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
/*
* If the caller's record only partially fills the necessary
* space, we need to zero-fill the remainder.
+ *
+ * The cast is safe, we've already checked to make sure it's in range.
*/
- if (record->size != rdup_len) {
- memset((uint8_t *)record->mem + record->size, 0,
- rdup_len - record->size);
+ fill_size = rdup_len - (uint32_t)record->size;
+ if (fill_size != 0) {
+ memset((uint8_t *)record->mem + record->size, 0, fill_size);
+ /*
+ * Set the last byte of the log record to a non-zero value,
+ * that allows us, on the input side, to tell that a log
+ * record was completely written; there couldn't have been
+ * a partial write. That means that any checksum mismatch
+ * in those conditions is a log corruption.
+ *
+ * Without this changed byte, when we see a zeroed last byte,
+ * we must always treat a checksum error as a possible partial
+ * write. Since partial writes can happen as a result of an
+ * interrupted process (for example, a shutdown), we must
+ * treat a checksum error as a normal occurrence, and merely
+ * the place where the log must be truncated. So any real
+ * corruption within log records is hard to detect as such.
+ *
+ * However, we can only make this modification if there is
+ * more than one byte being filled, as the first zero byte
+ * past the actual record is needed to terminate the loop
+ * in txn_commit_apply.
+ *
+ * This is not a log format change, as we only are changing a
+ * byte in the padding portion of a record, and no logging code
+ * has ever checked that it is any particular value up to now.
+ */
+ if (fill_size > 1)
+ *((uint8_t *)record->mem + rdup_len - 1) =
+ WT_DEBUG_BYTE;
record->size = rdup_len;
}
/*
diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c
index 703a87b09d4..3a6aabf32c0 100644
--- a/src/third_party/wiredtiger/src/log/log_auto.c
+++ b/src/third_party/wiredtiger/src/log/log_auto.c
@@ -852,7 +852,7 @@ __wt_txn_op_printlog(WT_SESSION_IMPL *session,
WT_RET(__wt_logop_prev_lsn_print(session, pp, end, flags));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, optype);
}
return (0);
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
index 8deda5e242f..c75181d0687 100644
--- a/src/third_party/wiredtiger/src/log/log_slot.c
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -119,7 +119,7 @@ retry:
* decide if retrying is necessary or not.
*/
if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* If someone else is switching out this slot we lost. Nothing to
* do but return. Return WT_NOTFOUND anytime the given slot was
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index 4d9f6f92832..5dd3122d41c 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -265,6 +265,12 @@ open: WT_WITH_SCHEMA_LOCK(session,
}
if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ /*
+ * Opening this LSM cursor has opened a number of btree
+ * cursors, ensure other code doesn't think this is the first
+ * cursor in a session.
+ */
+ ++session->ncursors;
WT_RET(__cursor_enter(session));
F_SET(clsm, WT_CLSM_ACTIVE);
}
@@ -284,6 +290,7 @@ __clsm_leave(WT_CURSOR_LSM *clsm)
session = (WT_SESSION_IMPL *)clsm->iface.session;
if (F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ --session->ncursors;
__cursor_leave(session);
F_CLR(clsm, WT_CLSM_ACTIVE);
}
@@ -365,12 +372,17 @@ __clsm_deleted_decode(WT_CURSOR_LSM *clsm, WT_ITEM *value)
* Close any btree cursors that are not needed.
*/
static int
-__clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end)
+__clsm_close_cursors(
+ WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, u_int start, u_int end)
{
WT_BLOOM *bloom;
WT_CURSOR *c;
u_int i;
+ __wt_verbose(session, WT_VERB_LSM,
+ "LSM closing cursor session(%p):clsm(%p), start: %u, end: %u",
+ (void *)session, (void *)clsm, start, end);
+
if (clsm->chunks == NULL || clsm->nchunks == 0)
return (0);
@@ -609,7 +621,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) {
saved_gen = lsm_tree->dsk_gen;
locked = false;
__wt_lsm_tree_readunlock(session, lsm_tree);
- WT_ERR(__clsm_close_cursors(
+ WT_ERR(__clsm_close_cursors(session,
clsm, close_range_start, close_range_end));
__wt_lsm_tree_readlock(session, lsm_tree);
locked = true;
@@ -626,6 +638,10 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) {
clsm->nchunks = nchunks;
/* Open the cursors for chunks that have changed. */
+ __wt_verbose(session, WT_VERB_LSM,
+ "LSM opening cursor session(%p):clsm(%p)%s, chunks: %u, good: %u",
+ (void *)session, (void *)clsm,
+ update ? ", update" : "", nchunks, ngood);
for (i = ngood; i != nchunks; i++) {
chunk = lsm_tree->chunk[i + start_chunk];
/* Copy the maximum transaction ID. */
@@ -1736,7 +1752,9 @@ __wt_clsm_close(WT_CURSOR *cursor)
*/
clsm = (WT_CURSOR_LSM *)cursor;
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
- WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks));
+err:
+
+ WT_TRET(__clsm_close_cursors(session, clsm, 0, clsm->nchunks));
__clsm_free_chunks(session, clsm);
/* In case we were somehow left positioned, clear that. */
@@ -1744,9 +1762,9 @@ __wt_clsm_close(WT_CURSOR *cursor)
if (clsm->lsm_tree != NULL)
__wt_lsm_tree_release(session, clsm->lsm_tree);
- WT_TRET(__wt_cursor_close(cursor));
+ __wt_cursor_close(cursor);
-err: API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
@@ -1821,10 +1839,9 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
WT_ASSERT(session, !bulk || lsm_tree->excl_session != NULL);
WT_ERR(__wt_calloc_one(session, &clsm));
-
- cursor = &clsm->iface;
+ cursor = (WT_CURSOR *)clsm;
*cursor = iface;
- cursor->session = &session->iface;
+ cursor->session = (WT_SESSION *)session;
WT_ERR(__wt_strdup(session, lsm_tree->name, &cursor->uri));
cursor->key_format = lsm_tree->key_format;
cursor->value_format = lsm_tree->value_format;
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
index 40ff5fc0b26..7a354403ad2 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -72,11 +72,7 @@ __lsm_general_worker_start(WT_SESSION_IMPL *session)
worker_args->type =
WT_LSM_WORK_DROP | WT_LSM_WORK_SWITCH;
else {
- worker_args->type =
- WT_LSM_WORK_BLOOM |
- WT_LSM_WORK_DROP |
- WT_LSM_WORK_FLUSH |
- WT_LSM_WORK_SWITCH;
+ worker_args->type = WT_LSM_WORK_GENERAL_OPS;
/*
* Only allow half of the threads to run merges to
* avoid all all workers getting stuck in long-running
@@ -315,7 +311,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
ret = __wt_lsm_tree_close_all(session);
WT_TRET(__wt_thread_join(
- session, manager->lsm_worker_cookies[0].tid));
+ session, &manager->lsm_worker_cookies[0].tid));
/* Release memory from any operations left on the queue. */
while ((current = TAILQ_FIRST(&manager->switchqh)) != NULL) {
@@ -422,9 +418,10 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
fillms = 10000;
/*
* If the tree appears to not be triggering enough
- * LSM maintenance, help it out. Additional work units
- * don't hurt, and can be necessary if some work
- * units aren't completed for some reason.
+ * LSM maintenance, help it out. Some types of
+ * additional work units don't hurt, and can be
+ * necessary if some work units aren't completed for
+ * some reason.
* If the tree hasn't been modified, and there are
* more than 1 chunks - try to get the tree smaller
* so queries run faster.
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index 16b28a1aecc..3dd7222630f 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -407,6 +407,7 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
{
WT_LSM_TREE *lsm_tree;
+ *treep = NULL;
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
/* See if the tree is already open. */
@@ -419,7 +420,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
*/
if (!__wt_atomic_cas_ptr(
&lsm_tree->excl_session, NULL, session))
- return (EBUSY);
+ return (__wt_set_return(
+ session, EBUSY));
/*
* Drain the work queue before checking for
@@ -431,7 +433,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
if (lsm_tree->refcnt != 1) {
__wt_lsm_tree_release(
session, lsm_tree);
- return (EBUSY);
+ return (__wt_set_return(
+ session, EBUSY));
}
} else {
(void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
@@ -445,7 +448,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
lsm_tree->refcnt > 0);
__wt_lsm_tree_release(
session, lsm_tree);
- return (EBUSY);
+ return (__wt_set_return(
+ session, EBUSY));
}
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index 6f18f4fb152..a283670eba6 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -313,6 +313,37 @@ __wt_lsm_chunk_visible_all(
}
/*
+ * __lsm_set_chunk_evictable --
+ * Enable eviction in an LSM chunk.
+ */
+static int
+__lsm_set_chunk_evictable(
+ WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk, bool need_handle)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+
+ if (chunk->evict_enabled != 0)
+ return (0);
+
+ /* See if we win the race to enable eviction. */
+ if (__wt_atomic_cas32(&chunk->evict_enabled, 0, 1)) {
+ if (need_handle)
+ WT_RET(__wt_session_get_dhandle(
+ session, chunk->uri, NULL, NULL, 0));
+ btree = session->dhandle->handle;
+ if (btree->evict_disabled_open) {
+ btree->evict_disabled_open = false;
+ __wt_evict_file_exclusive_off(session);
+ }
+
+ if (need_handle)
+ WT_TRET(__wt_session_release_dhandle(session));
+ }
+ return (ret);
+}
+
+/*
* __lsm_checkpoint_chunk --
* Checkpoint an LSM chunk, separated out to make locking easier.
*/
@@ -340,7 +371,6 @@ int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
- WT_BTREE *btree;
WT_DECL_RET;
WT_TXN_ISOLATION saved_isolation;
bool flush_set, release_dhandle;
@@ -375,6 +405,14 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
WT_RET(__wt_txn_update_oldest(
session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
if (!__wt_lsm_chunk_visible_all(session, chunk)) {
+ /*
+ * If there is cache pressure consider making a chunk evictable
+ * to avoid the cache getting stuck when history is required.
+ */
+ if (__wt_eviction_needed(session, false, false, NULL))
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_ENABLE_EVICT, 0, lsm_tree));
+
__wt_verbose(session, WT_VERB_LSM,
"LSM worker %s: running transaction, return",
chunk->uri);
@@ -446,11 +484,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
* Enable eviction on the live chunk so it doesn't block the cache.
* Future reads should direct to the on-disk chunk anyway.
*/
- btree = session->dhandle->handle;
- if (btree->evict_disabled_open) {
- btree->evict_disabled_open = false;
- __wt_evict_file_exclusive_off(session);
- }
+ WT_ERR(__lsm_set_chunk_evictable(session, chunk, false));
release_dhandle = false;
WT_ERR(__wt_session_release_dhandle(session));
@@ -481,6 +515,54 @@ err: if (flush_set)
}
/*
+ * __wt_lsm_work_enable_evict --
+ * LSM usually pins live chunks in memory - preferring to force them
+ * out via a checkpoint when they are no longer required. For applications
+ * that keep data pinned for a long time this can lead to the cache
+ * being pinned full. This work unit detects that case, and enables
+ * regular eviction in chunks that can be correctly evicted.
+ */
+int
+__wt_lsm_work_enable_evict(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_WORKER_COOKIE cookie;
+ u_int i;
+
+ WT_CLEAR(cookie);
+
+ /* Only do this if there is cache pressure */
+ if (!__wt_eviction_needed(session, false, false, NULL))
+ return (0);
+
+ WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, false));
+
+ /*
+ * Turn on eviction in chunks that have had some chance to
+ * checkpoint if there is cache pressure.
+ */
+ for (i = 0; cookie.nchunks > 2 && i < cookie.nchunks - 2; i++) {
+ chunk = cookie.chunk_array[i];
+
+ /*
+ * Skip if the chunk isn't on disk yet, or if it's still in
+ * cache for a reason other than transaction visibility.
+ */
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
+ chunk->evict_enabled != 0 ||
+ __wt_lsm_chunk_visible_all(session, chunk))
+ continue;
+
+ WT_ERR(__lsm_set_chunk_evictable(session, chunk, true));
+ }
+
+err: __lsm_unpin_chunks(session, &cookie);
+ __wt_free(session, cookie.chunk_array);
+ return (ret);
+}
+
+/*
* __lsm_bloom_create --
* Create a bloom filter for a chunk of the LSM tree that has been
* checkpointed but not yet been merged.
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_worker.c b/src/third_party/wiredtiger/src/lsm/lsm_worker.c
index 82f72bdf355..3579207e0c7 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_worker.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_worker.c
@@ -37,12 +37,14 @@ __wt_lsm_worker_stop(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args)
{
args->running = false;
args->tid_set = false;
- return (__wt_thread_join(session, args->tid));
+ return (__wt_thread_join(session, &args->tid));
}
/*
* __lsm_worker_general_op --
- * Execute a single bloom, drop or flush work unit.
+ * Execute a single medium importance maintenance operation that should
+ * not be super long running. That includes bloom creation, drop or flush
+ * work unit types.
*/
static int
__lsm_worker_general_op(
@@ -55,11 +57,7 @@ __lsm_worker_general_op(
*completed = false;
- /*
- * Return if this thread cannot process a bloom, drop or flush.
- */
- if (!FLD_ISSET(cookie->type,
- WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_FLUSH))
+ if (!FLD_ISSET(cookie->type, WT_LSM_WORK_GENERAL_OPS))
return (WT_NOTFOUND);
if ((ret = __wt_lsm_manager_pop_entry(session,
@@ -88,6 +86,8 @@ __lsm_worker_general_op(
WT_ERR(__wt_lsm_free_chunks(session, entry->lsm_tree));
else if (entry->type == WT_LSM_WORK_BLOOM)
WT_ERR(__wt_lsm_work_bloom(session, entry->lsm_tree));
+ else if (entry->type == WT_LSM_WORK_ENABLE_EVICT)
+ WT_ERR(__wt_lsm_work_enable_evict(session, entry->lsm_tree));
*completed = true;
err: __wt_lsm_manager_free_work_unit(session, entry);
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index f32a1cbeb19..13e84efc199 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -291,7 +291,7 @@ __wt_meta_ckptlist_get(
WT_ERR(__wt_realloc_def(session, &allocated, slot + 2, &ckptbase));
/* Sort in creation-order. */
- qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order);
+ __wt_qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order);
/* Return the array to our caller. */
*ckptbasep = ckptbase;
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
index 48ff3e9ab32..aca483264aa 100644
--- a/src/third_party/wiredtiger/src/meta/meta_table.c
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -331,3 +331,27 @@ err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
__wt_free(session, *valuep);
return (ret);
}
+
+/*
+ * __wt_metadata_salvage --
+ * Salvage the metadata file. This is a destructive operation.
+ * Save a copy of the original metadata.
+ */
+int
+__wt_metadata_salvage(WT_SESSION_IMPL *session)
+{
+ WT_SESSION *wt_session;
+
+ wt_session = &session->iface;
+ /*
+ * Copy the original metadata.
+ */
+ WT_RET(__wt_copy_and_sync(wt_session, WT_METAFILE, WT_METAFILE_SLVG));
+
+ /*
+ * Now salvage the metadata. We know we're in wiredtiger_open and
+ * single threaded.
+ */
+ WT_RET(wt_session->salvage(wt_session, WT_METAFILE_URI, NULL));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
index a8289b91ffa..188f1265363 100644
--- a/src/third_party/wiredtiger/src/meta/meta_track.c
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -8,6 +8,7 @@
#include "wt_internal.h"
+#undef WT_ENABLE_SCHEMA_TXN
/*
* WT_META_TRACK -- A tracked metadata operation: a non-transactional log,
* maintained to make it easy to unroll simple metadata and filesystem
@@ -118,6 +119,7 @@ __wt_meta_track_on(WT_SESSION_IMPL *session)
if (!F_ISSET(&session->txn, WT_TXN_RUNNING)) {
#ifdef WT_ENABLE_SCHEMA_TXN
WT_RET(__wt_txn_begin(session, NULL));
+ __wt_errx(session, "TRACK: Using internal schema txn");
#endif
F_SET(session, WT_SESSION_SCHEMA_TXN);
}
@@ -279,6 +281,7 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
F_CLR(session, WT_SESSION_SCHEMA_TXN);
#ifdef WT_ENABLE_SCHEMA_TXN
WT_ERR(__wt_txn_commit(session, NULL));
+ __wt_errx(session, "TRACK: Commit internal schema txn");
#endif
}
@@ -304,12 +307,11 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
* should be included in the checkpoint.
*/
ckpt_session->txn.id = session->txn.id;
- F_SET(ckpt_session, WT_SESSION_LOCKED_METADATA);
- WT_WITH_METADATA_LOCK(session,
- WT_WITH_DHANDLE(ckpt_session,
- WT_SESSION_META_DHANDLE(session),
- ret = __wt_checkpoint(ckpt_session, NULL)));
- F_CLR(ckpt_session, WT_SESSION_LOCKED_METADATA);
+ WT_ASSERT(session,
+ !F_ISSET(session, WT_SESSION_LOCKED_METADATA));
+ WT_WITH_DHANDLE(ckpt_session, WT_SESSION_META_DHANDLE(session),
+ WT_WITH_METADATA_LOCK(ckpt_session,
+ ret = __wt_checkpoint(ckpt_session, NULL)));
ckpt_session->txn.id = WT_TXN_NONE;
if (ret == 0)
WT_WITH_DHANDLE(session,
@@ -339,6 +341,7 @@ err: /*
WT_ASSERT(session, unroll || saved_ret != 0 ||
session->txn.mod_count == 0);
#ifdef WT_ENABLE_SCHEMA_TXN
+ __wt_errx(session, "TRACK: Abort internal schema txn");
WT_TRET(__wt_txn_rollback(session, NULL));
#endif
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
index 2c83167c28f..af5c753b70a 100644
--- a/src/third_party/wiredtiger/src/meta/meta_turtle.c
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -82,7 +82,8 @@ __metadata_load_hot_backup(WT_SESSION_IMPL *session)
break;
WT_ERR(__wt_getline(session, fs, value));
if (value->size == 0)
- WT_ERR(__wt_illegal_value(session, WT_METADATA_BACKUP));
+ WT_PANIC_ERR(session, EINVAL,
+ "%s: zero-length value", WT_METADATA_BACKUP);
WT_ERR(__wt_metadata_update(session, key->data, value->data));
}
@@ -329,8 +330,10 @@ err: WT_TRET(__wt_fclose(session, &fs));
* something has gone horribly wrong, except for the compatibility
* setting which is optional.
*/
- return (ret == 0 || strcmp(key, WT_METADATA_COMPAT) == 0 ? ret :
- __wt_illegal_value(session, WT_METADATA_TURTLE));
+ if (ret == 0 || strcmp(key, WT_METADATA_COMPAT) == 0)
+ return (ret);
+ WT_PANIC_RET(session, ret,
+ "%s: fatal turtle file read error", WT_METADATA_TURTLE);
}
/*
@@ -388,5 +391,8 @@ err: WT_TRET(__wt_fclose(session, &fs));
* An error updating the turtle file means something has gone horribly
* wrong -- we're done.
*/
- return (ret == 0 ? 0 : __wt_illegal_value(session, WT_METADATA_TURTLE));
+ if (ret == 0)
+ return (ret);
+ WT_PANIC_RET(session, ret,
+ "%s: fatal turtle file update error", WT_METADATA_TURTLE);
}
diff --git a/src/third_party/wiredtiger/src/optrack/optrack.c b/src/third_party/wiredtiger/src/optrack/optrack.c
index ccec13d433b..3a2ac879122 100644
--- a/src/third_party/wiredtiger/src/optrack/optrack.c
+++ b/src/third_party/wiredtiger/src/optrack/optrack.c
@@ -21,12 +21,15 @@ __wt_optrack_record_funcid(
WT_DECL_ITEM(tmp);
WT_DECL_RET;
wt_off_t fsize;
+ bool locked;
conn = S2C(session);
+ locked = false;
WT_ERR(__wt_scr_alloc(session, strlen(func) + 32, &tmp));
__wt_spin_lock(session, &conn->optrack_map_spinlock);
+ locked = true;
if (*func_idp == 0) {
*func_idp = ++optrack_uid;
@@ -38,10 +41,12 @@ __wt_optrack_record_funcid(
}
if (0) {
-err: WT_PANIC_MSG(session, ret, "%s", __func__);
+err: WT_PANIC_MSG(session, ret,
+ "operation tracking initialization failure");
}
- __wt_spin_unlock(session, &conn->optrack_map_spinlock);
+ if (locked)
+ __wt_spin_unlock(session, &conn->optrack_map_spinlock);
__wt_scr_free(session, &tmp);
}
@@ -61,8 +66,7 @@ __optrack_open_file(WT_SESSION_IMPL *session)
conn = S2C(session);
if (!F_ISSET(conn, WT_CONN_OPTRACK))
- WT_RET_MSG(session, WT_ERROR,
- "%s: WT_CONN_OPTRACK not set", __func__);
+ WT_RET_MSG(session, WT_ERROR, "WT_CONN_OPTRACK not set");
WT_RET(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_filename_construct(session, conn->optrack_path,
diff --git a/src/third_party/wiredtiger/src/os_common/filename.c b/src/third_party/wiredtiger/src/os_common/filename.c
index 6eebe545242..4ace3598c72 100644
--- a/src/third_party/wiredtiger/src/os_common/filename.c
+++ b/src/third_party/wiredtiger/src/os_common/filename.c
@@ -91,7 +91,7 @@ __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name, bool durable)
/*
* __wt_copy_and_sync --
- * Copy a file safely; here to support the wt utility.
+ * Copy a file safely.
*/
int
__wt_copy_and_sync(WT_SESSION *wt_session, const char *from, const char *to)
diff --git a/src/third_party/wiredtiger/src/os_common/os_abort.c b/src/third_party/wiredtiger/src/os_common/os_abort.c
index 85dcc741855..54cae3e61aa 100644
--- a/src/third_party/wiredtiger/src/os_common/os_abort.c
+++ b/src/third_party/wiredtiger/src/os_common/os_abort.c
@@ -15,6 +15,7 @@
void
__wt_abort(WT_SESSION_IMPL *session)
WT_GCC_FUNC_ATTRIBUTE((noreturn))
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
#ifdef HAVE_ATTACH
u_int i;
diff --git a/src/third_party/wiredtiger/src/os_common/os_errno.c b/src/third_party/wiredtiger/src/os_common/os_errno.c
index 0f57658ab23..8dccab0373e 100644
--- a/src/third_party/wiredtiger/src/os_common/os_errno.c
+++ b/src/third_party/wiredtiger/src/os_common/os_errno.c
@@ -38,16 +38,23 @@ __wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen)
return (p);
/*
- * When called from wiredtiger_strerror, write a passed-in buffer.
- * When called from WT_SESSION.strerror, write the session's buffer.
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ *
+ * When called with a passed-in buffer, write the buffer.
+ * When called with a valid session handle, write the session's buffer.
+ * There's no way the session's buffer should be NULL if buffer format
+ * succeeded, but Coverity is unconvinced; regardless, a test for NULL
+ * isn't a bad idea given future code changes in the underlying code.
*
* Fallback to a generic message.
*/
- if (session == NULL &&
+ if (errbuf != NULL &&
__wt_snprintf(errbuf, errlen, "error return: %d", error) == 0)
return (errbuf);
if (session != NULL && __wt_buf_fmt(
- session, &session->err, "error return: %d", error) == 0)
+ session, &session->err, "error return: %d", error) == 0 &&
+ session->err.data != NULL)
return (session->err.data);
/* Defeated. */
diff --git a/src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c b/src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c
index cb7a05e05d9..b00fa7bd8a5 100644
--- a/src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c
+++ b/src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c
@@ -63,7 +63,7 @@ __im_handle_remove(WT_SESSION_IMPL *session,
if (im_fh->ref != 0) {
__wt_err(session, EBUSY, "%s: file-remove", im_fh->iface.name);
if (!force)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
}
bucket = im_fh->name_hash % WT_HASH_ARRAY_SIZE;
@@ -272,7 +272,7 @@ __im_fs_size(WT_FILE_SYSTEM *file_system,
/* Search for the handle, then get its size. */
if ((im_fh = __im_handle_search(file_system, name)) == NULL)
- ret = ENOENT;
+ ret = __wt_set_return(session, ENOENT);
else
*sizep = (wt_off_t)im_fh->buf.size;
@@ -349,6 +349,7 @@ __im_file_read(WT_FILE_HANDLE *file_handle,
__wt_spin_unlock(session, &im_fs->lock);
if (ret == 0)
return (0);
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
WT_RET_MSG(session, WT_ERROR,
"%s: handle-read: failed to read %" WT_SIZET_FMT " bytes at "
"offset %" WT_SIZET_FMT,
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dir.c b/src/third_party/wiredtiger/src/os_posix/os_dir.c
index 2c2cb084a91..54614d67649 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_dir.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_dir.c
@@ -41,11 +41,12 @@ __directory_list_worker(WT_FILE_SYSTEM *file_system,
* but various static analysis programs remain unconvinced, check both.
*/
WT_SYSCALL_RETRY(((dirp = opendir(directory)) == NULL ? -1 : 0), ret);
- if (dirp == NULL && ret == 0)
- ret = EINVAL;
- if (ret != 0)
+ if (dirp == NULL || ret != 0) {
+ if (ret == 0)
+ ret = EINVAL;
WT_RET_MSG(session, ret,
"%s: directory-list: opendir", directory);
+ }
for (count = 0; (dp = readdir(dirp)) != NULL;) {
/*
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
index 6f6e6a6bdc2..cde6adfe780 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
@@ -33,9 +33,9 @@ __posix_std_fallocate(
return (ret);
#else
WT_UNUSED(file_handle);
- WT_UNUSED(wt_session);
WT_UNUSED(offset);
- return (ENOTSUP);
+
+ return (__wt_set_return((WT_SESSION_IMPL *)wt_session, ENOTSUP));
#endif
}
@@ -66,9 +66,9 @@ __posix_sys_fallocate(
return (ret);
#else
WT_UNUSED(file_handle);
- WT_UNUSED(wt_session);
WT_UNUSED(offset);
- return (ENOTSUP);
+
+ return (__wt_set_return((WT_SESSION_IMPL *)wt_session, ENOTSUP));
#endif
}
@@ -92,9 +92,9 @@ __posix_posix_fallocate(
return (ret);
#else
WT_UNUSED(file_handle);
- WT_UNUSED(wt_session);
WT_UNUSED(offset);
- return (ENOTSUP);
+
+ return (__wt_set_return((WT_SESSION_IMPL *)wt_session, ENOTSUP));
#endif
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c
index 0af67ad38c5..37f328a113a 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c
@@ -338,7 +338,7 @@ __posix_file_advise(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
*/
if (ret == EINVAL) {
file_handle->fh_advise = NULL;
- return (ENOTSUP);
+ return (__wt_set_return(session, ENOTSUP));
}
WT_RET_MSG(session, ret,
@@ -439,11 +439,15 @@ __posix_file_read(WT_FILE_HANDLE *file_handle,
/* Break reads larger than 1GB into 1GB chunks. */
for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
chunk = WT_MIN(len, WT_GIGABYTE);
- if ((nr = pread(pfh->fd, addr, chunk, offset)) <= 0)
- WT_RET_MSG(session, nr == 0 ? WT_ERROR : __wt_errno(),
+ if ((nr = pread(pfh->fd, addr, chunk, offset)) <= 0) {
+ if (nr == 0)
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
+ WT_RET_MSG(session,
+ nr == 0 ? WT_ERROR : __wt_errno(),
"%s: handle-read: pread: failed to read %"
WT_SIZET_FMT " bytes at offset %" PRIuMAX,
file_handle->name, chunk, (uintmax_t)offset);
+ }
}
return (0);
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_map.c b/src/third_party/wiredtiger/src/os_posix/os_map.c
index b9ec284e124..f04a966c468 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_map.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_map.c
@@ -33,7 +33,7 @@ __wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session,
* mmap(2) of files with direct I/O to the same files.
*/
if (pfh->direct_io)
- return (ENOTSUP);
+ return (__wt_set_return(session, ENOTSUP));
/*
* There's no locking here to prevent the underlying file from changing
diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c
index 7dd803e1b00..3c972b2991d 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_thread.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c
@@ -40,14 +40,15 @@ __wt_thread_create(WT_SESSION_IMPL *session,
* Wait for a thread of control to exit.
*/
int
-__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
+__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t *tid)
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
WT_DECL_RET;
/* Only attempt to join if thread was created successfully */
- if (!tid.created)
+ if (!tid->created)
return (0);
+ tid->created = false;
/*
* Joining a thread isn't a memory barrier, but WiredTiger commonly
@@ -56,11 +57,9 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
*/
WT_FULL_BARRIER();
- WT_SYSCALL(pthread_join(tid.id, NULL), ret);
- if (ret == 0) {
- tid.created = false;
+ WT_SYSCALL(pthread_join(tid->id, NULL), ret);
+ if (ret == 0)
return (0);
- }
WT_RET_MSG(session, ret, "pthread_join");
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c
index 7db4522a3e6..22b2b2effad 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_time.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_time.c
@@ -47,3 +47,17 @@ __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
NO TIME-OF-DAY IMPLEMENTATION: see src/os_posix/os_time.c
#endif
}
+
+/*
+ * __wt_localtime --
+ * Return the current local broken-down time.
+ */
+int
+__wt_localtime(WT_SESSION_IMPL *session, const time_t *timep, struct tm *result)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+{
+ if (localtime_r(timep, result) != NULL)
+ return (0);
+
+ WT_RET_MSG(session, __wt_errno(), "localtime_r");
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_fs.c b/src/third_party/wiredtiger/src/os_win/os_fs.c
index 66f4de87299..2eac258fc70 100644
--- a/src/third_party/wiredtiger/src/os_win/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_win/os_fs.c
@@ -279,6 +279,8 @@ __win_file_read(WT_FILE_HANDLE *file_handle,
win_fh->filehandle, addr, chunk, &nr, &overlapped)) {
windows_error = __wt_getlasterror();
ret = __wt_map_windows_error(windows_error);
+ if (ret == WT_ERROR)
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
__wt_err(session, ret,
"%s: handle-read: ReadFile: failed to read %lu "
"bytes at offset %" PRIuMAX ": %s",
@@ -393,7 +395,7 @@ __win_file_set_end(
if (SetEndOfFile(win_fh->filehandle_secondary) == FALSE) {
if (GetLastError() == ERROR_USER_MAPPED_FILE)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
windows_error = __wt_getlasterror();
ret = __wt_map_windows_error(windows_error);
__wt_err(session, ret,
diff --git a/src/third_party/wiredtiger/src/os_win/os_thread.c b/src/third_party/wiredtiger/src/os_win/os_thread.c
index f27ea84181e..c553bbffef6 100644
--- a/src/third_party/wiredtiger/src/os_win/os_thread.c
+++ b/src/third_party/wiredtiger/src/os_win/os_thread.c
@@ -38,13 +38,14 @@ __wt_thread_create(WT_SESSION_IMPL *session,
* Wait for a thread of control to exit.
*/
int
-__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
+__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t *tid)
{
DWORD windows_error;
/* Only attempt to join if thread was created successfully */
- if (!tid.created)
+ if (!tid->created)
return (0);
+ tid->created = false;
/*
* Joining a thread isn't a memory barrier, but WiredTiger commonly
@@ -54,7 +55,7 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
WT_FULL_BARRIER();
if ((windows_error =
- WaitForSingleObject(tid.id, INFINITE)) != WAIT_OBJECT_0) {
+ WaitForSingleObject(tid->id, INFINITE)) != WAIT_OBJECT_0) {
if (windows_error == WAIT_FAILED)
windows_error = __wt_getlasterror();
__wt_errx(session, "thread join: WaitForSingleObject: %s",
@@ -64,14 +65,13 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
return (WT_PANIC);
}
- if (CloseHandle(tid.id) == 0) {
+ if (CloseHandle(tid->id) == 0) {
windows_error = __wt_getlasterror();
__wt_errx(session, "thread join: CloseHandle: %s",
__wt_formatmessage(session, windows_error));
return (__wt_map_windows_error(windows_error));
}
- tid.created = false;
return (0);
}
diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c
index 284d2a63931..f6cda08b03d 100644
--- a/src/third_party/wiredtiger/src/os_win/os_time.c
+++ b/src/third_party/wiredtiger/src/os_win/os_time.c
@@ -29,19 +29,16 @@ __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
}
/*
- * localtime_r --
- * Return the current local time.
+ * __wt_localtime --
+ * Return the current local broken-down time.
*/
-struct tm *
-localtime_r(const time_t *timer, struct tm *result)
+int
+__wt_localtime(WT_SESSION_IMPL *session, const time_t *timep, struct tm *result)
{
errno_t err;
- err = localtime_s(result, timer);
- if (err != 0) {
- __wt_err(NULL, err, "localtime_s");
- return (NULL);
- }
+ if ((err = localtime_s(result, timep)) == 0)
+ return (0);
- return (result);
+ WT_RET_MSG(session, err, "localtime_s");
}
diff --git a/src/third_party/wiredtiger/src/packing/pack_stream.c b/src/third_party/wiredtiger/src/packing/pack_stream.c
index 80dfe906bdf..fababff7dea 100644
--- a/src/third_party/wiredtiger/src/packing/pack_stream.c
+++ b/src/third_party/wiredtiger/src/packing/pack_stream.c
@@ -95,7 +95,7 @@ wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
WT_RET(__pack_write(
session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, pv.type);
}
return (0);
@@ -128,7 +128,7 @@ wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i)
WT_RET(__pack_write(
session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, pv.type);
}
return (0);
@@ -158,7 +158,7 @@ wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s)
WT_RET(__pack_write(
session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, pv.type);
}
return (0);
@@ -194,7 +194,7 @@ wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u)
WT_RET(__pack_write(
session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, pv.type);
}
return (0);
@@ -225,7 +225,7 @@ wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
item->data = pv.u.item.data;
item->size = pv.u.item.size;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, pv.type);
}
return (0);
@@ -258,7 +258,7 @@ wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip)
&pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
*ip = pv.u.i;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, pv.type);
}
return (0);
}
@@ -287,7 +287,7 @@ wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp)
&pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
*sp = pv.u.s;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, pv.type);
}
return (0);
}
@@ -322,7 +322,7 @@ wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
&pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
*up = pv.u.u;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, pv.type);
}
return (0);
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index eb3b0038525..2b70db8443f 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -47,10 +47,11 @@ typedef struct {
/* Track the page's min/maximum transactions. */
uint64_t max_txn;
- uint64_t min_txn_unstable;
WT_DECL_TIMESTAMP(max_timestamp)
- WT_DECL_TIMESTAMP(max_onpage_timestamp)
- WT_DECL_TIMESTAMP(min_saved_timestamp)
+
+ /* Lookaside boundary tracking. */
+ uint64_t unstable_txn;
+ WT_DECL_TIMESTAMP(unstable_timestamp)
u_int updates_seen; /* Count of updates seen. */
u_int updates_unstable; /* Count of updates not visible_all. */
@@ -321,7 +322,6 @@ static int __rec_init(WT_SESSION_IMPL *,
WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
static int __rec_las_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_las_wrapup_err(WT_SESSION_IMPL *, WT_RECONCILE *);
-static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t);
static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_row_leaf(WT_SESSION_IMPL *,
@@ -419,17 +419,31 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
if (LF_ISSET(WT_REC_EVICT) &&
!__wt_page_can_evict(session, ref, NULL)) {
WT_PAGE_UNLOCK(session, page);
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
+ }
+
+ /* Initialize the reconciliation structure for each new run. */
+ if ((ret = __rec_init(
+ session, ref, flags, salvage, &session->reconcile)) != 0) {
+ WT_PAGE_UNLOCK(session, page);
+ return (ret);
}
+ r = session->reconcile;
oldest_id = __wt_txn_oldest_id(session);
+
+ /*
+ * During eviction, save the transaction state that causes history to
+ * be pinned, regardless of whether reconciliation succeeds or fails.
+ * There is usually no point retrying eviction until this state
+ * changes.
+ */
if (LF_ISSET(WT_REC_EVICT)) {
mod->last_eviction_id = oldest_id;
#ifdef HAVE_TIMESTAMPS
- WT_WITH_TIMESTAMP_READLOCK(session,
- &S2C(session)->txn_global.rwlock,
- __wt_timestamp_set(&mod->last_eviction_timestamp,
- &S2C(session)->txn_global.pinned_timestamp));
+ if (S2C(session)->txn_global.has_pinned_timestamp)
+ __wt_txn_pinned_timestamp(
+ session, &mod->last_eviction_timestamp);
#endif
mod->last_evict_pass_gen = S2C(session)->cache->evict_pass_gen;
}
@@ -444,14 +458,6 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
mod->last_oldest_id = oldest_id;
#endif
- /* Initialize the reconciliation structure for each new run. */
- if ((ret = __rec_init(
- session, ref, flags, salvage, &session->reconcile)) != 0) {
- WT_PAGE_UNLOCK(session, page);
- return (ret);
- }
- r = session->reconcile;
-
/* Reconcile the page. */
switch (page->type) {
case WT_PAGE_COL_FIX:
@@ -474,7 +480,9 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
case WT_PAGE_ROW_LEAF:
ret = __rec_row_leaf(session, r, page, salvage);
break;
- WT_ILLEGAL_VALUE_SET(session);
+ default:
+ ret = __wt_illegal_value(session, page->type);
+ break;
}
/*
@@ -495,6 +503,17 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
else
WT_TRET(__rec_write_wrapup_err(session, r, page));
+#ifdef HAVE_TIMESTAMPS
+ /*
+ * If reconciliation completes successfully, save the stable timestamp.
+ */
+ if (ret == 0 && S2C(session)->txn_global.has_stable_timestamp)
+ WT_WITH_TIMESTAMP_READLOCK(session,
+ &S2C(session)->txn_global.rwlock,
+ __wt_timestamp_set(&mod->last_stable_timestamp,
+ &S2C(session)->txn_global.stable_timestamp));
+#endif
+
/* Release the reconciliation lock. */
WT_PAGE_UNLOCK(session, page);
@@ -576,8 +595,6 @@ static int
__rec_write_check_complete(
WT_SESSION_IMPL *session, WT_RECONCILE *r, int tret, bool *lookaside_retryp)
{
- WT_UNUSED(session);
-
/*
* Tests in this function are lookaside tests and tests to decide if
* rewriting a page in memory is worth doing. In-memory configurations
@@ -634,7 +651,7 @@ __rec_write_check_complete(
return (0);
*lookaside_retryp = true;
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
}
/*
@@ -681,7 +698,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
} else {
/*
* Track the page's maximum transaction ID (used to decide if
- * we're likely to be able to evict this page in the future).
+ * we can evict a clean page and discard its history).
*/
mod->rec_max_txn = r->max_txn;
__wt_timestamp_set(&mod->rec_max_timestamp, &r->max_timestamp);
@@ -750,7 +767,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
return (0);
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, mod->rec_result);
}
__wt_verbose(session, WT_VERB_SPLIT,
@@ -873,7 +890,6 @@ __rec_init(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_RECONCILE *r;
WT_TXN_GLOBAL *txn_global;
- bool las_skew_oldest;
btree = S2BT(session);
page = ref->page;
@@ -928,27 +944,24 @@ __rec_init(WT_SESSION_IMPL *session,
* We usually prefer to skew to newer versions, the logic being that by
* the time the next checkpoint runs, it is likely that all the updates
* we choose will be stable. However, if checkpointing with a
- * timestamp (indicated by a stable_timestamp being set), and the
- * timestamp hasn't changed since the last time this page was
- * reconciled, skew oldest instead. If a checkpoint is already running,
- * the oldest version is more likely to be what it needs.
+ * timestamp (indicated by a stable_timestamp being set), and there is
+ * a checkpoint already running, or this page was read with lookaside
+ * history, or the stable timestamp hasn't changed since last time this
+ * page was successfully, skew oldest instead.
*/
- if (__wt_btree_immediately_durable(session))
- las_skew_oldest = false;
- else {
- WT_ORDERED_READ(las_skew_oldest,
- txn_global->has_stable_timestamp);
- if (las_skew_oldest) {
- las_skew_oldest = (ref->page_las != NULL &&
- !__wt_txn_visible_all(session, WT_TXN_NONE,
- WT_TIMESTAMP_NULL(
- &ref->page_las->min_timestamp))) ||
- btree->checkpoint_gen !=
- __wt_gen(session, WT_GEN_CHECKPOINT);
- }
- }
- r->las_skew_newest = LF_ISSET(WT_REC_LOOKASIDE) &&
- LF_ISSET(WT_REC_VISIBLE_ALL) && !las_skew_oldest;
+ r->las_skew_newest =
+ LF_ISSET(WT_REC_LOOKASIDE) && LF_ISSET(WT_REC_VISIBLE_ALL);
+#ifdef HAVE_TIMESTAMPS
+ if (r->las_skew_newest &&
+ !__wt_btree_immediately_durable(session) &&
+ txn_global->has_stable_timestamp &&
+ ((btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT) &&
+ txn_global->stable_is_pinned) ||
+ FLD_ISSET(page->modify->restore_state, WT_PAGE_RS_LOOKASIDE) ||
+ __wt_timestamp_cmp(&page->modify->last_stable_timestamp,
+ &txn_global->stable_timestamp) == 0))
+ r->las_skew_newest = false;
+#endif
/*
* When operating on the lookaside table, we should never try
@@ -979,10 +992,21 @@ __rec_init(WT_SESSION_IMPL *session,
/* Track the page's min/maximum transaction */
r->max_txn = WT_TXN_NONE;
- r->min_txn_unstable = WT_TXN_ABORTED;
__wt_timestamp_set_zero(&r->max_timestamp);
- __wt_timestamp_set_zero(&r->max_onpage_timestamp);
- __wt_timestamp_set_inf(&r->min_saved_timestamp);
+
+ /*
+ * Track the first unstable transaction (when skewing newest this is
+ * the newest update, otherwise the newest update not on the page).
+ * This is the boundary between the on-page information and the history
+ * stored in the lookaside table.
+ */
+ if (r->las_skew_newest) {
+ r->unstable_txn = WT_TXN_NONE;
+ __wt_timestamp_set_zero(&r->unstable_timestamp);
+ } else {
+ r->unstable_txn = WT_TXN_ABORTED;
+ __wt_timestamp_set_inf(&r->unstable_timestamp);
+ }
/* Track if updates were used and/or uncommitted. */
r->updates_seen = r->updates_unstable = 0;
@@ -1264,8 +1288,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_UPDATE *first_txn_upd, *first_upd, *upd;
wt_timestamp_t *timestampp;
size_t upd_memsize;
- uint64_t max_txn, min_txn_unstable, txnid;
- bool all_visible, skipped_birthmark, uncommitted;
+ uint64_t max_txn, txnid;
+ bool all_visible, prepared, skipped_birthmark, uncommitted;
#ifdef HAVE_TIMESTAMPS
WT_UPDATE *first_ts_upd;
@@ -1280,8 +1304,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
first_txn_upd = NULL;
upd_memsize = 0;
max_txn = WT_TXN_NONE;
- min_txn_unstable = WT_TXN_ABORTED;
- skipped_birthmark = uncommitted = false;
+ prepared = skipped_birthmark = uncommitted = false;
/*
* If called with a WT_INSERT item, use its WT_UPDATE list (which must
@@ -1319,14 +1342,33 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* examining its updates. As prepared transaction id's are
* globally visible, need to check the update state as well.
*/
- if (F_ISSET(r, WT_REC_EVICT) &&
- (upd->prepare_state == WT_PREPARE_LOCKED ||
- upd->prepare_state == WT_PREPARE_INPROGRESS ||
- (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
- WT_TXNID_LE(r->last_running, txnid) :
- !__txn_visible_id(session, txnid)))) {
- uncommitted = r->update_uncommitted = true;
- continue;
+ if (F_ISSET(r, WT_REC_EVICT)) {
+ if (upd->prepare_state == WT_PREPARE_LOCKED ||
+ upd->prepare_state == WT_PREPARE_INPROGRESS)
+ prepared = true;
+
+ if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ WT_TXNID_LE(r->last_running, txnid) :
+ !__txn_visible_id(session, txnid))
+ uncommitted = r->update_uncommitted = true;
+
+ /*
+ * TODO:
+ * The following portion of code under #ifdef is there
+ * to temporarily disable lookaside eviction of the
+ * prepared updates. Once we have all the pieces put
+ * together to enable the feature, remove this temporary
+ * code.
+ */
+#ifndef HAVE_LONG_RUNNING_PREPARE
+ if (prepared) {
+ prepared = false;
+ uncommitted = r->update_uncommitted = true;
+ }
+#endif
+
+ if (prepared || uncommitted)
+ continue;
}
#ifdef HAVE_TIMESTAMPS
@@ -1372,27 +1414,20 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* discard an uncommitted update.
*/
if (F_ISSET(r, WT_REC_UPDATE_RESTORE) &&
- *updp != NULL && uncommitted) {
+ *updp != NULL && (uncommitted || prepared)) {
r->leave_dirty = true;
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
}
if (upd->type == WT_UPDATE_BIRTHMARK)
skipped_birthmark = true;
- /*
- * Track minimum transaction ID for unstable updates.
- */
- if (txnid != WT_TXN_NONE &&
- WT_TXNID_LT(txnid, min_txn_unstable))
- min_txn_unstable = txnid;
-
continue;
}
/*
* Lookaside without stable timestamp was taken care of above
- * (set to the first uncommitted transaction. Lookaside with
+ * (set to the first uncommitted transaction). Lookaside with
* stable timestamp always takes the first stable update.
*/
if (*updp == NULL)
@@ -1434,23 +1469,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (WT_TXNID_LT(r->max_txn, max_txn))
r->max_txn = max_txn;
- /*
- * Track the oldest unstable transaction in the page. It is used to
- * decide whether to or not to read the history during a page read.
- */
- if (WT_TXNID_LT(min_txn_unstable, r->min_txn_unstable))
- r->min_txn_unstable = min_txn_unstable;
-
#ifdef HAVE_TIMESTAMPS
/* Update the maximum timestamp. */
if (first_ts_upd != NULL &&
__wt_timestamp_cmp(&r->max_timestamp, &first_ts_upd->timestamp) < 0)
__wt_timestamp_set(&r->max_timestamp, &first_ts_upd->timestamp);
-
- /* Update the maximum on-page timestamp. */
- if (upd != NULL &&
- __wt_timestamp_cmp(&upd->timestamp, &r->max_onpage_timestamp) > 0)
- __wt_timestamp_set(&r->max_onpage_timestamp, &upd->timestamp);
#endif
/*
@@ -1476,7 +1499,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
#else
timestampp = NULL;
#endif
- all_visible = upd == first_txn_upd && !uncommitted &&
+ all_visible = upd == first_txn_upd && !(uncommitted || prepared) &&
(F_ISSET(r, WT_REC_VISIBLE_ALL) ?
__wt_txn_visible_all(session, max_txn, timestampp) :
__wt_txn_visible(session, max_txn, timestampp));
@@ -1513,9 +1536,9 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* the WT_REC_LOOKASIDE flag.
*/
if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
WT_ASSERT(session, r->max_txn != WT_TXN_NONE);
@@ -1527,24 +1550,38 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (upd_savedp != NULL)
*upd_savedp = true;
+ /*
+ * Track the first off-page update when saving history in the lookaside
+ * table. When skewing newest, we want the first (non-aborted) update
+ * after the one stored on the page. Otherwise, we want the update
+ * before the on-page update.
+ */
+ if (F_ISSET(r, WT_REC_LOOKASIDE) && r->las_skew_newest) {
+ if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid))
+ r->unstable_txn = first_upd->txnid;
#ifdef HAVE_TIMESTAMPS
- /* Track the oldest saved timestamp for lookaside. */
- if (F_ISSET(r, WT_REC_LOOKASIDE)) {
- /* If no updates had timestamps, we're done. */
- if (first_ts_upd == NULL)
- __wt_timestamp_set_zero(&r->min_saved_timestamp);
+ if (first_ts_upd != NULL &&
+ __wt_timestamp_cmp(&r->unstable_timestamp,
+ &first_ts_upd->timestamp) < 0)
+ __wt_timestamp_set(&r->unstable_timestamp,
+ &first_ts_upd->timestamp);
+#endif
+ } else if (F_ISSET(r, WT_REC_LOOKASIDE)) {
for (upd = first_upd; upd != *updp; upd = upd->next) {
- if (upd->txnid != WT_TXN_ABORTED &&
- __wt_timestamp_cmp(&upd->timestamp,
- &r->min_saved_timestamp) < 0)
- __wt_timestamp_set(&r->min_saved_timestamp,
- &upd->timestamp);
+ if (upd->txnid == WT_TXN_ABORTED)
+ continue;
- WT_ASSERT(session, upd->txnid == WT_TXN_ABORTED ||
- WT_TXNID_LE(upd->txnid, r->max_txn));
+ if (upd->txnid != WT_TXN_NONE &&
+ WT_TXNID_LT(upd->txnid, r->unstable_txn))
+ r->unstable_txn = upd->txnid;
+#ifdef HAVE_TIMESTAMPS
+ if (__wt_timestamp_cmp(&upd->timestamp,
+ &r->unstable_timestamp) < 0)
+ __wt_timestamp_set(&r->unstable_timestamp,
+ &upd->timestamp);
+#endif
}
}
-#endif
check_original_value:
/*
@@ -1686,7 +1723,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
* the original page and which should see the deleted page).
*/
if (F_ISSET(r, WT_REC_EVICT))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* If there are deleted child pages we can't discard immediately, keep
@@ -1771,7 +1808,7 @@ __rec_child_modify(WT_SESSION_IMPL *session,
*/
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
if (F_ISSET(r, WT_REC_EVICT))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* If called during checkpoint, the child is being
@@ -1799,7 +1836,7 @@ __rec_child_modify(WT_SESSION_IMPL *session,
if (F_ISSET(r, WT_REC_EVICT) &&
__wt_page_las_active(session, ref)) {
WT_ASSERT(session, false);
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
}
/*
@@ -1824,7 +1861,7 @@ __rec_child_modify(WT_SESSION_IMPL *session,
*/
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
if (F_ISSET(r, WT_REC_EVICT))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* If called during checkpoint, acquire a hazard pointer
@@ -1859,7 +1896,7 @@ __rec_child_modify(WT_SESSION_IMPL *session,
*/
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
if (F_ISSET(r, WT_REC_EVICT))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
goto done;
case WT_REF_SPLIT:
@@ -1876,9 +1913,9 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* for checkpoint.
*/
WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT);
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, r->tested_ref_state);
}
WT_STAT_CONN_INCR(session, child_modify_blocked_page);
}
@@ -2185,12 +2222,12 @@ __rec_need_split(WT_RECONCILE *r, size_t len)
}
/*
- * __rec_split_page_size_from_pct --
+ * __wt_split_page_size --
* Given a split percentage, calculate split page size in bytes.
*/
-static uint32_t
-__rec_split_page_size_from_pct(
- int split_pct, uint32_t maxpagesize, uint32_t allocsize) {
+uint32_t
+__wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize)
+{
uintmax_t a;
uint32_t split_size;
@@ -2218,32 +2255,6 @@ __rec_split_page_size_from_pct(
}
/*
- * __wt_split_page_size --
- * Split page size calculation: we don't want to repeatedly split every
- * time a new entry is added, so we split to a smaller-than-maximum page
- * size.
- */
-uint32_t
-__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
-{
- return (__rec_split_page_size_from_pct(
- btree->split_pct, maxpagesize, btree->allocsize));
-}
-
-/*
- * __rec_min_split_page_size --
- * Minimum split size boundary calculation: To track a boundary at the
- * minimum split size that we could have split at instead of splitting at
- * the split page size.
- */
-static uint32_t
-__rec_min_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
-{
- return (__rec_split_page_size_from_pct(
- WT_BTREE_MIN_SPLIT_PCT, maxpagesize, btree->allocsize));
-}
-
-/*
* __rec_split_chunk_init --
* Initialize a single chunk structure.
*/
@@ -2288,7 +2299,7 @@ __rec_split_chunk_init(
*/
static int
__rec_split_init(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint32_t max)
+ WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint64_t max)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -2324,7 +2335,7 @@ __rec_split_init(WT_SESSION_IMPL *session,
* records, in those cases we split the pages once they have crossed
* the maximum size for a page with raw compression.
*/
- r->page_size = r->page_size_orig = max;
+ r->page_size = r->page_size_orig = (uint32_t)max;
if (r->raw_compression)
r->max_raw_page_size = r->page_size =
(uint32_t)WT_MIN((uint64_t)r->page_size * 10,
@@ -2374,11 +2385,12 @@ __rec_split_init(WT_SESSION_IMPL *session,
r->space_avail =
r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
} else {
- r->split_size = __wt_split_page_size(btree, r->page_size);
+ r->split_size = __wt_split_page_size(
+ btree->split_pct, r->page_size, btree->allocsize);
r->space_avail =
r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
- r->min_split_size =
- __rec_min_split_page_size(btree, r->page_size);
+ r->min_split_size = __wt_split_page_size(
+ WT_BTREE_MIN_SPLIT_PCT, r->page_size, btree->allocsize);
r->min_space_avail =
r->min_split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
}
@@ -2387,13 +2399,17 @@ __rec_split_init(WT_SESSION_IMPL *session,
* Ensure the disk image buffer is large enough for the max object, as
* corrected by the underlying block manager.
*
- * Since we want to support split_size more than the page size (to allow
- * for adjustments based on the compression), this buffer should be the
- * greater of split_size and page_size.
+ * Since we want to support split_size values larger than the page size
+ * (to allow for adjustments based on the compression), this buffer
+ * should be the greater of split_size and page_size, then aligned to
+ * the next allocation size boundary. The latter shouldn't be an issue,
+ * but it's a possible scenario if, for example, the compression engine
+ * is expected to give us 5x compression and gives us nothing at all.
*/
corrected_page_size = r->page_size;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
- disk_img_buf_size = WT_MAX(corrected_page_size, r->split_size);
+ disk_img_buf_size = WT_ALIGN(
+ WT_MAX(corrected_page_size, r->split_size), btree->allocsize);
/* Initialize the first split chunk. */
WT_RET(
@@ -2901,7 +2917,7 @@ __rec_split_raw(WT_SESSION_IMPL *session,
}
r->raw_entries[slots] = entry;
continue;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, unpack->type);
}
/*
@@ -3429,16 +3445,15 @@ __rec_split_write_supd(WT_SESSION_IMPL *session,
done: if (F_ISSET(r, WT_REC_LOOKASIDE)) {
/* Track the oldest lookaside timestamp seen so far. */
- multi->page_las.las_skew_newest = r->las_skew_newest;
- multi->page_las.las_max_txn = r->max_txn;
- multi->page_las.las_min_txn = r->min_txn_unstable;
- WT_ASSERT(session, r->max_txn != WT_TXN_NONE);
- WT_ASSERT(session, r->min_txn_unstable != WT_TXN_NONE);
+ multi->page_las.skew_newest = r->las_skew_newest;
+ multi->page_las.max_txn = r->max_txn;
+ multi->page_las.unstable_txn = r->unstable_txn;
+ WT_ASSERT(session, r->unstable_txn != WT_TXN_NONE);
#ifdef HAVE_TIMESTAMPS
- __wt_timestamp_set(&multi->page_las.min_timestamp,
- &r->min_saved_timestamp);
- __wt_timestamp_set(&multi->page_las.onpage_timestamp,
- &r->max_onpage_timestamp);
+ __wt_timestamp_set(&multi->page_las.max_timestamp,
+ &r->max_timestamp);
+ __wt_timestamp_set(&multi->page_las.unstable_timestamp,
+ &r->unstable_timestamp);
#endif
}
@@ -3580,6 +3595,85 @@ __rec_split_write_reuse(WT_SESSION_IMPL *session,
}
/*
+ * __rec_compression_adjust --
+ * Adjust the pre-compression page size based on compression results.
+ */
+static inline void
+__rec_compression_adjust(WT_SESSION_IMPL *session,
+ uint32_t max, size_t compressed_size, bool last_block, uint64_t *adjustp)
+{
+ WT_BTREE *btree;
+ uint64_t adjust, current, new;
+ u_int ten_percent;
+
+ btree = S2BT(session);
+ ten_percent = max / 10;
+
+ /*
+ * Changing the pre-compression size updates a shared memory location
+ * and it's not uncommon to be pushing out large numbers of pages from
+ * the same file. If compression creates a page larger than the target
+ * size, decrease the pre-compression size. If compression creates a
+ * page smaller than the target size, increase the pre-compression size.
+ * Once we get under the target size, try and stay there to minimize
+ * shared memory updates, but don't go over the target size, that means
+ * we're writing bad page sizes.
+ * Writing a shared memory location without a lock and letting it
+ * race, minor trickiness so we only read and write the value once.
+ */
+ WT_ORDERED_READ(current, *adjustp);
+ WT_ASSERT(session, current >= max);
+
+ if (compressed_size > max) {
+ /*
+ * The compressed size is GT the page maximum.
+ * Check if the pre-compression size is larger than the maximum.
+ * If 10% of the page size larger than the maximum, decrease it
+ * by that amount. Else if it's not already at the page maximum,
+ * set it there.
+ *
+ * Note we're using 10% of the maximum page size as our test for
+ * when to adjust the pre-compression size as well as the amount
+ * by which we adjust it. Not updating the value when it's close
+ * to the page size keeps us from constantly updating a shared
+ * memory location, and 10% of the page size is an OK step value
+ * as well, so we use it in both cases.
+ */
+ adjust = current - max;
+ if (adjust > ten_percent)
+ new = current - ten_percent;
+ else if (adjust != 0)
+ new = max;
+ else
+ return;
+ } else {
+ /*
+ * The compressed size is LTE the page maximum.
+ *
+ * Don't increase the pre-compressed size on the last block, the
+ * last block might be tiny.
+ *
+ * If the compressed size is less than the page maximum by 10%,
+ * increase the pre-compression size by 10% of the page, or up
+ * to the maximum in-memory image size.
+ *
+ * Note we're using 10% of the maximum page size... see above.
+ */
+ if (last_block || compressed_size > max - ten_percent)
+ return;
+
+ adjust = current + ten_percent;
+ if (adjust < btree->maxmempage_image)
+ new = adjust;
+ else if (current != btree->maxmempage_image)
+ new = btree->maxmempage_image;
+ else
+ return;
+ }
+ *adjustp = new;
+}
+
+/*
* __rec_split_write --
* Write a disk block out for the split helper functions.
*/
@@ -3590,7 +3684,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_BTREE *btree;
WT_MULTI *multi;
WT_PAGE *page;
- size_t addr_size;
+ size_t addr_size, compressed_size;
uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
#ifdef HAVE_DIAGNOSTIC
bool verify_image;
@@ -3621,7 +3715,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
case WT_PAGE_ROW_INT:
multi->addr.type = WT_ADDR_INT;
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, page->type);
}
multi->size = WT_STORE_SIZE(chunk->image.size);
multi->checksum = 0;
@@ -3687,7 +3781,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* allocate a zero-length array.
*/
if (r->page->type != WT_PAGE_ROW_LEAF && chunk->entries == 0)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
if (F_ISSET(r, WT_REC_LOOKASIDE)) {
r->cache_write_lookaside = true;
@@ -3723,14 +3817,24 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
/* Write the disk image and get an address. */
WT_RET(__wt_bt_write(session,
compressed_image == NULL ? &chunk->image : compressed_image,
- addr, &addr_size, false, F_ISSET(r, WT_REC_CHECKPOINT),
- compressed_image != NULL));
+ addr, &addr_size, &compressed_size,
+ false, F_ISSET(r, WT_REC_CHECKPOINT), compressed_image != NULL));
#ifdef HAVE_DIAGNOSTIC
verify_image = false;
#endif
WT_RET(__wt_memdup(session, addr, addr_size, &multi->addr.addr));
multi->addr.size = (uint8_t)addr_size;
+ /* Adjust the pre-compression page size based on compression results. */
+ if (WT_PAGE_IS_INTERNAL(page) &&
+ compressed_size != 0 && btree->intlpage_compadjust)
+ __rec_compression_adjust(session, btree->maxintlpage,
+ compressed_size, last_block, &btree->maxintlpage_precomp);
+ if (!WT_PAGE_IS_INTERNAL(page) &&
+ compressed_size != 0 && btree->leafpage_compadjust)
+ __rec_compression_adjust(session, btree->maxleafpage,
+ compressed_size, last_block, &btree->maxleafpage_precomp);
+
copy_image:
#ifdef HAVE_DIAGNOSTIC
/*
@@ -3791,8 +3895,8 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
recno = btree->type == BTREE_ROW ? WT_RECNO_OOB : 1;
- return (__rec_split_init(
- session, r, cbulk->leaf, recno, btree->maxleafpage));
+ return (__rec_split_init(session,
+ r, cbulk->leaf, recno, btree->maxleafpage_precomp));
}
/*
@@ -4090,8 +4194,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
val = &r->v;
vpack = &_vpack;
- WT_RET(__rec_split_init(
- session, r, page, pageref->ref_recno, btree->maxintlpage));
+ WT_RET(__rec_split_init(session,
+ r, page, pageref->ref_recno, btree->maxintlpage_precomp));
/* For each entry in the in-memory page... */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
@@ -4135,7 +4239,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
case WT_PM_REC_REPLACE:
addr = &child->modify->mod_replace;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(
+ session, child->modify->rec_result);
}
break;
case WT_CHILD_ORIGINAL:
@@ -4143,11 +4248,10 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
break;
case WT_CHILD_PROXY:
/*
- * Deleted child where we write a proxy cell, not
- * yet supported for column-store.
+ * Deleted child where we write a proxy cell, not yet
+ * supported for column-store.
*/
- ret = __wt_illegal_value(session, NULL);
- goto err;
+ WT_ERR(__wt_illegal_value(session, state));
}
/*
@@ -4546,8 +4650,8 @@ __rec_col_var(WT_SESSION_IMPL *session,
vpack = &_vpack;
cbt = &r->update_modify_cbt;
- WT_RET(__rec_split_init(
- session, r, page, pageref->ref_recno, btree->maxleafpage));
+ WT_RET(__rec_split_init(session,
+ r, page, pageref->ref_recno, btree->maxleafpage_precomp));
WT_RET(__wt_scr_alloc(session, 0, &orig));
data = NULL;
@@ -4686,7 +4790,7 @@ record_loop: /*
case WT_UPDATE_TOMBSTONE:
deleted = true;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, upd->type);
}
} else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
/*
@@ -4931,7 +5035,7 @@ compare: /*
case WT_UPDATE_TOMBSTONE:
deleted = true;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, upd->type);
}
/*
@@ -5039,7 +5143,8 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
cell = NULL;
key_onpage_ovfl = false;
- WT_RET(__rec_split_init(session, r, page, 0, btree->maxintlpage));
+ WT_RET(__rec_split_init(
+ session, r, page, 0, btree->maxintlpage_precomp));
/*
* Ideally, we'd never store the 0th key on row-store internal pages
@@ -5149,7 +5254,8 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
addr = &child->modify->mod_replace;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(
+ session, child->modify->rec_result);
}
break;
case WT_CHILD_ORIGINAL:
@@ -5324,7 +5430,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
val = &r->v;
vpack = &_vpack;
- WT_RET(__rec_split_init(session, r, page, 0, btree->maxleafpage));
+ WT_RET(__rec_split_init(
+ session, r, page, 0, btree->maxleafpage_precomp));
/*
* Write any K/V pairs inserted into the page before the first from-disk
@@ -5534,7 +5641,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
/* Proceed with appended key/value pairs. */
goto leaf_insert;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, upd->type);
}
}
@@ -5744,7 +5851,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
break;
case WT_UPDATE_TOMBSTONE:
continue;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, upd->type);
}
/* Build key cell. */
@@ -5954,7 +6061,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
mod->mod_replace.size = 0;
__wt_free(session, mod->mod_disk_image);
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, mod->rec_result);
}
/* Reset the reconciliation state. */
@@ -6031,7 +6138,8 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
mod->mod_page_las = r->multi->page_las;
} else
WT_RET(__wt_bt_write(session, r->wrapup_checkpoint,
- NULL, NULL, true, F_ISSET(r, WT_REC_CHECKPOINT),
+ NULL, NULL, NULL,
+ true, F_ISSET(r, WT_REC_CHECKPOINT),
r->wrapup_checkpoint_compressed));
mod->rec_result = WT_PM_REC_REPLACE;
@@ -6147,7 +6255,7 @@ __rec_las_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i)
if (multi->supd != NULL) {
WT_ERR(__wt_las_insert_block(
- session, cursor, r->page, multi, key));
+ cursor, S2BT(session), r->page, multi, key));
__wt_free(session, multi->supd);
multi->supd_entries = 0;
@@ -6453,8 +6561,8 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session,
/* Write the buffer. */
addr = buf;
- WT_ERR(__wt_bt_write(session, tmp,
- addr, &size, false, F_ISSET(r, WT_REC_CHECKPOINT), false));
+ WT_ERR(__wt_bt_write(session, tmp, addr, &size, NULL,
+ false, F_ISSET(r, WT_REC_CHECKPOINT), false));
/*
* Track the overflow record (unless it's a bulk load, which
diff --git a/src/third_party/wiredtiger/src/schema/schema_alter.c b/src/third_party/wiredtiger/src/schema/schema_alter.c
index e880cb415c8..aba708f0e0b 100644
--- a/src/third_party/wiredtiger/src/schema/schema_alter.c
+++ b/src/third_party/wiredtiger/src/schema/schema_alter.c
@@ -53,7 +53,7 @@ err: __wt_free(session, config);
* there was no metadata entry.
*/
if (ret == WT_NOTFOUND)
- ret = ENOENT;
+ ret = __wt_set_return(session, ENOENT);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_list.c b/src/third_party/wiredtiger/src/schema/schema_list.c
index 0f8806dd462..990f2636bf9 100644
--- a/src/third_party/wiredtiger/src/schema/schema_list.c
+++ b/src/third_party/wiredtiger/src/schema/schema_list.c
@@ -28,7 +28,7 @@ __wt_schema_get_table_uri(WT_SESSION_IMPL *session,
WT_ERR(__wt_session_get_dhandle(session, uri, NULL, NULL, flags));
table = (WT_TABLE *)session->dhandle;
if (!ok_incomplete && !table->cg_complete) {
- ret = EINVAL;
+ ret = __wt_set_return(session, EINVAL);
WT_TRET(__wt_session_release_dhandle(session));
WT_ERR_MSG(session, ret, "'%s' cannot be used "
"until all column groups are created",
diff --git a/src/third_party/wiredtiger/src/schema/schema_plan.c b/src/third_party/wiredtiger/src/schema/schema_plan.c
index abe76013e12..7fb2a391784 100644
--- a/src/third_party/wiredtiger/src/schema/schema_plan.c
+++ b/src/third_party/wiredtiger/src/schema/schema_plan.c
@@ -298,7 +298,7 @@ __find_column_format(WT_SESSION_IMPL *session, WT_TABLE *table,
if (k.len == colname->len &&
strncmp(colname->str, k.str, k.len) == 0) {
if (value_only && inkey)
- return (EINVAL);
+ return (__wt_set_return(session, EINVAL));
return (0);
}
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c
index ceec6db6cb5..a281ec3fe12 100644
--- a/src/third_party/wiredtiger/src/schema/schema_util.c
+++ b/src/third_party/wiredtiger/src/schema/schema_util.c
@@ -39,7 +39,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name)
}
for (i = 0; backup_list[i] != NULL; ++i) {
if (strcmp(backup_list[i], name) == 0) {
- ret = EBUSY;
+ ret = __wt_set_return(session, EBUSY);
break;
}
}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 0becbe0b536..857003c7abe 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -182,37 +182,6 @@ __wt_session_release_resources(WT_SESSION_IMPL *session)
}
/*
- * __session_clear_commit_queue --
- * We're about to clear the session and overwrite the txn structure.
- * Remove ourselves from the commit timestamp queue if we're on it.
- */
-static void
-__session_clear_commit_queue(WT_SESSION_IMPL *session)
-{
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
-
- txn = &session->txn;
- txn_global = &S2C(session)->txn_global;
-
- if (!txn->clear_ts_queue)
- return;
-
- __wt_writelock(session, &txn_global->commit_timestamp_rwlock);
- /*
- * Recheck after acquiring the lock.
- */
- if (txn->clear_ts_queue) {
- TAILQ_REMOVE(
- &txn_global->commit_timestamph, txn, commit_timestampq);
- --txn_global->commit_timestampq_len;
- txn->clear_ts_queue = false;
- }
- __wt_writeunlock(session, &txn_global->commit_timestamp_rwlock);
-
-}
-
-/*
* __session_clear --
* Clear a session structure.
*/
@@ -231,7 +200,7 @@ __session_clear(WT_SESSION_IMPL *session)
*
* For these reasons, be careful when clearing the session structure.
*/
- __session_clear_commit_queue(session);
+ __wt_txn_clear_timestamp_queues(session);
memset(session, 0, WT_SESSION_CLEAR_SIZE);
WT_INIT_LSN(&session->bg_sync_lsn);
@@ -259,7 +228,7 @@ __session_close_cursors(WT_SESSION_IMPL *session, WT_CURSOR_LIST *cursors)
*/
WT_TRET_NOTFOUND_OK(cursor->reopen(cursor, false));
else if (session->event_handler->handle_close != NULL &&
- !WT_STREQ(cursor->internal_uri, WT_LAS_URI))
+ strcmp(cursor->internal_uri, WT_LAS_URI) != 0)
/*
* Notify the user that we are closing the cursor
* handle via the registered close callback.
@@ -609,7 +578,7 @@ __session_open_cursor(WT_SESSION *wt_session,
SESSION_API_CALL(session, open_cursor, config, cfg);
statjoin = (to_dup != NULL && uri != NULL &&
- WT_STREQ(uri, "statistics:join"));
+ strcmp(uri, "statistics:join") == 0);
if (!statjoin) {
if ((to_dup == NULL && uri == NULL) ||
(to_dup != NULL && uri != NULL))
@@ -1490,7 +1459,7 @@ __session_truncate(WT_SESSION *wt_session,
* Verify the user only gave the URI prefix and not
* a specific target name after that.
*/
- if (!WT_STREQ(uri, "log:"))
+ if (strcmp(uri, "log:") != 0)
WT_ERR_MSG(session, EINVAL,
"the truncate method should not specify any"
"target after the log: URI prefix");
@@ -1794,6 +1763,24 @@ err: API_END_RET(session, ret);
}
/*
+ * __session_query_timestamp --
+ * WT_SESSION->query_timestamp method.
+ */
+static int
+__session_query_timestamp(
+ WT_SESSION *wt_session, char *hex_timestamp, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL_PREPARE_ALLOWED(session,
+ query_timestamp, config, cfg);
+ WT_TRET(__wt_txn_query_timestamp(session, hex_timestamp, cfg, false));
+err: API_END_RET(session, ret);
+}
+
+/*
* __session_transaction_pinned_range --
* WT_SESSION->transaction_pinned_range method.
*/
@@ -2105,6 +2092,7 @@ __open_session(WT_CONNECTION_IMPL *conn,
__session_prepare_transaction,
__session_rollback_transaction,
__session_timestamp_transaction,
+ __session_query_timestamp,
__session_checkpoint,
__session_snapshot,
__session_transaction_pinned_range,
@@ -2136,6 +2124,7 @@ __open_session(WT_CONNECTION_IMPL *conn,
__session_prepare_transaction_readonly,
__session_rollback_transaction,
__session_timestamp_transaction,
+ __session_query_timestamp,
__session_checkpoint_readonly,
__session_snapshot,
__session_transaction_pinned_range,
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index caa775686cf..30399cafd22 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -140,7 +140,7 @@ __wt_session_lock_dhandle(
if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
(!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
(btree != NULL && F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
++dhandle->excl_ref;
return (0);
}
@@ -167,7 +167,7 @@ __wt_session_lock_dhandle(
* give up.
*/
if (btree != NULL && F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* If the handle is open, get a read lock and recheck.
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c
index 0569d0545e6..33dc1a0a0d4 100644
--- a/src/third_party/wiredtiger/src/support/err.c
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -174,7 +174,7 @@ __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler)
*/
static int
__eventv(WT_SESSION_IMPL *session, bool msg_event, int error,
- const char *file_name, int line_number, const char *fmt, va_list ap)
+ const char *func, int line, const char *fmt, va_list ap)
WT_GCC_FUNC_ATTRIBUTE((cold))
{
struct timespec ts;
@@ -231,8 +231,8 @@ __eventv(WT_SESSION_IMPL *session, bool msg_event, int error,
WT_ERROR_APPEND(p, remain, ", %s", prefix);
WT_ERROR_APPEND(p, remain, ": ");
- if (file_name != NULL)
- WT_ERROR_APPEND(p, remain, "%s, %d: ", file_name, line_number);
+ if (func != NULL)
+ WT_ERROR_APPEND(p, remain, "%s, %d: ", func, line);
WT_ERROR_APPEND_AP(p, remain, fmt, ap);
@@ -309,13 +309,14 @@ err: if (fprintf(stderr,
}
/*
- * __wt_err --
+ * __wt_err_func --
* Report an error.
*/
void
-__wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...)
+__wt_err_func(WT_SESSION_IMPL *session,
+ int error, const char *func, int line, const char *fmt, ...)
WT_GCC_FUNC_ATTRIBUTE((cold))
- WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 5, 6)))
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
va_list ap;
@@ -325,18 +326,20 @@ __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...)
* an error value to return.
*/
va_start(ap, fmt);
- WT_IGNORE_RET(__eventv(session, false, error, NULL, 0, fmt, ap));
+ WT_IGNORE_RET(__eventv(session, false, error, func, line, fmt, ap));
va_end(ap);
}
/*
- * __wt_errx --
+ * __wt_errx_func --
* Report an error with no error code.
*/
void
-__wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...)
+__wt_errx_func(WT_SESSION_IMPL *session,
+ const char *func, int line, const char *fmt, ...)
WT_GCC_FUNC_ATTRIBUTE((cold))
- WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5)))
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
va_list ap;
@@ -345,11 +348,24 @@ __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...)
* an error value to return.
*/
va_start(ap, fmt);
- WT_IGNORE_RET(__eventv(session, false, 0, NULL, 0, fmt, ap));
+ WT_IGNORE_RET(__eventv(session, false, 0, func, line, fmt, ap));
va_end(ap);
}
/*
+ * __wt_set_return_func --
+ * Conditionally log the source of an error code and return the error.
+ */
+int
+__wt_set_return_func(
+ WT_SESSION_IMPL *session, const char* func, int line, int err)
+{
+ __wt_verbose(session,
+ WT_VERB_ERROR_RETURNS, "%s: %d Error: %d", func, line, err);
+ return (err);
+}
+
+/*
* __wt_ext_err_printf --
* Extension API call to print to the error stream.
*/
@@ -388,44 +404,28 @@ __wt_verbose_worker(WT_SESSION_IMPL *session, const char *fmt, ...)
}
/*
- * info_msg --
+ * __wt_msg --
* Informational message.
*/
-static int
-info_msg(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+int
+__wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((cold))
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
WT_EVENT_HANDLER *handler;
WT_SESSION *wt_session;
- /*
- * !!!
- * SECURITY:
- * Buffer placed at the end of the stack in case snprintf overflows.
- */
- char s[2048];
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
- WT_RET(__wt_vsnprintf(s, sizeof(s), fmt, ap));
+ WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, false);
wt_session = (WT_SESSION *)session;
handler = session->event_handler;
- return (handler->handle_message(handler, wt_session, s));
-}
-
-/*
- * __wt_msg --
- * Informational message.
- */
-int
-__wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
- WT_GCC_FUNC_ATTRIBUTE((cold))
- WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
-{
- WT_DECL_RET;
- va_list ap;
+ ret = handler->handle_message(handler, wt_session, buf->data);
- va_start(ap, fmt);
- ret = info_msg(session, fmt, ap);
- va_end(ap);
+ __wt_scr_free(session, &buf);
return (ret);
}
@@ -439,16 +439,24 @@ __wt_ext_msg_printf(
WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
{
+ WT_DECL_ITEM(buf);
WT_DECL_RET;
+ WT_EVENT_HANDLER *handler;
WT_SESSION_IMPL *session;
- va_list ap;
if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
- va_start(ap, fmt);
- ret = info_msg(session, fmt, ap);
- va_end(ap);
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+
+ WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, false);
+
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ ret = handler->handle_message(handler, wt_session, buf->data);
+
+ __wt_scr_free(session, &buf);
+
return (ret);
}
@@ -487,34 +495,6 @@ __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v)
}
/*
- * __wt_assert --
- * Assert and other unexpected failures, includes file/line information
- * for debugging.
- */
-void
-__wt_assert(WT_SESSION_IMPL *session,
- int error, const char *file_name, int line_number, const char *fmt, ...)
- WT_GCC_FUNC_ATTRIBUTE((cold))
- WT_GCC_FUNC_ATTRIBUTE((format (printf, 5, 6)))
-#ifdef HAVE_DIAGNOSTIC
- WT_GCC_FUNC_ATTRIBUTE((noreturn))
-#endif
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
-{
- va_list ap;
-
- va_start(ap, fmt);
- WT_IGNORE_RET(__eventv(
- session, false, error, file_name, line_number, fmt, ap));
- va_end(ap);
-
-#ifdef HAVE_DIAGNOSTIC
- __wt_abort(session); /* Drop core if testing. */
- /* NOTREACHED */
-#endif
-}
-
-/*
* __wt_panic --
* A standard error message when we panic.
*/
@@ -565,16 +545,13 @@ __wt_panic(WT_SESSION_IMPL *session)
*/
int
__wt_illegal_value_func(
- WT_SESSION_IMPL *session, const char *tag, const char *file, int line)
+ WT_SESSION_IMPL *session, uintmax_t v, const char *func, int line)
WT_GCC_FUNC_ATTRIBUTE((cold))
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
- __wt_errx(session, "%s%s%s: (%s, %d)",
- tag == NULL ? "" : tag,
- tag == NULL ? "" : ": ",
- "encountered an illegal file format or internal value",
- file, line);
-
+ __wt_err_func(session, EINVAL,
+ func, line, "%s: 0x%" PRIxMAX,
+ "encountered an illegal file format or internal value", v);
return (__wt_panic(session));
}
diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c
index d1271e0d427..f71f91a4daa 100644
--- a/src/third_party/wiredtiger/src/support/global.c
+++ b/src/third_party/wiredtiger/src/support/global.c
@@ -117,7 +117,7 @@ __wt_global_once(void)
return;
}
- __wt_checksum_init();
+ __wt_process.checksum = wiredtiger_crc32c_func();
__global_calibrate_ticks();
TAILQ_INIT(&__wt_process.connqh);
diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c
index 815c876f444..eb65c00741c 100644
--- a/src/third_party/wiredtiger/src/support/hazard.c
+++ b/src/third_party/wiredtiger/src/support/hazard.c
@@ -67,7 +67,7 @@ hazard_grow(WT_SESSION_IMPL *session)
int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
+ , const char *func, int line
#endif
)
{
@@ -146,7 +146,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
*/
hp->ref = ref;
#ifdef HAVE_DIAGNOSTIC
- hp->file = file;
+ hp->func = func;
hp->line = line;
#endif
/* Publish the hazard pointer before reading page's state. */
@@ -401,6 +401,29 @@ __wt_hazard_count(WT_SESSION_IMPL *session, WT_REF *ref)
#ifdef HAVE_DIAGNOSTIC
/*
+ * __wt_hazard_check_assert --
+ * Assert there's no hazard pointer to the page.
+ */
+bool
+__wt_hazard_check_assert(WT_SESSION_IMPL *session, void *ref, bool waitfor)
+{
+ WT_HAZARD *hp;
+ int i;
+
+ for (i = 0;;) {
+ if ((hp = __wt_hazard_check(session, ref)) == NULL)
+ return (true);
+ if (!waitfor || ++i > 100)
+ break;
+ __wt_sleep(0, 10000);
+ }
+ __wt_errx(session,
+ "hazard pointer reference to discarded object: (%p: %s, line %d)",
+ (void *)hp->ref, hp->func, hp->line);
+ return (false);
+}
+
+/*
* __hazard_dump --
* Display the list of hazard pointers.
*/
@@ -415,6 +438,6 @@ __hazard_dump(WT_SESSION_IMPL *session)
__wt_errx(session,
"session %p: hazard pointer %p: %s, line %d",
(void *)session,
- (void *)hp->ref, hp->file, hp->line);
+ (void *)hp->ref, hp->func, hp->line);
}
#endif
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c
index 55910ab835d..03d442403d5 100644
--- a/src/third_party/wiredtiger/src/support/huffman.c
+++ b/src/third_party/wiredtiger/src/support/huffman.c
@@ -352,7 +352,7 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
* duplicates.
*/
sym = symbol_frequency_array;
- qsort(sym, symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare);
+ __wt_qsort(sym, symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare);
for (i = 0; i < symcnt; ++i) {
if (i > 0 && sym[i].symbol == sym[i - 1].symbol)
WT_ERR_MSG(session, EINVAL,
@@ -397,7 +397,7 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
* The array must be sorted by frequency to be able to use a linear time
* construction algorithm.
*/
- qsort((void *)indexed_freqs,
+ __wt_qsort((void *)indexed_freqs,
symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare);
/* We need two node queues to build the tree. */
diff --git a/src/third_party/wiredtiger/src/support/mtx_rw.c b/src/third_party/wiredtiger/src/support/mtx_rw.c
index fd66a1a40bb..959405dee50 100644
--- a/src/third_party/wiredtiger/src/support/mtx_rw.c
+++ b/src/third_party/wiredtiger/src/support/mtx_rw.c
@@ -137,7 +137,7 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
/* This read lock can only be granted if there are no active writers. */
if (old.u.s.current != old.u.s.next)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* The replacement lock value is a result of adding an active reader.
@@ -146,7 +146,7 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
*/
new.u.v = old.u.v;
if (++new.u.s.readers_active == 0)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/* We rely on this atomic operation to provide a barrier. */
return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
@@ -331,7 +331,7 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
*/
old.u.v = l->u.v;
if (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0)
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* We've checked above that there is no writer active (since
diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c
index 2ead79a1c1c..a0f7de3179f 100644
--- a/src/third_party/wiredtiger/src/support/scratch.c
+++ b/src/third_party/wiredtiger/src/support/scratch.c
@@ -71,30 +71,9 @@ __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
- WT_DECL_RET;
- size_t len;
- va_list ap;
-
- for (;;) {
- va_start(ap, fmt);
- ret = __wt_vsnprintf_len_set(
- buf->mem, buf->memsize, &len, fmt, ap);
- va_end(ap);
- WT_RET(ret);
-
- /* Check if there was enough space. */
- if (len < buf->memsize) {
- buf->data = buf->mem;
- buf->size = len;
- return (0);
- }
+ WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, false);
- /*
- * If not, double the size of the buffer: we're dealing with
- * strings, and we don't expect these numbers to get huge.
- */
- WT_RET(__wt_buf_extend(session, buf, len + 1));
- }
+ return (0);
}
/*
@@ -106,11 +85,6 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
- WT_DECL_RET;
- size_t len, space;
- char *p;
- va_list ap;
-
/*
* If we're appending data to an existing buffer, any data field should
* point into the allocated memory. (It wouldn't be insane to copy any
@@ -119,27 +93,9 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
*/
WT_ASSERT(session, buf->data == NULL || WT_DATA_IN_ITEM(buf));
- for (;;) {
- va_start(ap, fmt);
- p = (char *)((uint8_t *)buf->mem + buf->size);
- WT_ASSERT(session, buf->memsize >= buf->size);
- space = buf->memsize - buf->size;
- ret = __wt_vsnprintf_len_set(p, space, &len, fmt, ap);
- va_end(ap);
- WT_RET(ret);
-
- /* Check if there was enough space. */
- if (len < space) {
- buf->size += len;
- return (0);
- }
+ WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, true);
- /*
- * If not, double the size of the buffer: we're dealing with
- * strings, and we don't expect these numbers to get huge.
- */
- WT_RET(__wt_buf_extend(session, buf, buf->size + len + 1));
- }
+ return (0);
}
/*
@@ -223,7 +179,7 @@ __wt_buf_set_printable_format(WT_SESSION_IMPL *session,
session, buf, "%s%" PRIu64, sep, pv.u.u));
sep = ",";
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, pv.type);
}
}
WT_ERR_NOTFOUND_OK(ret);
@@ -233,7 +189,7 @@ err: __wt_scr_free(session, &tmp);
return ((const char *)buf->data);
retp = "failed to create printable output";
- __wt_err(session, ret, "%s: %s", __func__, retp);
+ __wt_err(session, ret, "%s", retp);
return (retp);
}
@@ -286,7 +242,7 @@ __wt_buf_set_size(
int
__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
+ , const char *func, int line
#endif
)
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
@@ -374,15 +330,14 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
F_SET(*best, WT_ITEM_INUSE);
#ifdef HAVE_DIAGNOSTIC
- session->scratch_track[best - session->scratch].file = file;
+ session->scratch_track[best - session->scratch].func = func;
session->scratch_track[best - session->scratch].line = line;
#endif
*scratchp = *best;
return (0);
-err: WT_RET_MSG(session, ret,
- "session unable to allocate a scratch buffer");
+err: WT_RET_MSG(session, ret, "session unable to allocate a scratch buffer");
}
/*
@@ -400,16 +355,19 @@ __wt_scr_discard(WT_SESSION_IMPL *session)
if (*bufp == NULL)
continue;
if (F_ISSET(*bufp, WT_ITEM_INUSE))
+#ifdef HAVE_DIAGNOSTIC
__wt_errx(session,
"scratch buffer allocated and never discarded"
-#ifdef HAVE_DIAGNOSTIC
": %s: %d",
session->
- scratch_track[bufp - session->scratch].file,
+ scratch_track[bufp - session->scratch].func,
session->
scratch_track[bufp - session->scratch].line
-#endif
);
+#else
+ __wt_errx(session,
+ "scratch buffer allocated and never discarded");
+#endif
__wt_buf_free(session, *bufp);
__wt_free(session, *bufp);
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 749564c2464..61300dfeab9 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -70,11 +70,11 @@ static const char * const __stats_dsrc_desc[] = {
"cache: modified pages evicted",
"cache: overflow pages read into cache",
"cache: page split during eviction deepened the tree",
- "cache: page written requiring lookaside records",
+ "cache: page written requiring cache overflow records",
"cache: pages read into cache",
"cache: pages read into cache after truncate",
"cache: pages read into cache after truncate in prepare state",
- "cache: pages read into cache requiring lookaside entries",
+ "cache: pages read into cache requiring cache overflow entries",
"cache: pages requested from the cache",
"cache: pages seen by eviction walk",
"cache: pages written from cache",
@@ -111,6 +111,7 @@ static const char * const __stats_dsrc_desc[] = {
"compression: raw compression call succeeded",
"cursor: bulk-loaded cursor-insert calls",
"cursor: create calls",
+ "cursor: cursor operation restarted",
"cursor: cursor-insert key and value bytes inserted",
"cursor: cursor-remove key bytes removed",
"cursor: cursor-update value bytes updated",
@@ -123,7 +124,6 @@ static const char * const __stats_dsrc_desc[] = {
"cursor: remove calls",
"cursor: reserve calls",
"cursor: reset calls",
- "cursor: restarted searches",
"cursor: search calls",
"cursor: search near calls",
"cursor: truncate calls",
@@ -296,6 +296,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->compress_raw_ok = 0;
stats->cursor_insert_bulk = 0;
stats->cursor_create = 0;
+ stats->cursor_restart = 0;
stats->cursor_insert_bytes = 0;
stats->cursor_remove_bytes = 0;
stats->cursor_update_bytes = 0;
@@ -308,7 +309,6 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cursor_remove = 0;
stats->cursor_reserve = 0;
stats->cursor_reset = 0;
- stats->cursor_restart = 0;
stats->cursor_search = 0;
stats->cursor_search_near = 0;
stats->cursor_truncate = 0;
@@ -482,6 +482,7 @@ __wt_stat_dsrc_aggregate_single(
to->compress_raw_ok += from->compress_raw_ok;
to->cursor_insert_bulk += from->cursor_insert_bulk;
to->cursor_create += from->cursor_create;
+ to->cursor_restart += from->cursor_restart;
to->cursor_insert_bytes += from->cursor_insert_bytes;
to->cursor_remove_bytes += from->cursor_remove_bytes;
to->cursor_update_bytes += from->cursor_update_bytes;
@@ -494,7 +495,6 @@ __wt_stat_dsrc_aggregate_single(
to->cursor_remove += from->cursor_remove;
to->cursor_reserve += from->cursor_reserve;
to->cursor_reset += from->cursor_reset;
- to->cursor_restart += from->cursor_restart;
to->cursor_search += from->cursor_search;
to->cursor_search_near += from->cursor_search_near;
to->cursor_truncate += from->cursor_truncate;
@@ -701,6 +701,7 @@ __wt_stat_dsrc_aggregate(
to->compress_raw_ok += WT_STAT_READ(from, compress_raw_ok);
to->cursor_insert_bulk += WT_STAT_READ(from, cursor_insert_bulk);
to->cursor_create += WT_STAT_READ(from, cursor_create);
+ to->cursor_restart += WT_STAT_READ(from, cursor_restart);
to->cursor_insert_bytes += WT_STAT_READ(from, cursor_insert_bytes);
to->cursor_remove_bytes += WT_STAT_READ(from, cursor_remove_bytes);
to->cursor_update_bytes += WT_STAT_READ(from, cursor_update_bytes);
@@ -713,7 +714,6 @@ __wt_stat_dsrc_aggregate(
to->cursor_remove += WT_STAT_READ(from, cursor_remove);
to->cursor_reserve += WT_STAT_READ(from, cursor_reserve);
to->cursor_reset += WT_STAT_READ(from, cursor_reset);
- to->cursor_restart += WT_STAT_READ(from, cursor_restart);
to->cursor_search += WT_STAT_READ(from, cursor_search);
to->cursor_search_near += WT_STAT_READ(from, cursor_search_near);
to->cursor_truncate += WT_STAT_READ(from, cursor_truncate);
@@ -783,11 +783,17 @@ static const char * const __stats_connection_desc[] = {
"cache: application threads page write from cache to disk count",
"cache: application threads page write from cache to disk time (usecs)",
"cache: bytes belonging to page images in the cache",
- "cache: bytes belonging to the lookaside table in the cache",
+ "cache: bytes belonging to the cache overflow table in the cache",
"cache: bytes currently in the cache",
"cache: bytes not belonging to page images in the cache",
"cache: bytes read into cache",
"cache: bytes written from cache",
+ "cache: cache overflow cursor application thread wait time (usecs)",
+ "cache: cache overflow cursor internal thread wait time (usecs)",
+ "cache: cache overflow score",
+ "cache: cache overflow table entries",
+ "cache: cache overflow table insert calls",
+ "cache: cache overflow table remove calls",
"cache: checkpoint blocked page eviction",
"cache: eviction calls to get a page",
"cache: eviction calls to get a page found queue empty",
@@ -832,17 +838,14 @@ static const char * const __stats_connection_desc[] = {
"cache: internal pages evicted",
"cache: internal pages split during eviction",
"cache: leaf pages split during eviction",
- "cache: lookaside score",
- "cache: lookaside table entries",
- "cache: lookaside table insert calls",
- "cache: lookaside table remove calls",
"cache: maximum bytes configured",
"cache: maximum page size at eviction",
"cache: modified pages evicted",
"cache: modified pages evicted by application threads",
+ "cache: operations timed out waiting for space in cache",
"cache: overflow pages read into cache",
"cache: page split during eviction deepened the tree",
- "cache: page written requiring lookaside records",
+ "cache: page written requiring cache overflow records",
"cache: pages currently held in the cache",
"cache: pages evicted because they exceeded the in-memory maximum count",
"cache: pages evicted because they exceeded the in-memory maximum time (usecs)",
@@ -855,11 +858,11 @@ static const char * const __stats_connection_desc[] = {
"cache: pages read into cache",
"cache: pages read into cache after truncate",
"cache: pages read into cache after truncate in prepare state",
- "cache: pages read into cache requiring lookaside entries",
- "cache: pages read into cache requiring lookaside for checkpoint",
- "cache: pages read into cache skipping older lookaside entries",
- "cache: pages read into cache with skipped lookaside entries needed later",
- "cache: pages read into cache with skipped lookaside entries needed later by checkpoint",
+ "cache: pages read into cache requiring cache overflow entries",
+ "cache: pages read into cache requiring cache overflow for checkpoint",
+ "cache: pages read into cache skipping older cache overflow entries",
+ "cache: pages read into cache with skipped cache overflow entries needed later",
+ "cache: pages read into cache with skipped cache overflow entries needed later by checkpoint",
"cache: pages requested from the cache",
"cache: pages seen by eviction walk",
"cache: pages selected for eviction unable to be evicted",
@@ -889,11 +892,11 @@ static const char * const __stats_connection_desc[] = {
"cursor: cursor insert calls",
"cursor: cursor modify calls",
"cursor: cursor next calls",
+ "cursor: cursor operation restarted",
"cursor: cursor prev calls",
"cursor: cursor remove calls",
"cursor: cursor reserve calls",
"cursor: cursor reset calls",
- "cursor: cursor restarted searches",
"cursor: cursor search calls",
"cursor: cursor search near calls",
"cursor: cursor sweep buckets",
@@ -915,19 +918,19 @@ static const char * const __stats_connection_desc[] = {
"lock: checkpoint lock acquisitions",
"lock: checkpoint lock application thread wait time (usecs)",
"lock: checkpoint lock internal thread wait time (usecs)",
- "lock: commit timestamp queue lock application thread time waiting for the dhandle lock (usecs)",
- "lock: commit timestamp queue lock internal thread time waiting for the dhandle lock (usecs)",
+ "lock: commit timestamp queue lock application thread time waiting (usecs)",
+ "lock: commit timestamp queue lock internal thread time waiting (usecs)",
"lock: commit timestamp queue read lock acquisitions",
"lock: commit timestamp queue write lock acquisitions",
- "lock: dhandle lock application thread time waiting for the dhandle lock (usecs)",
- "lock: dhandle lock internal thread time waiting for the dhandle lock (usecs)",
+ "lock: dhandle lock application thread time waiting (usecs)",
+ "lock: dhandle lock internal thread time waiting (usecs)",
"lock: dhandle read lock acquisitions",
"lock: dhandle write lock acquisitions",
"lock: metadata lock acquisitions",
"lock: metadata lock application thread wait time (usecs)",
"lock: metadata lock internal thread wait time (usecs)",
- "lock: read timestamp queue lock application thread time waiting for the dhandle lock (usecs)",
- "lock: read timestamp queue lock internal thread time waiting for the dhandle lock (usecs)",
+ "lock: read timestamp queue lock application thread time waiting (usecs)",
+ "lock: read timestamp queue lock internal thread time waiting (usecs)",
"lock: read timestamp queue read lock acquisitions",
"lock: read timestamp queue write lock acquisitions",
"lock: schema lock acquisitions",
@@ -937,8 +940,8 @@ static const char * const __stats_connection_desc[] = {
"lock: table lock internal thread time waiting for the table lock (usecs)",
"lock: table read lock acquisitions",
"lock: table write lock acquisitions",
- "lock: txn global lock application thread time waiting for the dhandle lock (usecs)",
- "lock: txn global lock internal thread time waiting for the dhandle lock (usecs)",
+ "lock: txn global lock application thread time waiting (usecs)",
+ "lock: txn global lock internal thread time waiting (usecs)",
"lock: txn global read lock acquisitions",
"lock: txn global write lock acquisitions",
"log: busy returns attempting to switch slots",
@@ -1017,6 +1020,7 @@ static const char * const __stats_connection_desc[] = {
"reconciliation: split objects currently awaiting free",
"session: open cursor count",
"session: open session count",
+ "session: session query timestamp calls",
"session: table alter failed calls",
"session: table alter successful calls",
"session: table alter unchanged and skipped",
@@ -1054,8 +1058,9 @@ static const char * const __stats_connection_desc[] = {
"thread-yield: page acquire time sleeping (usecs)",
"thread-yield: page delete rollback time sleeping for state change (usecs)",
"thread-yield: page reconciliation yielded due to child modification",
+ "transaction: commit timestamp queue entries walked",
"transaction: commit timestamp queue insert to empty",
- "transaction: commit timestamp queue inserts to tail",
+ "transaction: commit timestamp queue inserts to head",
"transaction: commit timestamp queue inserts total",
"transaction: commit timestamp queue length",
"transaction: number of named snapshots created",
@@ -1065,13 +1070,14 @@ static const char * const __stats_connection_desc[] = {
"transaction: prepared transactions currently active",
"transaction: prepared transactions rolled back",
"transaction: query timestamp calls",
+ "transaction: read timestamp queue entries walked",
"transaction: read timestamp queue insert to empty",
"transaction: read timestamp queue inserts to head",
"transaction: read timestamp queue inserts total",
"transaction: read timestamp queue length",
"transaction: rollback to stable calls",
"transaction: rollback to stable updates aborted",
- "transaction: rollback to stable updates removed from lookaside",
+ "transaction: rollback to stable updates removed from cache overflow",
"transaction: set timestamp calls",
"transaction: set timestamp commit calls",
"transaction: set timestamp commit updates",
@@ -1097,6 +1103,7 @@ static const char * const __stats_connection_desc[] = {
"transaction: transaction range of IDs currently pinned by a checkpoint",
"transaction: transaction range of IDs currently pinned by named snapshots",
"transaction: transaction range of timestamps currently pinned",
+ "transaction: transaction range of timestamps pinned by a checkpoint",
"transaction: transaction range of timestamps pinned by the oldest timestamp",
"transaction: transaction sync calls",
"transaction: transactions committed",
@@ -1185,6 +1192,12 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing cache_bytes_other */
stats->cache_bytes_read = 0;
stats->cache_bytes_write = 0;
+ stats->cache_lookaside_cursor_wait_application = 0;
+ stats->cache_lookaside_cursor_wait_internal = 0;
+ /* not clearing cache_lookaside_score */
+ /* not clearing cache_lookaside_entries */
+ stats->cache_lookaside_insert = 0;
+ stats->cache_lookaside_remove = 0;
stats->cache_eviction_checkpoint = 0;
stats->cache_eviction_get_ref = 0;
stats->cache_eviction_get_ref_empty = 0;
@@ -1229,14 +1242,11 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_eviction_internal = 0;
stats->cache_eviction_split_internal = 0;
stats->cache_eviction_split_leaf = 0;
- /* not clearing cache_lookaside_score */
- /* not clearing cache_lookaside_entries */
- stats->cache_lookaside_insert = 0;
- stats->cache_lookaside_remove = 0;
/* not clearing cache_bytes_max */
/* not clearing cache_eviction_maximum_page_size */
stats->cache_eviction_dirty = 0;
stats->cache_eviction_app_dirty = 0;
+ stats->cache_timed_out_ops = 0;
stats->cache_read_overflow = 0;
stats->cache_eviction_deepen = 0;
stats->cache_write_lookaside = 0;
@@ -1286,11 +1296,11 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cursor_insert = 0;
stats->cursor_modify = 0;
stats->cursor_next = 0;
+ stats->cursor_restart = 0;
stats->cursor_prev = 0;
stats->cursor_remove = 0;
stats->cursor_reserve = 0;
stats->cursor_reset = 0;
- stats->cursor_restart = 0;
stats->cursor_search = 0;
stats->cursor_search_near = 0;
stats->cursor_sweep_buckets = 0;
@@ -1414,6 +1424,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing rec_split_stashed_objects */
/* not clearing session_cursor_open */
/* not clearing session_open */
+ stats->session_query_ts = 0;
/* not clearing session_table_alter_fail */
/* not clearing session_table_alter_success */
/* not clearing session_table_alter_skip */
@@ -1451,8 +1462,9 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->page_sleep = 0;
stats->page_del_rollback_blocked = 0;
stats->child_modify_blocked_page = 0;
+ stats->txn_commit_queue_walked = 0;
stats->txn_commit_queue_empty = 0;
- stats->txn_commit_queue_tail = 0;
+ stats->txn_commit_queue_head = 0;
stats->txn_commit_queue_inserts = 0;
stats->txn_commit_queue_len = 0;
stats->txn_snapshots_created = 0;
@@ -1462,6 +1474,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->txn_prepare_active = 0;
stats->txn_prepare_rollback = 0;
stats->txn_query_ts = 0;
+ stats->txn_read_queue_walked = 0;
stats->txn_read_queue_empty = 0;
stats->txn_read_queue_head = 0;
stats->txn_read_queue_inserts = 0;
@@ -1494,6 +1507,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing txn_pinned_checkpoint_range */
/* not clearing txn_pinned_snapshot_range */
/* not clearing txn_pinned_timestamp */
+ /* not clearing txn_pinned_timestamp_checkpoint */
/* not clearing txn_pinned_timestamp_oldest */
stats->txn_sync = 0;
stats->txn_commit = 0;
@@ -1565,6 +1579,18 @@ __wt_stat_connection_aggregate(
to->cache_bytes_other += WT_STAT_READ(from, cache_bytes_other);
to->cache_bytes_read += WT_STAT_READ(from, cache_bytes_read);
to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write);
+ to->cache_lookaside_cursor_wait_application +=
+ WT_STAT_READ(from, cache_lookaside_cursor_wait_application);
+ to->cache_lookaside_cursor_wait_internal +=
+ WT_STAT_READ(from, cache_lookaside_cursor_wait_internal);
+ to->cache_lookaside_score +=
+ WT_STAT_READ(from, cache_lookaside_score);
+ to->cache_lookaside_entries +=
+ WT_STAT_READ(from, cache_lookaside_entries);
+ to->cache_lookaside_insert +=
+ WT_STAT_READ(from, cache_lookaside_insert);
+ to->cache_lookaside_remove +=
+ WT_STAT_READ(from, cache_lookaside_remove);
to->cache_eviction_checkpoint +=
WT_STAT_READ(from, cache_eviction_checkpoint);
to->cache_eviction_get_ref +=
@@ -1648,20 +1674,13 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, cache_eviction_split_internal);
to->cache_eviction_split_leaf +=
WT_STAT_READ(from, cache_eviction_split_leaf);
- to->cache_lookaside_score +=
- WT_STAT_READ(from, cache_lookaside_score);
- to->cache_lookaside_entries +=
- WT_STAT_READ(from, cache_lookaside_entries);
- to->cache_lookaside_insert +=
- WT_STAT_READ(from, cache_lookaside_insert);
- to->cache_lookaside_remove +=
- WT_STAT_READ(from, cache_lookaside_remove);
to->cache_bytes_max += WT_STAT_READ(from, cache_bytes_max);
to->cache_eviction_maximum_page_size +=
WT_STAT_READ(from, cache_eviction_maximum_page_size);
to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty);
to->cache_eviction_app_dirty +=
WT_STAT_READ(from, cache_eviction_app_dirty);
+ to->cache_timed_out_ops += WT_STAT_READ(from, cache_timed_out_ops);
to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow);
to->cache_eviction_deepen +=
WT_STAT_READ(from, cache_eviction_deepen);
@@ -1726,11 +1745,11 @@ __wt_stat_connection_aggregate(
to->cursor_insert += WT_STAT_READ(from, cursor_insert);
to->cursor_modify += WT_STAT_READ(from, cursor_modify);
to->cursor_next += WT_STAT_READ(from, cursor_next);
+ to->cursor_restart += WT_STAT_READ(from, cursor_restart);
to->cursor_prev += WT_STAT_READ(from, cursor_prev);
to->cursor_remove += WT_STAT_READ(from, cursor_remove);
to->cursor_reserve += WT_STAT_READ(from, cursor_reserve);
to->cursor_reset += WT_STAT_READ(from, cursor_reset);
- to->cursor_restart += WT_STAT_READ(from, cursor_restart);
to->cursor_search += WT_STAT_READ(from, cursor_search);
to->cursor_search_near += WT_STAT_READ(from, cursor_search_near);
to->cursor_sweep_buckets += WT_STAT_READ(from, cursor_sweep_buckets);
@@ -1914,6 +1933,7 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, rec_split_stashed_objects);
to->session_cursor_open += WT_STAT_READ(from, session_cursor_open);
to->session_open += WT_STAT_READ(from, session_open);
+ to->session_query_ts += WT_STAT_READ(from, session_query_ts);
to->session_table_alter_fail +=
WT_STAT_READ(from, session_table_alter_fail);
to->session_table_alter_success +=
@@ -1979,10 +1999,12 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, page_del_rollback_blocked);
to->child_modify_blocked_page +=
WT_STAT_READ(from, child_modify_blocked_page);
+ to->txn_commit_queue_walked +=
+ WT_STAT_READ(from, txn_commit_queue_walked);
to->txn_commit_queue_empty +=
WT_STAT_READ(from, txn_commit_queue_empty);
- to->txn_commit_queue_tail +=
- WT_STAT_READ(from, txn_commit_queue_tail);
+ to->txn_commit_queue_head +=
+ WT_STAT_READ(from, txn_commit_queue_head);
to->txn_commit_queue_inserts +=
WT_STAT_READ(from, txn_commit_queue_inserts);
to->txn_commit_queue_len += WT_STAT_READ(from, txn_commit_queue_len);
@@ -1995,6 +2017,8 @@ __wt_stat_connection_aggregate(
to->txn_prepare_active += WT_STAT_READ(from, txn_prepare_active);
to->txn_prepare_rollback += WT_STAT_READ(from, txn_prepare_rollback);
to->txn_query_ts += WT_STAT_READ(from, txn_query_ts);
+ to->txn_read_queue_walked +=
+ WT_STAT_READ(from, txn_read_queue_walked);
to->txn_read_queue_empty += WT_STAT_READ(from, txn_read_queue_empty);
to->txn_read_queue_head += WT_STAT_READ(from, txn_read_queue_head);
to->txn_read_queue_inserts +=
@@ -2047,6 +2071,8 @@ __wt_stat_connection_aggregate(
to->txn_pinned_snapshot_range +=
WT_STAT_READ(from, txn_pinned_snapshot_range);
to->txn_pinned_timestamp += WT_STAT_READ(from, txn_pinned_timestamp);
+ to->txn_pinned_timestamp_checkpoint +=
+ WT_STAT_READ(from, txn_pinned_timestamp_checkpoint);
to->txn_pinned_timestamp_oldest +=
WT_STAT_READ(from, txn_pinned_timestamp_oldest);
to->txn_sync += WT_STAT_READ(from, txn_sync);
diff --git a/src/third_party/wiredtiger/src/support/thread_group.c b/src/third_party/wiredtiger/src/support/thread_group.c
index 4597d26496d..9df1e3d636f 100644
--- a/src/third_party/wiredtiger/src/support/thread_group.c
+++ b/src/third_party/wiredtiger/src/support/thread_group.c
@@ -103,7 +103,7 @@ __thread_group_shrink(
if (thread == NULL)
continue;
- WT_TRET(__wt_thread_join(session, thread->tid));
+ WT_TRET(__wt_thread_join(session, &thread->tid));
__wt_cond_destroy(session, &thread->pause_cond);
}
__wt_writelock(session, &group->lock);
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index cf233ab9a5d..cbfac786f36 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -119,9 +119,11 @@ void
__wt_txn_release_snapshot(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
txn_state = WT_SESSION_TXN_STATE(session);
WT_ASSERT(session,
@@ -131,6 +133,14 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE;
F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
+
+ /* Clear a checkpoint's pinned ID. */
+ if (WT_SESSION_IS_CHECKPOINT(session)) {
+ txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
+ __wt_timestamp_set_zero(&txn_global->checkpoint_timestamp);
+ }
+
+ __wt_txn_clear_read_timestamp(session);
}
/*
@@ -528,8 +538,7 @@ __wt_txn_release(WT_SESSION_IMPL *session)
if (WT_SESSION_IS_CHECKPOINT(session)) {
WT_ASSERT(session,
WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE);
- txn->id = txn_global->checkpoint_state.id =
- txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
+ txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE;
/*
* Be extra careful to cleanup everything for checkpoints: once
@@ -548,7 +557,6 @@ __wt_txn_release(WT_SESSION_IMPL *session)
}
__wt_txn_clear_commit_timestamp(session);
- __wt_txn_clear_read_timestamp(session);
/* Free the scratch buffer allocated for logging. */
__wt_logrec_free(session, &txn->logrec);
@@ -611,12 +619,13 @@ __txn_commit_timestamp_validate(WT_SESSION_IMPL *session)
* are at a later timestamp or use timestamps inconsistently.
*/
for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
- if (op->type == WT_TXN_OP_BASIC) {
+ if (op->type == WT_TXN_OP_BASIC_COL ||
+ op->type == WT_TXN_OP_BASIC_ROW) {
/*
* Skip over any aborted update structures or ones
* from our own transaction.
*/
- upd = op->u.upd->next;
+ upd = op->u.op_upd->next;
while (upd != NULL && (upd->txnid == WT_TXN_ABORTED ||
upd->txnid == txn->id))
upd = upd->next;
@@ -648,12 +657,13 @@ __txn_commit_timestamp_validate(WT_SESSION_IMPL *session)
*/
if (op_zero_ts)
continue;
- op_timestamp = op->u.upd->timestamp;
+
+ op_timestamp = op->u.op_upd->timestamp;
/*
* Only if the update structure doesn't have a timestamp
* then use the one in the transaction structure.
*/
- if (__wt_timestamp_iszero(&op->u.upd->timestamp))
+ if (__wt_timestamp_iszero(&op_timestamp))
op_timestamp = txn->commit_timestamp;
if (__wt_timestamp_cmp(&op_timestamp,
&upd->timestamp) < 0)
@@ -678,14 +688,12 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN_GLOBAL *txn_global;
WT_TXN_OP *op;
WT_UPDATE *upd;
+ uint32_t fileid;
u_int i;
bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
- WT_REF *ref;
- WT_UPDATE **updp;
wt_timestamp_t prev_commit_timestamp, ts;
- uint32_t previous_state;
- bool prepared_transaction, update_timestamp;
+ bool update_timestamp;
#endif
txn = &session->txn;
@@ -803,18 +811,17 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
/* Note: we're going to commit: nothing can fail after this point. */
-#ifdef HAVE_TIMESTAMPS
- prepared_transaction = F_ISSET(txn, WT_TXN_PREPARE);
-#endif
/* Process and free updates. */
for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
+ fileid = op->btree->id;
switch (op->type) {
case WT_TXN_OP_NONE:
break;
-
- case WT_TXN_OP_BASIC:
- case WT_TXN_OP_INMEM:
- upd = op->u.upd;
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_BASIC_ROW:
+ case WT_TXN_OP_INMEM_COL:
+ case WT_TXN_OP_INMEM_ROW:
+ upd = op->u.op_upd;
/*
* Switch reserved operations to abort to
@@ -830,111 +837,14 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
* as they commit.
*/
if (conn->cache->las_fileid != 0 &&
- op->fileid == conn->cache->las_fileid) {
+ fileid == conn->cache->las_fileid) {
upd->txnid = WT_TXN_NONE;
break;
}
-
-#ifdef HAVE_TIMESTAMPS
- if (!__wt_txn_update_needs_timestamp(session, op))
- break;
-
- if (prepared_transaction) {
- /*
- * In case of a prepared transaction, the order
- * of modification of the prepare timestamp to
- * the commit timestamp in the update chain will
- * not affect the data visibility, a reader will
- * encounter a prepared update resulting in
- * prepare conflict.
- *
- * As updating timestamp might not be an atomic
- * operation, we will manage using state.
- */
- upd->prepare_state = WT_PREPARE_LOCKED;
- WT_WRITE_BARRIER();
- __wt_timestamp_set(
- &upd->timestamp, &txn->commit_timestamp);
- WT_PUBLISH(upd->prepare_state,
- WT_PREPARE_RESOLVED);
- } else
- __wt_timestamp_set(
- &upd->timestamp, &txn->commit_timestamp);
-#endif
- break;
-
+ /* FALLTHROUGH */
case WT_TXN_OP_REF_DELETE:
#ifdef HAVE_TIMESTAMPS
- if (!__wt_txn_update_needs_timestamp(session, op))
- break;
-
- ref = op->u.ref;
- if (prepared_transaction) {
- /*
- * As updating timestamp might not be an atomic
- * operation, we will manage using state.
- */
- ref->page_del->prepare_state =
- WT_PREPARE_LOCKED;
- WT_WRITE_BARRIER();
- __wt_timestamp_set(&ref->page_del->timestamp,
- &txn->commit_timestamp);
- WT_PUBLISH(ref->page_del->prepare_state,
- WT_PREPARE_RESOLVED);
- } else
- __wt_timestamp_set(&ref->page_del->timestamp,
- &txn->commit_timestamp);
-
- /*
- * The page-deleted list can be discarded by eviction,
- * lock the WT_REF to ensure we don't race.
- */
- if (ref->page_del->update_list == NULL)
- break;
-
- for (;; __wt_yield()) {
- previous_state = ref->state;
- if (previous_state != WT_REF_LOCKED &&
- __wt_atomic_casv32(
- &ref->state, previous_state, WT_REF_LOCKED))
- break;
- }
-
- if ((updp = ref->page_del->update_list) == NULL) {
- /*
- * Publish to ensure we don't let the page be
- * evicted and the updates discarded before
- * being written.
- */
- WT_PUBLISH(ref->state, previous_state);
- break;
- }
-
- for (; *updp != NULL; ++updp) {
- if (prepared_transaction) {
- /*
- * As ref state is LOCKED, timestamp
- * and prepare state are updated in
- * exclusive access, hence no need for
- * temporary state WT_PREPARE_LOCKED
- * and BARRIER.
- */
- __wt_timestamp_set(
- &(*updp)->timestamp,
- &txn->commit_timestamp);
- (*updp)->prepare_state =
- WT_PREPARE_RESOLVED;
- } else
- __wt_timestamp_set(
- &(*updp)->timestamp,
- &txn->commit_timestamp);
- }
-
- /*
- * Publish to ensure we don't let the page be evicted
- * and the updates discarded before being written.
- */
- WT_PUBLISH(ref->state, previous_state);
+ __wt_txn_op_set_timestamp(session, op);
#endif
break;
case WT_TXN_OP_TRUNCATE_COL:
@@ -1071,30 +981,31 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
/* Prepare updates. */
for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
/* Assert it's not an update to the lookaside file. */
- WT_ASSERT(session,
- S2C(session)->cache->las_fileid == 0 ||
- op->fileid != S2C(session)->cache->las_fileid);
+ WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 ||
+ !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
/* Metadata updates are never prepared. */
- if (op->fileid == WT_METAFILE_ID)
+ if (WT_IS_METADATA(op->btree->dhandle))
continue;
- upd = op->u.upd;
+ upd = op->u.op_upd;
switch (op->type) {
case WT_TXN_OP_NONE:
break;
- case WT_TXN_OP_BASIC:
- case WT_TXN_OP_INMEM:
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_BASIC_ROW:
+ case WT_TXN_OP_INMEM_COL:
+ case WT_TXN_OP_INMEM_ROW:
/*
* Switch reserved operation to abort to simplify
- * obsolete update list truncation. Clear the
- * operation type so we don't try to visit this update
- * again: it can now be evicted.
+ * obsolete update list truncation. The object free
+ * function clears the operation type so we don't
+ * try to visit this update again: it can be evicted.
*/
if (upd->type == WT_UPDATE_RESERVE) {
upd->txnid = WT_TXN_ABORTED;
- op->type = WT_TXN_OP_NONE;
+ __wt_txn_op_free(session, op);
break;
}
@@ -1165,22 +1076,22 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
/* Rollback updates. */
for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
/* Assert it's not an update to the lookaside file. */
- WT_ASSERT(session,
- S2C(session)->cache->las_fileid == 0 ||
- op->fileid != S2C(session)->cache->las_fileid);
+ WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 ||
+ !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
/* Metadata updates are never rolled back. */
- if (op->fileid == WT_METAFILE_ID)
+ if (WT_IS_METADATA(op->btree->dhandle))
continue;
- upd = op->u.upd;
+ upd = op->u.op_upd;
switch (op->type) {
case WT_TXN_OP_NONE:
break;
-
- case WT_TXN_OP_BASIC:
- case WT_TXN_OP_INMEM:
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_BASIC_ROW:
+ case WT_TXN_OP_INMEM_COL:
+ case WT_TXN_OP_INMEM_ROW:
WT_ASSERT(session,
upd->txnid == txn->id ||
upd->txnid == WT_TXN_ABORTED);
@@ -1200,7 +1111,6 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
break;
}
- /* Free any memory allocated for the operation. */
__wt_txn_op_free(session, op);
}
txn->mod_count = 0;
@@ -1283,12 +1193,24 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
txn_global->current - txn_global->oldest_id);
#if WT_TIMESTAMP_SIZE == 8
+ {
+ WT_DECL_TIMESTAMP(checkpoint_timestamp)
+ WT_DECL_TIMESTAMP(commit_timestamp)
+ WT_DECL_TIMESTAMP(pinned_timestamp)
+
+ checkpoint_timestamp = txn_global->checkpoint_timestamp;
+ commit_timestamp = txn_global->commit_timestamp;
+ pinned_timestamp = txn_global->pinned_timestamp;
+ if (checkpoint_timestamp.val != 0 &&
+ checkpoint_timestamp.val < pinned_timestamp.val)
+ pinned_timestamp = checkpoint_timestamp;
WT_STAT_SET(session, stats, txn_pinned_timestamp,
- txn_global->commit_timestamp.val -
- txn_global->pinned_timestamp.val);
+ commit_timestamp.val - pinned_timestamp.val);
+ WT_STAT_SET(session, stats, txn_pinned_timestamp_checkpoint,
+ commit_timestamp.val - checkpoint_timestamp.val);
WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest,
- txn_global->commit_timestamp.val -
- txn_global->oldest_timestamp.val);
+ commit_timestamp.val - txn_global->oldest_timestamp.val);
+ }
#endif
WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 10af61caeaf..dd00ff7d36a 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -372,12 +372,11 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
- double current_dirty, delta, scrub_min;
- uint64_t bytes_written_last, bytes_written_start, bytes_written_total;
+ double current_dirty, prev_dirty;
+ uint64_t bytes_written_start, bytes_written_total;
uint64_t cache_size, max_write;
- uint64_t current_us, stepdown_us, total_ms, work_us;
- uint64_t time_last, time_start, time_stop;
- bool progress;
+ uint64_t time_start, time_stop;
+ uint64_t total_ms;
conn = S2C(session);
cache = conn->cache;
@@ -388,61 +387,41 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
* scrubbing cannot help).
*/
if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP) ||
- cache->eviction_checkpoint_target < DBL_EPSILON ||
- cache->eviction_checkpoint_target >= cache->eviction_dirty_trigger)
+ cache->eviction_checkpoint_target < DBL_EPSILON)
return;
- time_last = time_start = __wt_clock(session);
- bytes_written_last = 0;
+ time_start = __wt_clock(session);
bytes_written_start = cache->bytes_written;
- cache_size = conn->cache_size;
+
/*
* If the cache size is zero or very small, we're done. The cache
* size can briefly become zero if we're transitioning to a shared
* cache via reconfigure. This avoids potential divide by zero.
*/
- if (cache_size < 10 * WT_MEGABYTE)
+ if ((cache_size = conn->cache_size) < 10 * WT_MEGABYTE)
return;
- /*
- * Skip scrubbing if it won't perform at-least some minimum amount of
- * work. Scrubbing is supposed to bring down the dirty data to eviction
- * checkpoint target before the actual checkpoint starts. Do not perform
- * scrubbing if the dirty data to scrub is less than a pre-configured
- * size. This size is to an extent based on the configured cache size
- * without being too large or too small for large cache sizes. For the
- * values chosen, for instance, 100 GB cache will require at-least
- * 200 MB of dirty data above eviction checkpoint target, which should
- * equate to a scrub phase a few seconds long. That said, the value of
- * 0.2% and 500 MB are still somewhat arbitrary.
- */
- scrub_min = WT_MIN((0.2 * conn->cache_size) / 100, 500 * WT_MEGABYTE);
- if (__wt_cache_dirty_leaf_inuse(cache) <
- ((cache->eviction_checkpoint_target * conn->cache_size) / 100) +
- scrub_min)
+ current_dirty =
+ (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size;
+ if (current_dirty <= cache->eviction_checkpoint_target)
return;
- stepdown_us = 10000;
- work_us = 0;
- progress = false;
-
- /* Step down the scrub target (as a percentage) in units of 10MB. */
- delta = WT_MIN(1.0, (100 * 10.0 * WT_MEGABYTE) / cache_size);
-
- /*
- * Start with the scrub target equal to the expected maximum percentage
- * of dirty data in cache.
- */
- cache->eviction_scrub_limit = cache->eviction_dirty_trigger;
-
/* Stop if we write as much dirty data as is currently in cache. */
max_write = __wt_cache_dirty_leaf_inuse(cache);
- /* Step down the dirty target to the eviction trigger */
+ /* Set the dirty trigger to the target value. */
+ cache->eviction_scrub_target = cache->eviction_checkpoint_target;
+ WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
+
+ /* Wait while the dirty level is going down. */
for (;;) {
+ __wt_sleep(0, 100 * WT_THOUSAND);
+
+ prev_dirty = current_dirty;
current_dirty =
(100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size;
- if (current_dirty <= cache->eviction_checkpoint_target)
+ if (current_dirty <= cache->eviction_checkpoint_target ||
+ current_dirty >= prev_dirty)
break;
/*
@@ -452,63 +431,17 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE))
break;
- __wt_sleep(0, stepdown_us / 10);
- time_stop = __wt_clock(session);
- current_us = WT_CLOCKDIFF_US(time_stop, time_last);
- bytes_written_total =
- cache->bytes_written - bytes_written_start;
-
- if (current_dirty > cache->eviction_scrub_limit) {
- /*
- * We haven't reached the current target.
- *
- * Don't wait indefinitely: there might be dirty pages
- * that can't be evicted. If we can't meet the target,
- * give up and start the checkpoint for real.
- */
- if (current_us > WT_MAX(WT_MILLION, 10 * stepdown_us) ||
- bytes_written_total > max_write)
- break;
- continue;
- }
-
/*
- * Estimate how long the next step down of dirty data should
- * take.
- *
- * The calculation here assumes that the system is writing from
- * cache as fast as it can, and determines the write throughput
- * based on the change in the bytes written from cache since
- * the start of the call. We use that to estimate how long it
- * will take to step the dirty target down by delta.
+ * We haven't reached the current target.
*
- * Take care to avoid dividing by zero.
- */
- if (bytes_written_total - bytes_written_last > WT_MEGABYTE &&
- work_us > 0) {
- stepdown_us = (uint64_t)((delta * cache_size / 100) /
- ((double)bytes_written_total / work_us));
- stepdown_us = WT_MAX(1, stepdown_us);
- if (!progress)
- stepdown_us = WT_MIN(stepdown_us, 200000);
- progress = true;
-
- bytes_written_last = bytes_written_total;
- }
-
- work_us += current_us;
-
- /*
- * Smooth out step down: try to limit the impact on
- * performance to 10% by waiting once we reach the last
- * level.
+ * Don't wait indefinitely: there might be dirty pages
+ * that can't be evicted. If we can't meet the target,
+ * give up and start the checkpoint for real.
*/
- __wt_sleep(0, 10 * stepdown_us);
- cache->eviction_scrub_limit =
- WT_MAX(cache->eviction_dirty_target, current_dirty - delta);
- WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target,
- cache->eviction_scrub_limit);
- time_last = __wt_clock(session);
+ bytes_written_total =
+ cache->bytes_written - bytes_written_start;
+ if (bytes_written_total > max_write)
+ break;
}
time_stop = __wt_clock(session);
@@ -681,8 +614,7 @@ __checkpoint_prepare(
*/
__wt_writelock(session, &txn_global->rwlock);
txn_global->checkpoint_state = *txn_state;
- txn_global->checkpoint_txn = txn;
- txn_global->checkpoint_state.pinned_id = WT_MIN(txn->id, txn->snap_min);
+ txn_global->checkpoint_state.pinned_id = txn->snap_min;
/*
* Sanity check that the oldest ID hasn't moved on before we have
@@ -724,6 +656,8 @@ __checkpoint_prepare(
if (txn_global->has_stable_timestamp) {
__wt_timestamp_set(&txn->read_timestamp,
&txn_global->stable_timestamp);
+ __wt_timestamp_set(&txn_global->checkpoint_timestamp,
+ &txn->read_timestamp);
F_SET(txn, WT_TXN_HAS_TS_READ);
if (!F_ISSET(conn, WT_CONN_RECOVERING))
__wt_timestamp_set(
@@ -802,14 +736,10 @@ __txn_checkpoint_can_skip(WT_SESSION_IMPL *session,
*/
WT_RET(__wt_config_gets(session, cfg, "target", &cval));
__wt_config_subinit(session, &targetconf, &cval);
- full = __wt_config_next(&targetconf, &k, &v) != 0;
- if (fullp != NULL)
- *fullp = full;
+ *fullp = full = __wt_config_next(&targetconf, &k, &v) != 0;
WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
- use_timestamp = cval.val != 0;
- if (use_timestampp != NULL)
- *use_timestampp = use_timestamp;
+ *use_timestampp = use_timestamp = cval.val != 0;
/* Never skip non-full checkpoints */
if (!full)
@@ -979,7 +909,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* Unblock updates -- we can figure out that any updates to clean pages
* after this point are too new to be written in the checkpoint.
*/
- cache->eviction_scrub_limit = 0.0;
+ cache->eviction_scrub_target = 0.0;
WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
/* Tell logging that we have started a database checkpoint. */
@@ -1060,9 +990,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
/* Disable metadata tracking during the metadata checkpoint. */
saved_meta_next = session->meta_track_next;
session->meta_track_next = NULL;
- WT_WITH_METADATA_LOCK(session,
- WT_WITH_DHANDLE(session,
- WT_SESSION_META_DHANDLE(session),
+ WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
+ WT_WITH_METADATA_LOCK(session,
ret = __wt_checkpoint(session, cfg)));
session->meta_track_next = saved_meta_next;
WT_ERR(ret);
@@ -1129,7 +1058,7 @@ err: /*
if (tracking)
WT_TRET(__wt_meta_track_off(session, false, failed));
- cache->eviction_scrub_limit = 0.0;
+ cache->eviction_scrub_target = 0.0;
WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
if (F_ISSET(txn, WT_TXN_RUNNING)) {
@@ -1966,7 +1895,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
if (btree->modified && !bulk &&
S2C(session)->txn_global.has_stable_timestamp &&
!__wt_btree_immediately_durable(session))
- return (EBUSY);
+ return (__wt_set_return(session, EBUSY));
/*
* Turn on metadata tracking if:
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index 81968c940f7..cf8e464239a 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -71,10 +71,13 @@ __txn_op_log(WT_SESSION_IMPL *session,
WT_ITEM value;
WT_UPDATE *upd;
uint64_t recno;
+ uint32_t fileid;
cursor = &cbt->iface;
- upd = op->u.upd;
+ fileid = op->btree->id;
+
+ upd = op->u.op_upd;
value.data = upd->data;
value.size = upd->size;
@@ -89,17 +92,17 @@ __txn_op_log(WT_SESSION_IMPL *session,
switch (upd->type) {
case WT_UPDATE_MODIFY:
WT_RET(__wt_logop_row_modify_pack(
- session, logrec, op->fileid, &cursor->key, &value));
+ session, logrec, fileid, &cursor->key, &value));
break;
case WT_UPDATE_STANDARD:
WT_RET(__wt_logop_row_put_pack(
- session, logrec, op->fileid, &cursor->key, &value));
+ session, logrec, fileid, &cursor->key, &value));
break;
case WT_UPDATE_TOMBSTONE:
WT_RET(__wt_logop_row_remove_pack(
- session, logrec, op->fileid, &cursor->key));
+ session, logrec, fileid, &cursor->key));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, upd->type);
}
} else {
recno = WT_INSERT_RECNO(cbt->ins);
@@ -108,17 +111,17 @@ __txn_op_log(WT_SESSION_IMPL *session,
switch (upd->type) {
case WT_UPDATE_MODIFY:
WT_RET(__wt_logop_col_modify_pack(
- session, logrec, op->fileid, recno, &value));
+ session, logrec, fileid, recno, &value));
break;
case WT_UPDATE_STANDARD:
WT_RET(__wt_logop_col_put_pack(
- session, logrec, op->fileid, recno, &value));
+ session, logrec, fileid, recno, &value));
break;
case WT_UPDATE_TOMBSTONE:
WT_RET(__wt_logop_col_remove_pack(
- session, logrec, op->fileid, recno));
+ session, logrec, fileid, recno));
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE(session, upd->type);
}
}
@@ -165,17 +168,31 @@ __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op)
{
switch (op->type) {
case WT_TXN_OP_NONE:
- case WT_TXN_OP_BASIC:
- case WT_TXN_OP_INMEM:
+ /*
+ * The free function can be called more than once: when there's
+ * no operation, a free is unnecessary or has already been done.
+ */
+ return;
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_INMEM_COL:
case WT_TXN_OP_REF_DELETE:
case WT_TXN_OP_TRUNCATE_COL:
break;
+ case WT_TXN_OP_BASIC_ROW:
+ case WT_TXN_OP_INMEM_ROW:
+ __wt_buf_free(session, &op->u.op_row.key);
+ break;
+
case WT_TXN_OP_TRUNCATE_ROW:
__wt_buf_free(session, &op->u.truncate_row.start);
__wt_buf_free(session, &op->u.truncate_row.stop);
break;
}
+
+ (void)__wt_atomic_subi32(&op->btree->dhandle->session_inuse, 1);
+
+ op->type = WT_TXN_OP_NONE;
}
/*
@@ -227,6 +244,8 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
WT_TXN *txn;
WT_TXN_OP *op;
+ uint32_t fileid;
+
txn = &session->txn;
if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) ||
@@ -240,27 +259,28 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
WT_ASSERT(session, txn->mod_count > 0);
op = txn->mod + txn->mod_count - 1;
+ fileid = op->btree->id;
WT_RET(__txn_logrec_init(session));
logrec = txn->logrec;
switch (op->type) {
case WT_TXN_OP_NONE:
- case WT_TXN_OP_INMEM:
+ case WT_TXN_OP_INMEM_COL:
+ case WT_TXN_OP_INMEM_ROW:
case WT_TXN_OP_REF_DELETE:
/* Nothing to log, we're done. */
break;
- case WT_TXN_OP_BASIC:
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_BASIC_ROW:
ret = __txn_op_log(session, logrec, op, cbt);
break;
case WT_TXN_OP_TRUNCATE_COL:
- ret = __wt_logop_col_truncate_pack(session, logrec,
- op->fileid,
+ ret = __wt_logop_col_truncate_pack(session, logrec, fileid,
op->u.truncate_col.start, op->u.truncate_col.stop);
break;
case WT_TXN_OP_TRUNCATE_ROW:
- ret = __wt_logop_row_truncate_pack(session, txn->logrec,
- op->fileid,
+ ret = __wt_logop_row_truncate_pack(session, logrec, fileid,
&op->u.truncate_row.start, &op->u.truncate_row.stop,
(uint32_t)op->u.truncate_row.mode);
break;
@@ -498,7 +518,7 @@ __wt_txn_checkpoint_log(
__wt_scr_free(session, &txn->ckpt_snapshot);
txn->full_ckpt = false;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, flags);
}
err: __wt_logrec_free(session, &logrec);
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index ac656047e62..0e882b5bd7f 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -91,12 +91,14 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
* Helper to a cursor if this operation is to be applied during recovery.
*/
#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \
- WT_ERR(__recovery_cursor(session, r, lsnp, fileid, false, cp)); \
+ ret = __recovery_cursor(session, r, lsnp, fileid, false, cp); \
__wt_verbose(session, WT_VERB_RECOVERY, \
"%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 \
"/%" PRIu32, \
+ ret != 0 ? "Error" : \
cursor == NULL ? "Skipping" : "Applying", \
optype, fileid, (lsnp)->l.file, (lsnp)->l.offset); \
+ WT_ERR(ret); \
if (cursor == NULL) \
break
@@ -247,7 +249,7 @@ __txn_op_apply(
stop = cursor;
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, mode);
}
/* Set the keys. */
@@ -264,7 +266,7 @@ __txn_op_apply(
WT_ERR(ret);
break;
- WT_ILLEGAL_VALUE_ERR(session);
+ WT_ILLEGAL_VALUE_ERR(session, optype);
}
/* Reset the cursor so it doesn't block eviction. */
@@ -630,7 +632,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
* Clear this out. We no longer need it and it could have been
* re-allocated when scanning the files.
*/
- metafile = NULL;
+ WT_NOT_READ(metafile, NULL);
/*
* We no longer need the metadata cursor: close it to avoid pinning any
@@ -659,7 +661,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
if (F_ISSET(conn, WT_CONN_READONLY))
WT_ERR_MSG(session, WT_RUN_RECOVERY,
"Read-only database needs recovery");
- WT_ERR(WT_RUN_RECOVERY);
+ WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery");
}
if (F_ISSET(conn, WT_CONN_READONLY)) {
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 35a89eeb072..6baaffa532a 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -20,11 +20,12 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
WT_CURSOR *cursor;
WT_DECL_RET;
WT_DECL_TIMESTAMP(rollback_timestamp)
+ WT_DECL_TIMESTAMP(upd_timestamp)
WT_ITEM las_key, las_timestamp, las_value;
WT_TXN_GLOBAL *txn_global;
uint64_t las_counter, las_pageid, las_total, las_txnid;
uint32_t las_id, session_flags;
- uint8_t upd_type;
+ uint8_t prepare_state, upd_type;
conn = S2C(session);
cursor = NULL;
@@ -63,8 +64,10 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
if (__bit_test(conn->stable_rollback_bitstring, las_id))
continue;
- WT_ERR(cursor->get_value(cursor,
- &las_txnid, &las_timestamp, &upd_type, &las_value));
+ WT_ERR(cursor->get_value(cursor, &las_txnid,
+ &las_timestamp, &prepare_state, &upd_type, &las_value));
+ WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
+ memcpy(&upd_timestamp, las_timestamp.data, las_timestamp.size);
/*
* Entries with no timestamp will have a timestamp of zero,
@@ -72,7 +75,7 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
* be removed.
*/
if (__wt_timestamp_cmp(
- &rollback_timestamp, las_timestamp.data) < 0) {
+ &rollback_timestamp, &upd_timestamp) < 0) {
WT_ERR(cursor->remove(cursor));
WT_STAT_CONN_INCR(session, txn_rollback_las_removed);
--las_total;
@@ -98,32 +101,38 @@ err: if (ret == 0) {
*/
static void
__txn_abort_newer_update(WT_SESSION_IMPL *session,
- WT_UPDATE *upd, wt_timestamp_t *rollback_timestamp)
+ WT_UPDATE *first_upd, wt_timestamp_t *rollback_timestamp)
{
- WT_UPDATE *next_upd;
- bool aborted_one;
+ WT_UPDATE *upd;
+ bool skip_zero_timestamps;
+
+ skip_zero_timestamps = !FLD_ISSET(S2BT(session)->assert_flags,
+ WT_ASSERT_COMMIT_TS_ALWAYS | WT_ASSERT_COMMIT_TS_KEYS);
- aborted_one = false;
- for (next_upd = upd; next_upd != NULL; next_upd = next_upd->next) {
+ for (upd = first_upd; upd != NULL; upd = upd->next) {
/*
- * Updates with no timestamp will have a timestamp of zero
- * which will fail the following check and cause them to never
- * be aborted.
+ * Updates with no timestamp will have a timestamp of zero and
+ * will never be rolled back. If the table is configured for
+ * strict timestamp checking, assert that all more recent
+ * updates were also rolled back.
*/
- if (__wt_timestamp_cmp(
- rollback_timestamp, &next_upd->timestamp) < 0) {
- next_upd->txnid = WT_TXN_ABORTED;
+ if (upd->txnid == WT_TXN_ABORTED && upd == first_upd)
+ first_upd = upd->next;
+ else if (__wt_timestamp_iszero(&upd->timestamp)) {
+ if (skip_zero_timestamps && upd == first_upd)
+ first_upd = upd->next;
+ } else if (__wt_timestamp_cmp(
+ rollback_timestamp, &upd->timestamp) < 0) {
+ upd->txnid = WT_TXN_ABORTED;
WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted);
- __wt_timestamp_set_zero(&next_upd->timestamp);
+ __wt_timestamp_set_zero(&upd->timestamp);
/*
- * If any updates are aborted, all newer updates
- * better be aborted as well.
- */
- if (!aborted_one)
- WT_ASSERT(session,
- !aborted_one || upd == next_upd);
- aborted_one = true;
+ * If any updates are aborted, all newer updates
+ * better be aborted as well.
+ */
+ WT_ASSERT(session, upd == first_upd);
+ first_upd = upd->next;
}
}
}
@@ -230,9 +239,54 @@ static int
__txn_abort_newer_updates(
WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t *rollback_timestamp)
{
+ WT_DECL_RET;
WT_PAGE *page;
+ uint32_t read_flags;
+ bool local_read;
+
+ /*
+ * If we created a page image with updates the need to be rolled back,
+ * read the history into cache now and make sure the page is marked
+ * dirty. Otherwise, the history we need could be swept from the
+ * lookaside table before the page is read because the lookaside sweep
+ * code has no way to tell that the page image is invalid.
+ */
+ local_read = false;
+ read_flags = WT_READ_WONT_NEED;
+ if (ref->page_las != NULL && ref->page_las->skew_newest &&
+ __wt_timestamp_cmp(rollback_timestamp,
+ &ref->page_las->unstable_timestamp) < 0) {
+ /* Make sure get back a page with history, not limbo page */
+ WT_ASSERT(session,
+ !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
+ WT_RET(__wt_page_in(session, ref, read_flags));
+ WT_ASSERT(session, ref->state != WT_REF_LIMBO &&
+ ref->page != NULL && __wt_page_is_modified(ref->page));
+ local_read = true;
+ }
+
+ /* Review deleted page saved to the ref */
+ if (ref->page_del != NULL && __wt_timestamp_cmp(
+ rollback_timestamp, &ref->page_del->timestamp) < 0)
+ WT_ERR(__wt_delete_page_rollback(session, ref));
+
+ /*
+ * If we have a ref with no page, or the page is clean, there is
+ * nothing to roll back.
+ *
+ * This check for a clean page is partly an optimization (checkpoint
+ * only marks pages clean when they have no unwritten updates so
+ * there's no point visiting them again), but also covers a corner case
+ * of a checkpoint with use_timestamp=false. Such a checkpoint
+ * effectively moves the stable timestamp forward, because changes that
+ * are written in the checkpoint cannot be reliably rolled back. The
+ * actual stable timestamp doesn't change, though, so if we try to roll
+ * back clean pages the in-memory tree can get out of sync with the
+ * on-disk tree.
+ */
+ if ((page = ref->page) == NULL || !__wt_page_is_modified(page))
+ goto err;
- page = ref->page;
switch (page->type) {
case WT_PAGE_COL_FIX:
__txn_abort_newer_col_fix(session, page, rollback_timestamp);
@@ -252,10 +306,12 @@ __txn_abort_newer_updates(
case WT_PAGE_ROW_LEAF:
__txn_abort_newer_row_leaf(session, page, rollback_timestamp);
break;
- WT_ILLEGAL_VALUE(session);
+ WT_ILLEGAL_VALUE_ERR(session, page->type);
}
- return (0);
+err: if (local_read)
+ WT_TRET(__wt_page_release(session, ref, read_flags));
+ return (ret);
}
/*
@@ -267,28 +323,21 @@ __txn_rollback_to_stable_btree_walk(
WT_SESSION_IMPL *session, wt_timestamp_t *rollback_timestamp)
{
WT_DECL_RET;
- WT_REF *ref;
+ WT_REF *child_ref, *ref;
/* Walk the tree, marking commits aborted where appropriate. */
ref = NULL;
while ((ret = __wt_tree_walk(session, &ref,
- WT_READ_CACHE | WT_READ_LOOKASIDE | WT_READ_NO_EVICT)) == 0 &&
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 &&
ref != NULL) {
- if (ref->page_las != NULL &&
- __wt_timestamp_cmp(rollback_timestamp,
- &ref->page_las->onpage_timestamp) < 0)
- ref->page_las->invalid = true;
-
- /* Review deleted page saved to the ref */
- if (ref->page_del != NULL && __wt_timestamp_cmp(
- rollback_timestamp, &ref->page_del->timestamp) < 0)
- WT_RET(__wt_delete_page_rollback(session, ref));
-
- if (!__wt_page_is_modified(ref->page))
- continue;
-
- WT_RET(__txn_abort_newer_updates(
- session, ref, rollback_timestamp));
+ if (WT_PAGE_IS_INTERNAL(ref->page)) {
+ WT_INTL_FOREACH_BEGIN(session, ref->page, child_ref) {
+ WT_RET(__txn_abort_newer_updates(
+ session, child_ref, rollback_timestamp));
+ } WT_INTL_FOREACH_END;
+ } else
+ WT_RET(__txn_abort_newer_updates(
+ session, ref, rollback_timestamp));
}
return (ret);
}
@@ -372,8 +421,8 @@ __txn_rollback_to_stable_btree(WT_SESSION_IMPL *session, const char *cfg[])
* be in.
*/
WT_RET(__wt_evict_file_exclusive_on(session));
- ret = __txn_rollback_to_stable_btree_walk(
- session, &rollback_timestamp);
+ WT_WITH_PAGE_INDEX(session, ret = __txn_rollback_to_stable_btree_walk(
+ session, &rollback_timestamp));
__wt_evict_file_exclusive_off(session);
return (ret);
@@ -465,7 +514,8 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
* trees in cache populates a list that is used to check which
* lookaside records should be removed.
*/
- WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session));
+ if (!F_ISSET(conn, WT_CONN_IN_MEMORY))
+ WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session));
err: F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
__wt_free(session, conn->stable_rollback_bitstring);
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index a10ff740df6..28f635f5549 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -193,8 +193,72 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name,
}
/*
+ * __txn_get_pinned_timestamp --
+ * Calculate the current pinned timestamp.
+ */
+static int
+__txn_get_pinned_timestamp(
+ WT_SESSION_IMPL *session, wt_timestamp_t *tsp, bool include_checkpoint,
+ bool include_oldest)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_TIMESTAMP(tmp_ts)
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ if (include_oldest && !txn_global->has_oldest_timestamp)
+ return (WT_NOTFOUND);
+
+ __wt_readlock(session, &txn_global->rwlock);
+ if (include_oldest)
+ __wt_timestamp_set(&tmp_ts, &txn_global->oldest_timestamp);
+ else
+ __wt_timestamp_set_zero(&tmp_ts);
+
+ /* Check for a running checkpoint */
+ if (include_checkpoint &&
+ !__wt_timestamp_iszero(&txn_global->checkpoint_timestamp) &&
+ (__wt_timestamp_iszero(&tmp_ts) ||
+ __wt_timestamp_cmp(&txn_global->checkpoint_timestamp, &tmp_ts) <
+ 0))
+ __wt_timestamp_set(&tmp_ts, &txn_global->checkpoint_timestamp);
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /* Look for the oldest ordinary reader. */
+ __wt_readlock(session, &txn_global->read_timestamp_rwlock);
+ TAILQ_FOREACH(txn, &txn_global->read_timestamph, read_timestampq) {
+ /*
+ * Skip any transactions on the queue that are not active.
+ */
+ if (txn->clear_read_q)
+ continue;
+ /*
+ * A zero timestamp is possible here only when the oldest
+ * timestamp is not accounted for.
+ */
+ if (__wt_timestamp_iszero(&tmp_ts) ||
+ __wt_timestamp_cmp(&txn->read_timestamp, &tmp_ts) < 0)
+ __wt_timestamp_set(&tmp_ts, &txn->read_timestamp);
+ /*
+ * We break on the first active txn on the list.
+ */
+ break;
+ }
+ __wt_readunlock(session, &txn_global->read_timestamp_rwlock);
+
+ if (!include_oldest && __wt_timestamp_iszero(&tmp_ts))
+ return (WT_NOTFOUND);
+ __wt_timestamp_set(tsp, &tmp_ts);
+
+ return (0);
+}
+
+/*
* __txn_global_query_timestamp --
- * Query a timestamp.
+ * Query a timestamp on the global transaction.
*/
static int
__txn_global_query_timestamp(
@@ -226,7 +290,7 @@ __txn_global_query_timestamp(
__wt_readlock(session, &txn_global->commit_timestamp_rwlock);
TAILQ_FOREACH(txn, &txn_global->commit_timestamph,
commit_timestampq) {
- if (txn->clear_ts_queue)
+ if (txn->clear_commit_q)
continue;
__wt_timestamp_set(
@@ -247,28 +311,11 @@ __txn_global_query_timestamp(
return (WT_NOTFOUND);
WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
__wt_timestamp_set(&ts, &txn_global->oldest_timestamp));
- } else if (WT_STRING_MATCH("pinned", cval.str, cval.len)) {
- if (!txn_global->has_oldest_timestamp)
- return (WT_NOTFOUND);
- __wt_readlock(session, &txn_global->rwlock);
- __wt_timestamp_set(&ts, &txn_global->oldest_timestamp);
-
- /* Check for a running checkpoint */
- txn = txn_global->checkpoint_txn;
- if (txn_global->checkpoint_state.pinned_id != WT_TXN_NONE &&
- !__wt_timestamp_iszero(&txn->read_timestamp) &&
- __wt_timestamp_cmp(&txn->read_timestamp, &ts) < 0)
- __wt_timestamp_set(&ts, &txn->read_timestamp);
- __wt_readunlock(session, &txn_global->rwlock);
-
- /* Look for the oldest ordinary reader. */
- __wt_readlock(session, &txn_global->read_timestamp_rwlock);
- txn = TAILQ_FIRST(&txn_global->read_timestamph);
- if (txn != NULL &&
- __wt_timestamp_cmp(&txn->read_timestamp, &ts) < 0)
- __wt_timestamp_set(&ts, &txn->read_timestamp);
- __wt_readunlock(session, &txn_global->read_timestamp_rwlock);
- } else if (WT_STRING_MATCH("recovery", cval.str, cval.len))
+ } else if (WT_STRING_MATCH("oldest_reader", cval.str, cval.len))
+ WT_RET(__txn_get_pinned_timestamp(session, &ts, true, false));
+ else if (WT_STRING_MATCH("pinned", cval.str, cval.len))
+ WT_RET(__txn_get_pinned_timestamp(session, &ts, true, true));
+ else if (WT_STRING_MATCH("recovery", cval.str, cval.len))
/* Read-only value forever. No lock needed. */
__wt_timestamp_set(&ts, &txn_global->recovery_timestamp);
else if (WT_STRING_MATCH("stable", cval.str, cval.len)) {
@@ -283,24 +330,63 @@ __txn_global_query_timestamp(
done: __wt_timestamp_set(tsp, &ts);
return (0);
}
+
+/*
+ * __txn_query_timestamp --
+ * Query a timestamp within this session's transaction.
+ */
+static int
+__txn_query_timestamp(
+ WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ WT_STAT_CONN_INCR(session, session_query_ts);
+ if (!F_ISSET(txn, WT_TXN_RUNNING))
+ return (WT_NOTFOUND);
+
+ WT_RET(__wt_config_gets(session, cfg, "get", &cval));
+ if (WT_STRING_MATCH("commit", cval.str, cval.len))
+ __wt_timestamp_set(tsp, &txn->commit_timestamp);
+ else if (WT_STRING_MATCH("first_commit", cval.str, cval.len))
+ __wt_timestamp_set(tsp, &txn->first_commit_timestamp);
+ else if (WT_STRING_MATCH("prepare", cval.str, cval.len))
+ __wt_timestamp_set(tsp, &txn->prepare_timestamp);
+ else if (WT_STRING_MATCH("read", cval.str, cval.len))
+ __wt_timestamp_set(tsp, &txn->read_timestamp);
+ else
+ WT_RET_MSG(session, EINVAL,
+ "unknown timestamp query %.*s", (int)cval.len, cval.str);
+
+ return (0);
+}
#endif
/*
- * __wt_txn_global_query_timestamp --
- * Query a timestamp.
+ * __wt_txn_query_timestamp --
+ * Query a timestamp. The caller may query the global transaction or the
+ * session's transaction.
*/
int
-__wt_txn_global_query_timestamp(
- WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[])
+__wt_txn_query_timestamp(WT_SESSION_IMPL *session,
+ char *hex_timestamp, const char *cfg[], bool global_txn)
{
#ifdef HAVE_TIMESTAMPS
wt_timestamp_t ts;
- WT_RET(__txn_global_query_timestamp(session, &ts, cfg));
+ if (global_txn)
+ WT_RET(__txn_global_query_timestamp(session, &ts, cfg));
+ else
+ WT_RET(__txn_query_timestamp(session, &ts, cfg));
+
return (__wt_timestamp_to_hex_string(session, hex_timestamp, &ts));
#else
WT_UNUSED(hex_timestamp);
WT_UNUSED(cfg);
+ WT_UNUSED(global_txn);
WT_RET_MSG(session, ENOTSUP,
"requires a version of WiredTiger built with timestamp support");
@@ -320,8 +406,6 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force)
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t active_timestamp, last_pinned_timestamp;
wt_timestamp_t oldest_timestamp, pinned_timestamp;
- const char *query_cfg[] = { WT_CONFIG_BASE(session,
- WT_CONNECTION_query_timestamp), "get=pinned", NULL };
txn_global = &S2C(session)->txn_global;
@@ -334,13 +418,13 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force)
&oldest_timestamp, &txn_global->oldest_timestamp));
/* Scan to find the global pinned timestamp. */
- if ((ret = __txn_global_query_timestamp(
- session, &active_timestamp, query_cfg)) != 0)
+ if ((ret = __txn_get_pinned_timestamp(
+ session, &active_timestamp, false, true)) != 0)
return (ret == WT_NOTFOUND ? 0 : ret);
- if (__wt_timestamp_cmp(&oldest_timestamp, &active_timestamp) < 0) {
+ if (__wt_timestamp_cmp(&oldest_timestamp, &active_timestamp) < 0)
__wt_timestamp_set(&pinned_timestamp, &oldest_timestamp);
- } else
+ else
__wt_timestamp_set(&pinned_timestamp, &active_timestamp);
if (txn_global->has_pinned_timestamp && !force) {
@@ -362,6 +446,9 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force)
txn_global->oldest_is_pinned = __wt_timestamp_cmp(
&txn_global->pinned_timestamp,
&txn_global->oldest_timestamp) == 0;
+ txn_global->stable_is_pinned = __wt_timestamp_cmp(
+ &txn_global->pinned_timestamp,
+ &txn_global->stable_timestamp) == 0;
__wt_verbose_timestamp(session,
&pinned_timestamp, "Updated pinned timestamp");
}
@@ -721,16 +808,27 @@ __wt_txn_parse_prepare_timestamp(
__wt_readlock(session, &txn_global->read_timestamp_rwlock);
prev = TAILQ_LAST(&txn_global->read_timestamph,
__wt_txn_rts_qh);
- if (prev != NULL &&
- __wt_timestamp_cmp(&prev->read_timestamp, timestamp) >= 0) {
- __wt_readunlock(session,
- &txn_global->read_timestamp_rwlock);
- WT_RET(__wt_timestamp_to_hex_string(session,
- hex_timestamp, &prev->read_timestamp));
- WT_RET_MSG(session, EINVAL,
- "prepare timestamp %.*s not later than an active "
- "read timestamp %s ", (int)cval.len, cval.str,
- hex_timestamp);
+ while (prev != NULL) {
+ /*
+ * Skip any transactions that are not active.
+ */
+ if (prev->clear_read_q) {
+ prev = TAILQ_PREV(
+ prev, __wt_txn_rts_qh, read_timestampq);
+ continue;
+ }
+ if (__wt_timestamp_cmp(
+ &prev->read_timestamp, timestamp) >= 0) {
+ __wt_readunlock(session,
+ &txn_global->read_timestamp_rwlock);
+ WT_RET(__wt_timestamp_to_hex_string(session,
+ hex_timestamp, &prev->read_timestamp));
+ WT_RET_MSG(session, EINVAL,
+ "prepare timestamp %.*s not later than "
+ "an active read timestamp %s ",
+ (int)cval.len, cval.str, hex_timestamp);
+ }
+ break;
}
__wt_readunlock(session, &txn_global->read_timestamp_rwlock);
@@ -881,6 +979,7 @@ __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session)
WT_TXN *qtxn, *txn, *txn_tmp;
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t ts;
+ uint64_t walked;
txn = &session->txn;
txn_global = &S2C(session)->txn_global;
@@ -902,10 +1001,10 @@ __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session)
* finding where to insert ourselves (which would result in a list
* loop) and we don't want to walk more of the list than needed.
*/
- if (txn->clear_ts_queue) {
+ if (txn->clear_commit_q) {
TAILQ_REMOVE(&txn_global->commit_timestamph,
txn, commit_timestampq);
- WT_PUBLISH(txn->clear_ts_queue, false);
+ WT_PUBLISH(txn->clear_commit_q, false);
--txn_global->commit_timestampq_len;
}
/*
@@ -918,39 +1017,48 @@ __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session)
&txn_global->commit_timestamph, txn, commit_timestampq);
WT_STAT_CONN_INCR(session, txn_commit_queue_empty);
} else {
+ /* Walk from the start, removing cleared entries. */
+ walked = 0;
TAILQ_FOREACH_SAFE(qtxn, &txn_global->commit_timestamph,
commit_timestampq, txn_tmp) {
- if (qtxn->clear_ts_queue) {
- TAILQ_REMOVE(&txn_global->commit_timestamph,
- qtxn, commit_timestampq);
- WT_PUBLISH(qtxn->clear_ts_queue, false);
- --txn_global->commit_timestampq_len;
- continue;
- }
+ ++walked;
/*
- * Only walk the list up until we get to the place where
- * we want to insert our timestamp. Some other thread
- * will remove any later transactions.
+ * Stop on the first entry that we cannot clear.
*/
- if (__wt_timestamp_cmp(
- &qtxn->first_commit_timestamp, &ts) > 0)
+ if (!qtxn->clear_commit_q)
break;
+
+ TAILQ_REMOVE(&txn_global->commit_timestamph,
+ qtxn, commit_timestampq);
+ WT_PUBLISH(qtxn->clear_commit_q, false);
+ --txn_global->commit_timestampq_len;
}
+
/*
- * If we got to the end, then our timestamp is larger than
- * the last element's timestamp. Insert at the end.
+ * Now walk backwards from the end to find the correct position
+ * for the insert.
*/
+ qtxn = TAILQ_LAST(
+ &txn_global->commit_timestamph, __wt_txn_cts_qh);
+ while (qtxn != NULL && __wt_timestamp_cmp(
+ &qtxn->first_commit_timestamp, &ts) > 0) {
+ ++walked;
+ qtxn = TAILQ_PREV(
+ qtxn, __wt_txn_cts_qh, commit_timestampq);
+ }
if (qtxn == NULL) {
- TAILQ_INSERT_TAIL(&txn_global->commit_timestamph,
- txn, commit_timestampq);
- WT_STAT_CONN_INCR(session, txn_commit_queue_tail);
+ TAILQ_INSERT_HEAD(&txn_global->commit_timestamph,
+ txn, commit_timestampq);
+ WT_STAT_CONN_INCR(session, txn_commit_queue_head);
} else
- TAILQ_INSERT_BEFORE(qtxn, txn, commit_timestampq);
+ TAILQ_INSERT_AFTER(&txn_global->commit_timestamph,
+ qtxn, txn, commit_timestampq);
+ WT_STAT_CONN_INCRV(session, txn_commit_queue_walked, walked);
}
__wt_timestamp_set(&txn->first_commit_timestamp, &ts);
++txn_global->commit_timestampq_len;
WT_STAT_CONN_INCR(session, txn_commit_queue_inserts);
- txn->clear_ts_queue = false;
+ txn->clear_commit_q = false;
F_SET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_PUBLIC_TS_COMMIT);
__wt_writeunlock(session, &txn_global->commit_timestamp_rwlock);
}
@@ -977,7 +1085,7 @@ __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session)
* cleaned up safely from the commit timestamp queue whenever the next
* thread walks the queue. We do not need to remove it now.
*/
- WT_PUBLISH(txn->clear_ts_queue, true);
+ WT_PUBLISH(txn->clear_commit_q, true);
WT_PUBLISH(txn->flags, flags);
}
@@ -988,8 +1096,9 @@ __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session)
void
__wt_txn_set_read_timestamp(WT_SESSION_IMPL *session)
{
- WT_TXN *prev, *txn;
+ WT_TXN *qtxn, *txn, *txn_tmp;
WT_TXN_GLOBAL *txn_global;
+ uint64_t walked;
txn = &session->txn;
txn_global = &S2C(session)->txn_global;
@@ -998,24 +1107,73 @@ __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session)
return;
__wt_writelock(session, &txn_global->read_timestamp_rwlock);
- prev = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh);
- if (prev == NULL)
- WT_STAT_CONN_INCR(session, txn_read_queue_empty);
- for (; prev != NULL && __wt_timestamp_cmp(
- &prev->read_timestamp, &txn->read_timestamp) > 0;
- prev = TAILQ_PREV(prev, __wt_txn_rts_qh, read_timestampq))
- ;
- if (prev == NULL) {
+ /*
+ * If our transaction is on the queue remove it first. The timestamp
+ * may move earlier so we otherwise might not remove ourselves before
+ * finding where to insert ourselves (which would result in a list
+ * loop) and we don't want to walk more of the list than needed.
+ */
+ if (txn->clear_read_q) {
+ TAILQ_REMOVE(&txn_global->read_timestamph,
+ txn, read_timestampq);
+ WT_PUBLISH(txn->clear_read_q, false);
+ --txn_global->read_timestampq_len;
+ }
+ /*
+ * Walk the list to look for where to insert our own transaction
+ * and remove any transactions that are not active. We stop when
+ * we get to the location where we want to insert.
+ */
+ if (TAILQ_EMPTY(&txn_global->read_timestamph)) {
TAILQ_INSERT_HEAD(
&txn_global->read_timestamph, txn, read_timestampq);
- WT_STAT_CONN_INCR(session, txn_read_queue_head);
- } else
- TAILQ_INSERT_AFTER(
- &txn_global->read_timestamph, prev, txn, read_timestampq);
+ WT_STAT_CONN_INCR(session, txn_read_queue_empty);
+ } else {
+ /* Walk from the start, removing cleared entries. */
+ walked = 0;
+ TAILQ_FOREACH_SAFE(qtxn, &txn_global->read_timestamph,
+ read_timestampq, txn_tmp) {
+ ++walked;
+ if (!qtxn->clear_read_q)
+ break;
+
+ TAILQ_REMOVE(&txn_global->read_timestamph,
+ qtxn, read_timestampq);
+ WT_PUBLISH(qtxn->clear_read_q, false);
+ --txn_global->read_timestampq_len;
+ }
+
+ /*
+ * Now walk backwards from the end to find the correct position
+ * for the insert.
+ */
+ qtxn = TAILQ_LAST(
+ &txn_global->read_timestamph, __wt_txn_rts_qh);
+ while (qtxn != NULL &&
+ __wt_timestamp_cmp(&qtxn->read_timestamp,
+ &txn->read_timestamp) > 0) {
+ ++walked;
+ qtxn = TAILQ_PREV(
+ qtxn, __wt_txn_rts_qh, read_timestampq);
+ }
+ if (qtxn == NULL) {
+ TAILQ_INSERT_HEAD(&txn_global->read_timestamph,
+ txn, read_timestampq);
+ WT_STAT_CONN_INCR(session, txn_read_queue_head);
+ } else
+ TAILQ_INSERT_AFTER(&txn_global->read_timestamph,
+ qtxn, txn, read_timestampq);
+ WT_STAT_CONN_INCRV(session, txn_read_queue_walked, walked);
+ }
+ /*
+ * We do not set the read timestamp here. It has been set in the caller
+ * because special processing for round to oldest.
+ */
++txn_global->read_timestampq_len;
WT_STAT_CONN_INCR(session, txn_read_queue_inserts);
- __wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
+ txn->clear_read_q = false;
F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ);
+ __wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
}
/*
@@ -1026,29 +1184,80 @@ void
__wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
+ uint32_t flags;
txn = &session->txn;
- txn_global = &S2C(session)->txn_global;
if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
return;
#ifdef HAVE_DIAGNOSTIC
{
+ WT_TXN_GLOBAL *txn_global;
wt_timestamp_t pinned_ts;
+ txn_global = &S2C(session)->txn_global;
WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
__wt_timestamp_set(&pinned_ts, &txn_global->pinned_timestamp));
WT_ASSERT(session,
__wt_timestamp_cmp(&txn->read_timestamp, &pinned_ts) >= 0);
}
#endif
+ flags = txn->flags;
+ LF_CLR(WT_TXN_PUBLIC_TS_READ);
- __wt_writelock(session, &txn_global->read_timestamp_rwlock);
- TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq);
- --txn_global->read_timestampq_len;
- __wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
- F_CLR(txn, WT_TXN_PUBLIC_TS_READ);
+ /*
+ * Notify other threads that our transaction is inactive and can be
+ * cleaned up safely from the read timestamp queue whenever the
+ * next thread walks the queue. We do not need to remove it now.
+ */
+ WT_PUBLISH(txn->clear_read_q, true);
+ WT_PUBLISH(txn->flags, flags);
}
#endif
+
+/*
+ * __wt_txn_clear_timestamp_queues --
+ * We're about to clear the session and overwrite the txn structure.
+ * Remove ourselves from the commit timestamp queue and the read
+ * timestamp queue if we're on either of them.
+ */
+void
+__wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+
+ if (!txn->clear_commit_q && !txn->clear_read_q)
+ return;
+
+ if (txn->clear_commit_q) {
+ __wt_writelock(session, &txn_global->commit_timestamp_rwlock);
+ /*
+ * Recheck after acquiring the lock.
+ */
+ if (txn->clear_commit_q) {
+ TAILQ_REMOVE(&txn_global->commit_timestamph,
+ txn, commit_timestampq);
+ --txn_global->commit_timestampq_len;
+ txn->clear_commit_q = false;
+ }
+ __wt_writeunlock(session, &txn_global->commit_timestamp_rwlock);
+ }
+ if (txn->clear_read_q) {
+ __wt_writelock(session, &txn_global->read_timestamp_rwlock);
+ /*
+ * Recheck after acquiring the lock.
+ */
+ if (txn->clear_read_q) {
+ TAILQ_REMOVE(
+ &txn_global->read_timestamph, txn, read_timestampq);
+ --txn_global->read_timestampq_len;
+ txn->clear_read_q = false;
+ }
+ __wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
+ }
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_alter.c b/src/third_party/wiredtiger/src/utilities/util_alter.c
index b57c2d7b090..152dd1b8c03 100644
--- a/src/third_party/wiredtiger/src/utilities/util_alter.c
+++ b/src/third_party/wiredtiger/src/utilities/util_alter.c
@@ -31,8 +31,7 @@ util_alter(WT_SESSION *session, int argc, char *argv[])
if (argc % 2 != 0)
return (usage());
- for (configp = argv;
- configp != NULL && *configp != NULL; configp += 2)
+ for (configp = argv; *configp != NULL; configp += 2)
if ((ret = session->alter(
session, configp[0], configp[1])) != 0) {
(void)util_err(session, ret,
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c
index 2b210419c78..ea816675dee 100644
--- a/src/third_party/wiredtiger/src/utilities/util_load.c
+++ b/src/third_party/wiredtiger/src/utilities/util_load.c
@@ -138,7 +138,7 @@ load_dump(WT_SESSION *session)
* Check the append flag (it only applies to objects where the primary
* key is a record number).
*/
- if (append && strcmp(cursor->key_format, "r") != 0) {
+ if (append && !WT_STREQ(cursor->key_format, "r")) {
fprintf(stderr,
"%s: %s: -a option illegal unless the primary key is a "
"record number\n",
@@ -237,27 +237,33 @@ config_read(WT_SESSION *session, char ***listp, bool *hexp)
memset(&l, 0, sizeof(l));
/* Header line #1: "WiredTiger Dump" and a WiredTiger version. */
- if (util_read_line(session, &l, false, &eof))
- return (1);
+ if ((ret = util_read_line(session, &l, false, &eof)) != 0)
+ goto err;
s = "WiredTiger Dump ";
- if (strncmp(l.mem, s, strlen(s)) != 0)
- return (format(session));
+ if (strncmp(l.mem, s, strlen(s)) != 0) {
+ ret = format(session);
+ goto err;
+ }
/* Header line #2: "Format={hex,print}". */
- if (util_read_line(session, &l, false, &eof))
- return (1);
+ if ((ret = util_read_line(session, &l, false, &eof)) != 0)
+ goto err;
if (strcmp(l.mem, "Format=print") == 0)
*hexp = false;
else if (strcmp(l.mem, "Format=hex") == 0)
*hexp = true;
- else
- return (format(session));
+ else {
+ ret = format(session);
+ goto err;
+ }
/* Header line #3: "Header". */
- if (util_read_line(session, &l, false, &eof))
- return (1);
- if (strcmp(l.mem, "Header") != 0)
- return (format(session));
+ if ((ret = util_read_line(session, &l, false, &eof)) != 0)
+ goto err;
+ if (strcmp(l.mem, "Header") != 0) {
+ ret = format(session);
+ goto err;
+ }
/* Now, read in lines until we get to the end of the headers. */
for (entry = max_entry = 0, list = NULL;; ++entry) {
@@ -297,6 +303,8 @@ config_read(WT_SESSION *session, char ***listp, bool *hexp)
goto err;
}
*listp = list;
+
+ free(l.mem);
return (0);
err: if (list != NULL) {
@@ -304,6 +312,7 @@ err: if (list != NULL) {
free(*tlist);
free(list);
}
+ free(l.mem);
return (ret);
}
@@ -542,20 +551,21 @@ insert(WT_CURSOR *cursor, const char *name)
* and ignore it (a dump with "append" set), or not read it at
* all (flat-text load).
*/
- if (util_read_line(session, &key, true, &eof))
- return (1);
+ if ((ret = util_read_line(session, &key, true, &eof)) != 0)
+ goto err;
if (eof)
break;
if (!append)
cursor->set_key(cursor, key.mem);
- if (util_read_line(session, &value, false, &eof))
- return (1);
+ if ((ret = util_read_line(session, &value, false, &eof)) != 0)
+ goto err;
cursor->set_value(cursor, value.mem);
- if ((ret = cursor->insert(cursor)) != 0)
- return (
- util_err(session, ret, "%s: cursor.insert", name));
+ if ((ret = cursor->insert(cursor)) != 0) {
+ ret = util_err(session, ret, "%s: cursor.insert", name);
+ goto err;
+ }
/* Report on progress every 100 inserts. */
if (verbose && ++insert_count % 100 == 0) {
@@ -567,7 +577,10 @@ insert(WT_CURSOR *cursor, const char *name)
if (verbose)
printf("\r\t%s: %" PRIu64 "\n", name, insert_count);
- return (0);
+err: free(key.mem);
+ free(value.mem);
+
+ return (ret);
}
static int
diff --git a/src/third_party/wiredtiger/src/utilities/util_load_json.c b/src/third_party/wiredtiger/src/utilities/util_load_json.c
index 91358fd29f0..b6e63fef784 100644
--- a/src/third_party/wiredtiger/src/utilities/util_load_json.c
+++ b/src/third_party/wiredtiger/src/utilities/util_load_json.c
@@ -254,7 +254,7 @@ json_data(WT_SESSION *session,
goto err;
}
keyformat = cursor->key_format;
- isrec = strcmp(keyformat, "r") == 0;
+ isrec = WT_STREQ(keyformat, "r");
for (nkeys = 0; *keyformat; keyformat++)
if (!__wt_isdigit((u_char)*keyformat))
nkeys++;
@@ -599,16 +599,16 @@ util_load_json(WT_SESSION *session, const char *filename, uint32_t flags)
memset(&instate, 0, sizeof(instate));
instate.session = session;
- if (util_read_line(session, &instate.line, false, &instate.ateof))
- return (1);
- instate.p = (const char *)instate.line.mem;
- instate.linenum = 1;
- instate.filename = filename;
+ if ((ret = util_read_line(
+ session, &instate.line, false, &instate.ateof)) == 0) {
+ instate.p = (const char *)instate.line.mem;
+ instate.linenum = 1;
+ instate.filename = filename;
- if ((ret = json_top_level(session, &instate, flags)) != 0)
- goto err;
+ ret = json_top_level(session, &instate, flags);
+ }
-err: free(instate.line.mem);
+ free(instate.line.mem);
free(instate.kvraw);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/utilities/util_loadtext.c b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
index 1519a0e4fa5..7271fbedf34 100644
--- a/src/third_party/wiredtiger/src/utilities/util_loadtext.c
+++ b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
@@ -74,13 +74,13 @@ text(WT_SESSION *session, const char *uri)
* Row-store tables have key/value pairs, column-store tables only have
* values.
*/
- if (strcmp(cursor->value_format, "S") != 0 ||
- (strcmp(cursor->key_format, "S") != 0 &&
- strcmp(cursor->key_format, "r") != 0))
+ if (!WT_STREQ(cursor->value_format, "S") ||
+ (!WT_STREQ(cursor->key_format, "S") &&
+ !WT_STREQ(cursor->key_format, "r")))
return (util_err(session, EINVAL,
"the loadtext command can only load objects configured "
"for record number or string keys, and string values"));
- readkey = strcmp(cursor->key_format, "r") != 0;
+ readkey = !WT_STREQ(cursor->key_format, "r");
/* Insert the records */
ret = insert(cursor, uri, readkey);
@@ -150,6 +150,8 @@ insert(WT_CURSOR *cursor, const char *name, bool readkey)
fflush(stdout);
}
}
+ free(key.mem);
+ free(value.mem);
if (verbose)
printf("\r\t%s: %" PRIu64 "\n", name, insert_count);
diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c
index 2d08c4c5274..7042736e2a2 100644
--- a/src/third_party/wiredtiger/src/utilities/util_main.c
+++ b/src/third_party/wiredtiger/src/utilities/util_main.c
@@ -11,7 +11,7 @@
const char *home = "."; /* Home directory */
const char *progname; /* Program name */
/* Global arguments */
-const char *usage_prefix = "[-LRVv] [-C config] [-E secretkey] [-h home]";
+const char *usage_prefix = "[-LRSVv] [-C config] [-E secretkey] [-h home]";
bool verbose = false; /* Verbose flag */
static const char *command; /* Command name */
@@ -19,6 +19,7 @@ static const char *command; /* Command name */
#define REC_ERROR "log=(recover=error)"
#define REC_LOGOFF "log=(enabled=false)"
#define REC_RECOVER "log=(recover=on)"
+#define REC_SALVAGE "log=(recover=salvage)"
static void
usage(void)
@@ -70,7 +71,7 @@ main(int argc, char *argv[])
int ch, major_v, minor_v, tret, (*func)(WT_SESSION *, int, char *[]);
const char *cmd_config, *config, *p1, *p2, *p3, *rec_config;
char *p, *secretkey;
- bool logoff, needconn, recover;
+ bool logoff, needconn, recover, salvage;
conn = NULL;
p = NULL;
@@ -105,9 +106,9 @@ main(int argc, char *argv[])
* needed, the user can specify -R to run recovery.
*/
rec_config = REC_ERROR;
- logoff = recover = false;
+ logoff = recover = salvage = false;
/* Check for standard options. */
- while ((ch = __wt_getopt(progname, argc, argv, "C:E:h:LRVv")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "C:E:h:LRSVv")) != EOF)
switch (ch) {
case 'C': /* wiredtiger_open config */
cmd_config = __wt_optarg;
@@ -131,6 +132,10 @@ main(int argc, char *argv[])
rec_config = REC_RECOVER;
recover = true;
break;
+ case 'S': /* salvage */
+ rec_config = REC_SALVAGE;
+ salvage = true;
+ break;
case 'V': /* version */
printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
goto done;
@@ -142,8 +147,9 @@ main(int argc, char *argv[])
usage();
goto err;
}
- if (logoff && recover) {
- fprintf(stderr, "Only one of -L and -R is allowed.\n");
+ if ((logoff && recover) || (logoff && salvage) ||
+ (recover && salvage)) {
+ fprintf(stderr, "Only one of -L, -R, and -S is allowed.\n");
goto err;
}
argc -= __wt_optind;
diff --git a/src/third_party/wiredtiger/src/utilities/util_read.c b/src/third_party/wiredtiger/src/utilities/util_read.c
index ab2b2a79968..26f3cb5394f 100644
--- a/src/third_party/wiredtiger/src/utilities/util_read.c
+++ b/src/third_party/wiredtiger/src/utilities/util_read.c
@@ -51,16 +51,16 @@ util_read(WT_SESSION *session, int argc, char *argv[])
* A simple search only makes sense if the key format is a string or a
* record number, and the value format is a single string.
*/
- if (strcmp(cursor->key_format, "r") != 0 &&
- strcmp(cursor->key_format, "S") != 0) {
+ if (!WT_STREQ(cursor->key_format, "r") &&
+ !WT_STREQ(cursor->key_format, "S")) {
fprintf(stderr,
"%s: read command only possible when the key format is "
"a record number or string\n",
progname);
return (1);
}
- rkey = strcmp(cursor->key_format, "r") == 0;
- if (strcmp(cursor->value_format, "S") != 0) {
+ rkey = WT_STREQ(cursor->key_format, "r");
+ if (!WT_STREQ(cursor->value_format, "S")) {
fprintf(stderr,
"%s: read command only possible when the value format is "
"a string\n",
diff --git a/src/third_party/wiredtiger/src/utilities/util_write.c b/src/third_party/wiredtiger/src/utilities/util_write.c
index da958f86c2d..5d460367adc 100644
--- a/src/third_party/wiredtiger/src/utilities/util_write.c
+++ b/src/third_party/wiredtiger/src/utilities/util_write.c
@@ -71,16 +71,16 @@ util_write(WT_SESSION *session, int argc, char *argv[])
* A simple search only makes sense if the key format is a string or a
* record number, and the value format is a single string.
*/
- if (strcmp(cursor->key_format, "r") != 0 &&
- strcmp(cursor->key_format, "S") != 0) {
+ if (!WT_STREQ(cursor->key_format, "r") &&
+ !WT_STREQ(cursor->key_format, "S")) {
fprintf(stderr,
"%s: write command only possible when the key format is "
"a record number or string\n",
progname);
return (1);
}
- rkey = strcmp(cursor->key_format, "r") == 0;
- if (strcmp(cursor->value_format, "S") != 0) {
+ rkey = WT_STREQ(cursor->key_format, "r");
+ if (!WT_STREQ(cursor->value_format, "S")) {
fprintf(stderr,
"%s: write command only possible when the value format is "
"a string\n",
diff --git a/src/third_party/wiredtiger/test/bloom/test_bloom.c b/src/third_party/wiredtiger/test/bloom/test_bloom.c
index dcc7ab372a9..f3072b1860a 100644
--- a/src/third_party/wiredtiger/test/bloom/test_bloom.c
+++ b/src/third_party/wiredtiger/test/bloom/test_bloom.c
@@ -89,7 +89,8 @@ main(int argc, char *argv[])
}
argc -= __wt_optind;
- argv += __wt_optind;
+ if (argc != 0)
+ usage();
setup();
run();
diff --git a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
index 170cde22fa1..99468504129 100644
--- a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
+++ b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
@@ -53,7 +53,7 @@ start_checkpoints(void)
void
end_checkpoints(void)
{
- testutil_check(__wt_thread_join(NULL, g.checkpoint_thread));
+ testutil_check(__wt_thread_join(NULL, &g.checkpoint_thread));
}
/*
diff --git a/src/third_party/wiredtiger/test/checkpoint/workers.c b/src/third_party/wiredtiger/test/checkpoint/workers.c
index c72b4b897b1..b57dad86b9f 100644
--- a/src/third_party/wiredtiger/test/checkpoint/workers.c
+++ b/src/third_party/wiredtiger/test/checkpoint/workers.c
@@ -103,7 +103,7 @@ start_workers(table_type type)
/* Wait for the threads. */
for (i = 0; i < g.nworkers; ++i)
- testutil_check(__wt_thread_join(NULL, tids[i]));
+ testutil_check(__wt_thread_join(NULL, &tids[i]));
(void)gettimeofday(&stop, NULL);
seconds = (stop.tv_sec - start.tv_sec) +
diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am
index b3dde5ec628..e625f2e4bfe 100644
--- a/src/third_party/wiredtiger/test/csuite/Makefile.am
+++ b/src/third_party/wiredtiger/test/csuite/Makefile.am
@@ -11,10 +11,18 @@ test_random_abort_SOURCES = random_abort/main.c
noinst_PROGRAMS += test_random_abort
all_TESTS += random_abort/smoke.sh
+test_random_directio_SOURCES = random_directio/main.c
+noinst_PROGRAMS += test_random_directio
+all_TESTS += random_directio/smoke.sh
+
test_rwlock_SOURCES = rwlock/main.c
noinst_PROGRAMS += test_rwlock
all_TESTS += test_rwlock
+test_schema_abort_SOURCES = schema_abort/main.c
+noinst_PROGRAMS += test_schema_abort
+all_TESTS += schema_abort/smoke.sh
+
test_scope_SOURCES = scope/main.c
noinst_PROGRAMS += test_scope
all_TESTS += test_scope
@@ -111,6 +119,10 @@ test_wt4117_checksum_SOURCES = wt4117_checksum/main.c
noinst_PROGRAMS += test_wt4117_checksum
all_TESTS += test_wt4117_checksum
+test_wt4156_metadata_salvage_SOURCES = wt4156_metadata_salvage/main.c
+noinst_PROGRAMS += test_wt4156_metadata_salvage
+all_TESTS += test_wt4156_metadata_salvage
+
# Run this during a "make check" smoke test.
TESTS = $(all_TESTS)
LOG_COMPILER = $(TEST_WRAPPER)
diff --git a/src/third_party/wiredtiger/test/csuite/random_abort/main.c b/src/third_party/wiredtiger/test/csuite/random_abort/main.c
index e99ed5ecd4d..7834d88a780 100644
--- a/src/third_party/wiredtiger/test/csuite/random_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/random_abort/main.c
@@ -37,7 +37,6 @@ static char home[1024]; /* Program working dir */
* These two names for the URI and file system must be maintained in tandem.
*/
static const char * const uri = "table:main";
-static const char * const fs_main = "main.wt";
static bool compat;
static bool inmem;
@@ -201,7 +200,7 @@ fill_db(uint32_t nth)
* it is killed.
*/
for (i = 0; i < nth; ++i)
- testutil_check(__wt_thread_join(NULL, thr[i]));
+ testutil_check(__wt_thread_join(NULL, &thr[i]));
/*
* NOTREACHED
*/
@@ -280,7 +279,6 @@ main(int argc, char *argv[])
usage();
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
usage();
@@ -312,6 +310,11 @@ main(int argc, char *argv[])
compat ? "true" : "false", inmem ? "true" : "false");
printf("Parent: Create %" PRIu32
" threads; sleep %" PRIu32 " seconds\n", nth, timeout);
+ printf("CONFIG: %s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n",
+ progname,
+ compat ? " -C" : "",
+ inmem ? " -m" : "",
+ working_dir, nth, timeout);
/*
* Fork a child to insert as many items. We will then randomly
* kill the child, run recovery and make sure all items we wrote
@@ -332,13 +335,22 @@ main(int argc, char *argv[])
/*
* Sleep for the configured amount of time before killing
* the child. Start the timeout from the time we notice that
- * the table has been created. That allows the test to run
- * correctly on really slow machines.
+ * the child workers have created their record files. That
+ * allows the test to run correctly on really slow machines.
*/
- testutil_check(__wt_snprintf(
- buf, sizeof(buf), "%s/%s", home, fs_main));
- while (stat(buf, &sb) != 0)
- sleep(1);
+ i = 0;
+ while (i < nth) {
+ /*
+ * Wait for each record file to exist.
+ */
+ testutil_check(__wt_snprintf(
+ fname, sizeof(fname), RECORDS_FILE, i));
+ testutil_check(__wt_snprintf(
+ buf, sizeof(buf),"%s/%s", home, fname));
+ while (stat(buf, &sb) != 0)
+ sleep(1);
+ ++i;
+ }
sleep(timeout);
sa.sa_handler = SIG_DFL;
testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
diff --git a/src/third_party/wiredtiger/test/csuite/random_directio/main.c b/src/third_party/wiredtiger/test/csuite/random_directio/main.c
new file mode 100644
index 00000000000..83c0ae46ef3
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/random_directio/main.c
@@ -0,0 +1,1291 @@
+/*-
+ * Public Domain 2014-2018 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This test simulates system crashes. It uses direct IO, and currently
+ * runs only on Linux.
+ *
+ * Our strategy is to run a subordinate 'writer' process that creates/modifies
+ * data, including schema modifications. Every N seconds, asynchronously, we
+ * send a stop signal to the writer and then copy (with direct IO) the entire
+ * contents of its database home to a new saved location where we can run and
+ * verify the recovered home. Then we send a continue signal. We repeat this:
+ *
+ * sleep N, STOP, copy, run recovery, CONTINUE
+ *
+ * which allows the writer to make continuing progress, while the main
+ * process is verifying what's on disk.
+ *
+ * By using stop signal to suspend the process and copying with direct IO,
+ * we are roughly simulating a system crash, by seeing what's actually on
+ * disk (not in file system buffer cache) at the moment that the copy is
+ * made. It's not quite as harsh as a system crash, as suspending does not
+ * halt writes that are in-flight. Still, it's a reasonable proxy for testing.
+ *
+ * In the main table, the keys look like:
+ *
+ * xxxx:T:LARGE_STRING
+ *
+ * where xxxx represents an increasing decimal id (0 padded to 12 digits).
+ * These ids are only unique per thread, so this key is the xxxx-th key
+ * written by a thread. T represents the thread id reduced to a single
+ * hex digit. LARGE_STRING is a portion of a large string that includes
+ * the thread id and a lot of spaces, over and over (see the large_buf
+ * function). When forming the key, the large string is truncated so
+ * that the key is effectively padded to the right length.
+ *
+ * The key space for the main table is designed to be interleaved tightly
+ * among all the threads. The matching values in the main table are the
+ * same, except with the xxxx string reversed. So the keys and values
+ * are the same size.
+ *
+ * There is also a reverse table where the keys/values are swapped.
+ */
+
+#include "test_util.h"
+
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/wait.h>
+
+static char home[1024]; /* Program working dir */
+
+/*
+ * These two names for the URI and file system must be maintained in tandem.
+ */
+static const char * const uri_main = "table:main";
+static const char * const fs_main = "main.wt";
+
+static const char * const uri_rev = "table:rev";
+
+/*
+ * The number of threads cannot be more than 16, we are using a hex digit
+ * to encode this in the key.
+ */
+#define MAX_TH 16
+#define MIN_TH 5
+
+#define MAX_TIME 40
+#define MIN_TIME 10
+
+#define LARGE_WRITE_SIZE (128*1024)
+#define MIN_DATA_SIZE 30
+#define DEFAULT_DATA_SIZE 50
+
+#define DEFAULT_CYCLES 5
+#define DEFAULT_INTERVAL 3
+
+#define KEY_SEP "_" /* Must be one char string */
+
+#define ENV_CONFIG \
+ "create,log=(file_max=10M,enabled)," \
+ "transaction_sync=(enabled,method=%s)"
+#define ENV_CONFIG_REC "log=(recover=on)"
+
+/* 64 spaces */
+#define SPACES \
+ " "
+
+/*
+ * Set the "schema operation frequency" higher to be less stressful for schema
+ * operations. With the current value, 100, there are sequences of schema
+ * operations that are begun when the id is in the range 0 to 9, 100 to 109,
+ * 200 to 209, etc. That is, 10 sequences per 100. A higher number (say 1000)
+ * means there are 10 sequences started per 1000. A sequence of schema
+ * operations lasts for 4 ids. So, for example, if thread 3 is inserting id
+ * 100 into the main table, an additional schema operation is done (creating a
+ * table), and operations on this table continue (while other schema operations
+ * continue).
+ *
+ * Starting at the insert of id 99 (which has no schema operations), here's
+ * what will happen (for thread #3).
+ *
+ * insert k/v 99 into table:main (with no additional schema operations)
+ *
+ * insert k/v 100 into table:main
+ * create table:A100-3 (3 for thread #3)
+ *
+ * insert k/v 101 into table:main
+ * insert into table:A100-3 (continuing the sequence)
+ * create table:A101-3 (starts a new sequence)
+ *
+ * insert k/v 102 into table:main
+ * rename table:A100-3 -> table:B100-3 (third step in sequence)
+ * insert into table:A101-3 (second step in sequence)
+ * create table:A102-3 (starting new sequence)
+ *
+ * insert k/v 103 into table:main
+ * update key in table:B100-3 (fourth step)
+ * rename table:A101-3 -> table:B101-3 (third step)
+ * insert into table:A102-3
+ * create table:A103-3
+ *
+ * insert k/v 104 into table:main
+ * drop table:B100-3 (fifth and last step)
+ * update key in table:B101-3 (fourth step)
+ * rename table:A102-3 -> table:B102-3
+ * insert into table:A103-3
+ * create table:A104-3
+ * ...
+ *
+ * This continues, with the last table created when k/v 109 is inserted into
+ * table:main and the last sequence finishing at k/v 113. Each clump above
+ * separated by a blank line represents a transaction. Meanwhile, other
+ * threads are doing the same thing. That stretch, from id 100 to id 113
+ * that has schema operations happens again at id 200, assuming frequency
+ * set to 100. So it is a good test of schema operations 'in flight'.
+ */
+#define SCHEMA_OP_FREQUENCY 100
+
+#define TEST_STREQ(expect, got, message) \
+ do { \
+ if (!WT_STREQ(expect, got)) { \
+ printf("FAIL: %s: expect %s, got %s", message, \
+ expect, got); \
+ testutil_assert(WT_STREQ(expect, got)); \
+ } \
+ } while (0)
+
+/*
+ * Values for flags used in various places.
+ */
+#define SCHEMA_CREATE 0x0001
+#define SCHEMA_CREATE_CHECK 0x0002
+#define SCHEMA_DATA_CHECK 0x0004
+#define SCHEMA_DROP 0x0008
+#define SCHEMA_DROP_CHECK 0x0010
+#define SCHEMA_RENAME 0x0020
+#define SCHEMA_VERBOSE 0x0040
+#define SCHEMA_ALL \
+ (SCHEMA_CREATE | SCHEMA_CREATE_CHECK | \
+ SCHEMA_DATA_CHECK | SCHEMA_DROP | \
+ SCHEMA_DROP_CHECK | SCHEMA_RENAME)
+
+extern int __wt_optind;
+extern char *__wt_optarg;
+
+static void handler(int);
+
+typedef struct {
+ WT_CONNECTION *conn;
+ char *data;
+ uint32_t datasize;
+ uint32_t id;
+
+ uint32_t flags; /* Uses SCHEMA_* values above */
+} WT_THREAD_DATA;
+
+/*
+ * usage --
+ * Print usage and exit.
+ */
+static void usage(void)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+static void
+usage(void)
+{
+ fprintf(stderr, "usage: %s [options]\n", progname);
+ fprintf(stderr, "options:\n");
+ fprintf(stderr, " %-20s%s\n", "-d data_size",
+ "approximate size of keys and values [1000]");
+ fprintf(stderr, " %-20s%s\n", "-h home",
+ "WiredTiger home directory [WT_TEST.directio]");
+ fprintf(stderr, " %-20s%s\n", "-i interval",
+ "interval timeout between copy/recover cycles [3]");
+ fprintf(stderr, " %-20s%s\n", "-m method",
+ "sync method: fsync, dsync, none [none]");
+ fprintf(stderr, " %-20s%s\n", "-n num_cycles",
+ "number of copy/recover cycles [5]");
+ fprintf(stderr, " %-20s%s\n", "-p", "populate only [false]");
+ fprintf(stderr, " %-20s%s\n", "-S arg1,arg2,...",
+ "comma separated schema operations, from the following:");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "none",
+ "no schema operations [default]");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "all",
+ "all of the below operations, except verbose");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "create",
+ "create tables");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "create_check",
+ "newly created tables are checked (requires create)");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "data_check",
+ "check contents of files for various ops (requires create)");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "rename",
+ "rename tables (requires create)");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "drop",
+ "drop tables (requires create)");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "drop_check",
+ "after recovery, dropped tables are checked (requires drop)");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "",
+ "that they no longer exist (requires drop)");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "verbose",
+ "verbose print during schema operation checks,");
+ fprintf(stderr, " %-5s%-15s%s\n", "", "",
+ "done after recovery, so does not effect test timing");
+ fprintf(stderr, " %-20s%s\n", "-T num_threads",
+ "number of threads in writer [random]");
+ fprintf(stderr, " %-20s%s\n", "-t timeout",
+ "initial timeout before first copy [random]");
+ fprintf(stderr, " %-20s%s\n", "-v", "verify only [false]");
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * has_schema_operation --
+ * Return true if a schema operation should be performed for this id.
+ * See the comment above describing schema operation frequency.
+ */
+static bool
+has_schema_operation(uint64_t id, uint32_t offset)
+{
+ return (id >= offset &&
+ (id - offset) % SCHEMA_OP_FREQUENCY < 10);
+}
+
+/*
+ * large_buf --
+ * Fill or check a large buffer.
+ */
+static void
+large_buf(char *large, size_t lsize, uint32_t id, bool fill)
+{
+ size_t len;
+ uint64_t i;
+ char lgbuf[1024 + 20];
+
+ /*
+ * Set up a large value putting our id in it every 1024 bytes or so.
+ */
+ testutil_check(__wt_snprintf(
+ lgbuf, sizeof(lgbuf), "th-%" PRIu32
+ "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", id,
+ SPACES, SPACES, SPACES, SPACES,
+ SPACES, SPACES, SPACES, SPACES,
+ SPACES, SPACES, SPACES, SPACES,
+ SPACES, SPACES, SPACES, SPACES));
+
+ len = strlen(lgbuf);
+ for (i = 0; i < lsize - len; i += len)
+ if (fill)
+ testutil_check(__wt_snprintf(
+ &large[i], lsize - i, "%s", lgbuf));
+ else
+ testutil_check(strncmp(&large[i], lgbuf, len));
+}
+
+/*
+ * reverse --
+ * Reverse a string in place.
+ */
+static void
+reverse(char *s)
+{
+ size_t i, j, len;
+ char tmp;
+
+ len = strlen(s);
+ for (i = 0, j = len - 1; i < len / 2; i++, j--) {
+ tmp = s[i];
+ s[i] = s[j];
+ s[j] = tmp;
+ }
+}
+
+/*
+ * gen_kv --
+ * Generate a key/value.
+ */
+static void
+gen_kv(char *buf, size_t buf_size, uint64_t id, uint32_t threadid,
+ const char *large, bool forward)
+{
+ size_t keyid_size, large_size;
+ char keyid[64];
+
+ testutil_check(__wt_snprintf(keyid, sizeof(keyid),
+ "%10.10" PRIu64, id));
+ keyid_size = strlen(keyid);
+ if (!forward)
+ reverse(keyid);
+ testutil_assert(keyid_size + 4 <= buf_size);
+ large_size = buf_size - 4 - keyid_size;
+ testutil_check(__wt_snprintf(buf, buf_size,
+ "%s" KEY_SEP "%1.1x" KEY_SEP "%.*s",
+ keyid, threadid, (int)large_size, large));
+}
+
+/*
+ * gen_table_name --
+ * Generate a table name used for the schema test.
+ */
+static void
+gen_table_name(char *buf, size_t buf_size, uint64_t id, uint32_t threadid)
+{
+ testutil_check(__wt_snprintf(buf, buf_size,
+ "table:A%" PRIu64 "-%" PRIu32, id, threadid));
+}
+
+/*
+ * gen_table2_name --
+ * Generate a second table name used for the schema test.
+ */
+static void
+gen_table2_name(char *buf, size_t buf_size, uint64_t id, uint32_t threadid,
+ uint32_t flags)
+{
+ if (!LF_ISSET(SCHEMA_RENAME))
+ /* table is not renamed, so use original table name */
+ gen_table_name(buf, buf_size, id, threadid);
+ else
+ testutil_check(__wt_snprintf(buf, buf_size,
+ "table:B%" PRIu64 "-%" PRIu32, id, threadid));
+}
+
+static int
+schema_operation(WT_SESSION *session, uint32_t threadid, uint64_t id,
+ uint32_t op, uint32_t flags)
+{
+ WT_CURSOR *cursor;
+ int ret;
+ const char *retry_opname;
+ char uri1[50], uri2[50];
+
+ if (!has_schema_operation(id, op))
+ return (0);
+
+ id -= op;
+ ret = 0;
+ retry_opname = NULL;
+
+ switch (op) {
+ case 0:
+ /* Create a table. */
+ gen_table_name(uri1, sizeof(uri1), id, threadid);
+ /*
+ fprintf(stderr, "CREATE: %s\n", uri1);
+ */
+ testutil_check(session->create(session, uri1,
+ "key_format=S,value_format=S"));
+ break;
+ case 1:
+ /* Insert a value into the table. */
+ gen_table_name(uri1, sizeof(uri1), id, threadid);
+ /*
+ fprintf(stderr, "INSERT: %s\n", uri1);
+ */
+ testutil_check(session->open_cursor(
+ session, uri1, NULL, NULL, &cursor));
+ cursor->set_key(cursor, uri1);
+ cursor->set_value(cursor, uri1);
+ testutil_check(cursor->insert(cursor));
+ cursor->close(cursor);
+ break;
+ case 2:
+ /* Rename the table. */
+ if (LF_ISSET(SCHEMA_RENAME)) {
+ gen_table_name(uri1, sizeof(uri1), id, threadid);
+ gen_table2_name(uri2, sizeof(uri2), id, threadid,
+ flags);
+ retry_opname = "rename";
+ /*
+ fprintf(stderr, "RENAME: %s->%s\n", uri1, uri2);
+ */
+ ret = session->rename(session, uri1, uri2, NULL);
+ }
+ break;
+ case 3:
+ /* Update the single value in the table. */
+ gen_table_name(uri1, sizeof(uri1), id, threadid);
+ gen_table2_name(uri2, sizeof(uri2), id, threadid, flags);
+ testutil_check(session->open_cursor(session,
+ uri2, NULL, NULL, &cursor));
+ cursor->set_key(cursor, uri1);
+ cursor->set_value(cursor, uri2);
+ /*
+ fprintf(stderr, "UPDATE: %s\n", uri2);
+ */
+ testutil_check(cursor->update(cursor));
+ cursor->close(cursor);
+ break;
+ case 4:
+ /* Drop the table. */
+ if (LF_ISSET(SCHEMA_DROP)) {
+ gen_table2_name(uri1, sizeof(uri1), id, threadid,
+ flags);
+ retry_opname = "drop";
+ /*
+ fprintf(stderr, "DROP: %s\n", uri1);
+ */
+ ret = session->drop(session, uri1, NULL);
+ }
+ }
+ /*
+ * XXX
+ * We notice occasional EBUSY errors from
+ * rename or drop, even though neither URI should be
+ * used by any other thread. Report it, and retry.
+ */
+ if (retry_opname != NULL && ret == EBUSY)
+ printf("%s(\"%s\", ....) failed, retrying transaction\n",
+ retry_opname, uri1);
+ else if (ret != 0) {
+ printf("FAIL: %s(\"%s\", ....) returns %d: %s\n",
+ retry_opname, uri1, ret, wiredtiger_strerror(ret));
+ testutil_check(ret);
+ }
+
+ return (ret);
+}
+
+/*
+ * thread_run --
+ * Run a writer thread.
+ */
+static WT_THREAD_RET thread_run(void *)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+static WT_THREAD_RET
+thread_run(void *arg)
+{
+ WT_CURSOR *cursor, *rev;
+ WT_RAND_STATE rnd;
+ WT_SESSION *session;
+ WT_THREAD_DATA *td;
+ size_t lsize;
+ uint64_t i;
+ uint32_t kvsize, op;
+ int ret;
+ char *buf1, *buf2;
+ char large[LARGE_WRITE_SIZE];
+
+ __wt_random_init(&rnd);
+ lsize = sizeof(large);
+ memset(large, 0, lsize);
+
+ td = (WT_THREAD_DATA *)arg;
+ large_buf(large, lsize, td->id, true);
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+ testutil_check(session->open_cursor(session, uri_main, NULL, NULL,
+ &cursor));
+ testutil_check(session->open_cursor(session, uri_rev, NULL, NULL,
+ &rev));
+
+ /*
+ * Split the allocated buffer into two parts, one for
+ * the key, one for the value.
+ */
+ kvsize = td->datasize / 2;
+ buf1 = td->data;
+ buf2 = &td->data[kvsize];
+
+ /*
+ * Continuing writing until we're killed.
+ */
+ printf("Thread %" PRIu32 "\n", td->id);
+ for (i = 0; ; ++i) {
+again:
+ /*
+ if (i > 0 && i % 10000 == 0)
+ printf("Thread %d completed %d entries\n",
+ (int)td->id, (int)i);
+ */
+
+ gen_kv(buf1, kvsize, i, td->id, large, true);
+ gen_kv(buf2, kvsize, i, td->id, large, false);
+
+ testutil_check(session->begin_transaction(session, NULL));
+ cursor->set_key(cursor, buf1);
+ /*
+ * Every 1000th record write a very large value that exceeds the
+ * log buffer size. This forces us to use the unbuffered path.
+ */
+ if (i % 1000 == 0) {
+ cursor->set_value(cursor, large);
+ } else {
+ cursor->set_value(cursor, buf2);
+ }
+ testutil_check(cursor->insert(cursor));
+
+ /*
+ * The reverse table has no very large records.
+ */
+ rev->set_key(rev, buf2);
+ rev->set_value(rev, buf1);
+ testutil_check(rev->insert(rev));
+
+ /*
+ * If we are doing a schema test, generate operations
+ * for additional tables. Each table has a 'lifetime'
+ * of 4 values of the id.
+ */
+ if (F_ISSET(td, SCHEMA_ALL)) {
+ /* Create is implied by any schema operation. */
+ testutil_assert(F_ISSET(td, SCHEMA_CREATE));
+
+ /*
+ * Any or all of the schema operations may be
+ * performed as part of this transaction.
+ * See the comment for schema operation frequency.
+ */
+ ret = 0;
+ for (op = 0; op <= 4 && ret == 0; op++)
+ ret = schema_operation(session, td->id, i, op,
+ td->flags);
+ if (ret == EBUSY) {
+ testutil_check(session->rollback_transaction(
+ session, NULL));
+ sleep(1);
+ goto again;
+ }
+ }
+ testutil_check(session->commit_transaction(session, NULL));
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * fill_db --
+ * The child process creates the database and table, and then creates
+ * worker threads to add data until it is killed by the parent.
+ */
+static void fill_db(uint32_t, uint32_t, const char *, uint32_t)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+static void
+ fill_db(uint32_t nth, uint32_t datasize, const char *method, uint32_t flags)
+{
+ WT_CONNECTION *conn;
+ WT_SESSION *session;
+ WT_THREAD_DATA *td;
+ wt_thread_t *thr;
+ uint32_t i;
+ char envconf[512];
+
+ thr = dcalloc(nth, sizeof(*thr));
+ td = dcalloc(nth, sizeof(WT_THREAD_DATA));
+ if (chdir(home) != 0)
+ testutil_die(errno, "Child chdir: %s", home);
+ testutil_check(__wt_snprintf(envconf, sizeof(envconf),
+ ENV_CONFIG, method));
+
+ testutil_check(wiredtiger_open(".", NULL, envconf, &conn));
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ testutil_check(session->create(
+ session, uri_main, "key_format=S,value_format=S"));
+ testutil_check(session->create(
+ session, uri_rev, "key_format=S,value_format=S"));
+ /*
+ * Checkpoint to help ensure that at least the main tables
+ * can be opened after recovery.
+ */
+ testutil_check(session->checkpoint(session, NULL));
+ testutil_check(session->close(session, NULL));
+
+ datasize += 1; /* Add an extra byte for string termination */
+ printf("Create %" PRIu32 " writer threads\n", nth);
+ for (i = 0; i < nth; ++i) {
+ td[i].conn = conn;
+ td[i].data = dcalloc(datasize, 1);
+ td[i].datasize = datasize;
+ td[i].id = i;
+ td[i].flags = flags;
+ testutil_check(__wt_thread_create(
+ NULL, &thr[i], thread_run, &td[i]));
+ }
+ printf("Spawned %" PRIu32 " writer threads\n", nth);
+ fflush(stdout);
+ /*
+ * The threads never exit, so the child will just wait here until
+ * it is killed.
+ */
+ for (i = 0; i < nth; ++i) {
+ testutil_check(__wt_thread_join(NULL, &thr[i]));
+ free(td[i].data);
+ }
+ /*
+ * NOTREACHED
+ */
+ free(thr);
+ free(td);
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * check_kv --
+ * Check that a key exists with a value, or does not exist.
+ */
+static void
+check_kv(WT_CURSOR *cursor, const char *key, const char *value, bool exists)
+{
+ int ret;
+ char *got;
+
+ cursor->set_key(cursor, key);
+ ret = cursor->search(cursor);
+ if ((ret = cursor->search(cursor)) == WT_NOTFOUND) {
+ if (exists) {
+ printf("FAIL: expected rev file to have: %s\n", key);
+ testutil_assert(!exists);
+ }
+ } else {
+ testutil_check(ret);
+ if (!exists) {
+ printf("FAIL: unexpected key in rev file: %s\n", key);
+ testutil_assert(exists);
+ }
+ cursor->get_value(cursor, &got);
+ TEST_STREQ(value, got, "value");
+ }
+}
+
+/*
+ * check_dropped --
+ * Check that the uri has been dropped.
+ */
+static void
+check_dropped(WT_SESSION *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ int ret;
+
+ ret = session->open_cursor(session, uri, NULL, NULL, &cursor);
+ testutil_assert(ret == WT_NOTFOUND);
+}
+
+/*
+ * check_empty --
+ * Check that the uri exists and is empty.
+ */
+static void
+check_empty(WT_SESSION *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ int ret;
+
+ testutil_check(session->open_cursor(session, uri, NULL, NULL, &cursor));
+ ret = cursor->next(cursor);
+ testutil_assert(ret == WT_NOTFOUND);
+ testutil_check(cursor->close(cursor));
+}
+
+/*
+ * check_empty --
+ * Check that the uri exists and has one entry.
+ */
+static void
+check_one_entry(WT_SESSION *session, const char *uri, const char *key,
+ const char *value)
+{
+ WT_CURSOR *cursor;
+ int ret;
+ char *gotkey, *gotvalue;
+
+ testutil_check(session->open_cursor(session, uri, NULL, NULL, &cursor));
+ testutil_check(cursor->next(cursor));
+ cursor->get_key(cursor, &gotkey);
+ cursor->get_value(cursor, &gotvalue);
+ testutil_assert(WT_STREQ(key, gotkey));
+ testutil_assert(WT_STREQ(value, gotvalue));
+ ret = cursor->next(cursor);
+ testutil_assert(ret == WT_NOTFOUND);
+ testutil_check(cursor->close(cursor));
+}
+
+/*
+ * check_schema
+ * Check that the database has the expected schema according to the
+ * last id seen for this thread.
+ */
+static void
+check_schema(WT_SESSION *session, uint64_t lastid, uint32_t threadid,
+ uint32_t flags)
+{
+ char uri[50], uri2[50];
+
+ if (!LF_ISSET(SCHEMA_ALL))
+ return;
+
+ if (LF_ISSET(SCHEMA_VERBOSE))
+ fprintf(stderr, "check_schema(%d, thread=%d)\n",
+ (int)lastid, (int)threadid);
+ if (has_schema_operation(lastid, 0)) {
+ /* Create table operation. */
+ gen_table_name(uri, sizeof(uri), lastid, threadid);
+ if (LF_ISSET(SCHEMA_VERBOSE))
+ fprintf(stderr, " create %s\n", uri);
+ if (LF_ISSET(SCHEMA_CREATE_CHECK))
+ check_empty(session, uri);
+ }
+ if (has_schema_operation(lastid, 1)) {
+ /* Insert value operation. */
+ gen_table_name(uri, sizeof(uri), lastid - 1, threadid);
+ if (LF_ISSET(SCHEMA_VERBOSE))
+ fprintf(stderr, " insert %s\n", uri);
+ if (LF_ISSET(SCHEMA_DATA_CHECK))
+ check_one_entry(session, uri, uri, uri);
+ }
+ if (LF_ISSET(SCHEMA_RENAME) && has_schema_operation(lastid, 2)) {
+ /* Table rename operation. */
+ gen_table_name(uri, sizeof(uri), lastid - 2, threadid);
+ gen_table2_name(uri2, sizeof(uri2), lastid - 2, threadid,
+ flags);
+ if (LF_ISSET(SCHEMA_VERBOSE))
+ fprintf(stderr, " rename %s,%s\n", uri, uri2);
+ if (LF_ISSET(SCHEMA_DROP_CHECK))
+ check_dropped(session, uri);
+ if (LF_ISSET(SCHEMA_CREATE_CHECK))
+ check_one_entry(session, uri2, uri, uri);
+ }
+ if (has_schema_operation(lastid, 3)) {
+ /* Value update operation. */
+ gen_table_name(uri, sizeof(uri), lastid - 2, threadid);
+ gen_table2_name(uri2, sizeof(uri2), lastid - 2, threadid,
+ flags);
+ if (LF_ISSET(SCHEMA_VERBOSE))
+ fprintf(stderr, " update %s\n", uri2);
+ if (LF_ISSET(SCHEMA_DATA_CHECK))
+ check_one_entry(session, uri2, uri, uri2);
+ }
+ if (LF_ISSET(SCHEMA_DROP_CHECK) && has_schema_operation(lastid, 4)) {
+ /* Drop table operation. */
+ gen_table2_name(uri2, sizeof(uri2), lastid - 2, threadid,
+ flags);
+ if (LF_ISSET(SCHEMA_VERBOSE))
+ fprintf(stderr, " drop %s\n", uri2);
+ check_dropped(session, uri2);
+ }
+}
+
+/*
+ * check_db --
+ * Make a copy of the database and verify its contents.
+ */
+static bool
+check_db(uint32_t nth, uint32_t datasize, bool directio, uint32_t flags)
+{
+ struct sigaction sa;
+ WT_CONNECTION *conn;
+ WT_CURSOR *cursor, *meta, *rev;
+ WT_SESSION *session;
+ uint64_t gotid, id;
+ uint64_t *lastid;
+ uint32_t gotth, kvsize, th, threadmap;
+ int ret, status;
+ char buf[4096];
+ char *gotkey, *gotvalue, *keybuf, *p;
+ char **large_arr;
+
+ keybuf = dcalloc(datasize, 1);
+ lastid = dcalloc(nth, sizeof(uint64_t));
+
+ large_arr = dcalloc(nth, sizeof(char *));
+ for (th = 0; th < nth; th++) {
+ large_arr[th] = dcalloc(LARGE_WRITE_SIZE, 1);
+ large_buf(large_arr[th], LARGE_WRITE_SIZE, th, true);
+ }
+
+ /*
+ * We make a copy of the directory (possibly using direct IO)
+ * for recovery and checking, and an identical copy that
+ * keeps the state of all files before recovery starts.
+ */
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "H='%s'; C=$H.CHECK; S=$H.SAVE; rm -rf $C $S;"
+ " mkdir $C; for f in `ls $H/`; do "
+ " dd if=$H/$f of=$C/$f bs=4096 %s >/dev/null 2>&1 || exit 1; done;"
+ " cp -pr $C $S",
+ home, directio ? "iflag=direct" : ""));
+ printf(
+ "Copy database home directory using direct I/O to run recovery,\n"
+ "along with a saved 'pre-recovery' copy.\n");
+ printf("Shell command: %s\n", buf);
+
+ /* Temporarily turn off the child handler while running 'system' */
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = SIG_DFL;
+ testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
+ if ((status = system(buf)) < 0)
+ testutil_die(status, "system: %s", buf);
+ sa.sa_handler = handler;
+ testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
+
+ testutil_check(__wt_snprintf(buf, sizeof(buf), "%s.CHECK", home));
+
+ printf("Open database, run recovery and verify content\n");
+ testutil_check(wiredtiger_open(buf, NULL, ENV_CONFIG_REC, &conn));
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ testutil_check(session->open_cursor(session, uri_main, NULL, NULL,
+ &cursor));
+ testutil_check(session->open_cursor(session, uri_rev, NULL, NULL,
+ &rev));
+ kvsize = datasize / 2;
+
+ /*
+ * We're most interested in the final records on disk.
+ * Rather than walk all records, we do a quick scan
+ * to find the last complete set of written ids.
+ * Each thread writes each id, along with the thread id,
+ * so they are interleaved. Once we have the neighborhood
+ * where some keys may be missing, we'll back up to do a scan
+ * from that point.
+ */
+#define CHECK_INCR 1000
+ for (id = 0; ; id += CHECK_INCR) {
+ gen_kv(keybuf, kvsize, id, 0, large_arr[0], true);
+ cursor->set_key(cursor, keybuf);
+ if ((ret = cursor->search(cursor)) == WT_NOTFOUND)
+ break;
+ testutil_check(ret);
+ for (th = 1; th < nth; th++) {
+ gen_kv(keybuf, kvsize, id, th, large_arr[th], true);
+ cursor->set_key(cursor, keybuf);
+ if ((ret = cursor->search(cursor)) == WT_NOTFOUND)
+ break;
+ testutil_check(ret);
+ }
+ if (ret == WT_NOTFOUND)
+ break;
+ }
+ if (id < CHECK_INCR * 2)
+ id = 0;
+ else
+ id -= CHECK_INCR * 2;
+
+ printf("starting full scan at %" PRIu64 "\n", id);
+ gen_kv(keybuf, kvsize, id, 0, large_arr[0], true);
+ cursor->set_key(cursor, keybuf);
+ testutil_check(cursor->search(cursor));
+ th = 0;
+
+ /* Keep bitmap of "active" threads. */
+ threadmap = (0x1U << nth) - 1;
+ for (ret = 0; ret != WT_NOTFOUND && threadmap != 0;
+ ret = cursor->next(cursor)) {
+ testutil_check(ret);
+ cursor->get_key(cursor, &gotkey);
+ gotid = (uint64_t)strtol(gotkey, &p, 10);
+ testutil_assert(*p == KEY_SEP[0]);
+ p++;
+ testutil_assert(isxdigit(*p));
+ if (isdigit(*p))
+ gotth = (uint32_t)(*p - '0');
+ else if (*p >= 'a' && *p <= 'f')
+ gotth = (uint32_t)(*p - 'a' + 10);
+ else
+ gotth = (uint32_t)(*p - 'A' + 10);
+ p++;
+ testutil_assert(*p == KEY_SEP[0]);
+ p++;
+
+ /*
+ * See if the expected thread has finished at this point.
+ * If so, remove it from the thread map.
+ */
+ while (gotth != th) {
+ if ((threadmap & (0x1U << th)) != 0) {
+ threadmap &= ~(0x1U << th);
+ lastid[th] = id - 1;
+ /*
+ * Any newly removed value in the main table
+ * should not be present as a key in the
+ * reverse table, since they were
+ * transactionally inserted at the same time.
+ */
+ gen_kv(keybuf, kvsize, id, th, large_arr[th],
+ false);
+ check_kv(rev, keybuf, NULL, false);
+ check_schema(session, id - 1, th, flags);
+ }
+ th = (th + 1) % nth;
+ if (th == 0)
+ id++;
+ }
+ testutil_assert(gotid == id);
+ /*
+ * Check that the key and value fully match.
+ */
+ gen_kv(keybuf, kvsize, id, th, large_arr[th], true);
+ gen_kv(&keybuf[kvsize], kvsize, id, th, large_arr[th], false);
+ cursor->get_value(cursor, &gotvalue);
+ TEST_STREQ(keybuf, gotkey, "main table key");
+
+ /*
+ * Every 1000th record is large.
+ */
+ if (id % 1000 == 0)
+ TEST_STREQ(large_arr[th], gotvalue,
+ "main table large value");
+ else
+ TEST_STREQ(&keybuf[kvsize], gotvalue,
+ "main table value");
+
+ /*
+ * Check the reverse file, with key/value reversed.
+ */
+ check_kv(rev, &keybuf[kvsize], keybuf, true);
+
+ check_schema(session, id, th, flags);
+
+ /* Bump thread number and id to the next expected key. */
+ th = (th + 1) % nth;
+ if (th == 0)
+ id++;
+ }
+ printf("scanned to %" PRIu64 "\n", id);
+
+ if (LF_ISSET(SCHEMA_ALL)) {
+ /*
+ * Check metadata to see if there are any tables
+ * present that shouldn't be there.
+ */
+ testutil_check(session->open_cursor(session, "metadata:", NULL,
+ NULL, &meta));
+ while ((ret = meta->next(meta)) != WT_NOTFOUND) {
+ testutil_check(ret);
+ meta->get_key(meta, &gotkey);
+ /*
+ * Names involved in schema testing are of the form:
+ * table:Axxx-t
+ * table:Bxxx-t
+ * xxx corresponds to the id inserted into the main
+ * table when the table was created, and t corresponds
+ * to the thread id that did this.
+ */
+ if (WT_PREFIX_SKIP(gotkey, "table:") &&
+ (*gotkey == 'A' || *gotkey == 'B')) {
+ gotid = (uint64_t)strtol(gotkey + 1, &p, 10);
+ testutil_assert(*p == '-');
+ th = (uint32_t)strtol(p + 1, &p, 10);
+ testutil_assert(*p == '\0');
+ /*
+ * If table operations are truly
+ * transactional, then there shouldn't
+ * be any extra files that unaccounted for.
+ */
+ if (LF_ISSET(SCHEMA_DROP_CHECK))
+ testutil_assert(gotid == lastid[th]);
+ }
+ }
+ testutil_check(meta->close(meta));
+
+ }
+
+ testutil_check(cursor->close(cursor));
+ testutil_check(rev->close(rev));
+ testutil_check(session->close(session, NULL));
+ testutil_check(conn->close(conn, NULL));
+
+ for (th = 0; th < nth; th++)
+ free(large_arr[th]);
+ free(large_arr);
+ free(keybuf);
+ free(lastid);
+ return (true);
+}
+
+/*
+ * handler --
+ * Child signal handler
+ */
+static void
+handler(int sig)
+{
+ pid_t pid;
+ int status, termsig;
+
+ WT_UNUSED(sig);
+ pid = waitpid(-1, &status, WNOHANG|WUNTRACED);
+ if (pid == 0)
+ return; /* Nothing to wait for. */
+ if (WIFSTOPPED(status))
+ return;
+ if (WIFSIGNALED(status)) {
+ termsig = WTERMSIG(status);
+ if (termsig == SIGCONT || termsig == SIGSTOP)
+ return;
+ printf("Child got signal %d (status = %d, 0x%x)\n",
+ termsig, status, (unsigned int)status);
+#ifdef WCOREDUMP
+ if (WCOREDUMP(status))
+ printf("Child process id=%d created core file\n", pid);
+#endif
+ }
+
+ /*
+ * The core file will indicate why the child exited. Choose EINVAL here.
+ */
+ testutil_die(EINVAL,
+ "Child process %" PRIu64 " abnormally exited, status=%d (0x%x)",
+ (uint64_t)pid, status, status);
+}
+
+/*
+ * has_direct_io --
+ * Check for direct I/O support.
+ */
+static bool
+has_direct_io(void)
+{
+#ifdef O_DIRECT
+ return (true);
+#else
+ return (false);
+#endif
+}
+
+/*
+ * main --
+ * Top level test.
+ */
+int
+main(int argc, char *argv[])
+{
+ struct sigaction sa;
+ struct stat sb;
+ WT_RAND_STATE rnd;
+ pid_t pid;
+ size_t size;
+ uint32_t datasize, flags, i, interval, ncycles, nth, timeout;
+ int ch, status;
+ const char *method, *working_dir;
+ char *arg, *p;
+ char args[1024], buf[1024];
+ bool populate_only, rand_th, rand_time, verify_only;
+
+ (void)testutil_set_progname(argv);
+
+ datasize = DEFAULT_DATA_SIZE;
+ nth = MIN_TH;
+ ncycles = DEFAULT_CYCLES;
+ rand_th = rand_time = true;
+ timeout = MIN_TIME;
+ interval = DEFAULT_INTERVAL;
+ flags = 0;
+ populate_only = verify_only = false;
+ working_dir = "WT_TEST.random-directio";
+ method = "none";
+ pid = 0;
+ WT_CLEAR(args);
+
+ if (!has_direct_io()) {
+ fprintf(stderr, "**** test_random_directio: this system does "
+ "not support direct I/O.\n**** Skipping test.\n");
+ return (EXIT_SUCCESS);
+ }
+ for (i = 0, p = args; i < (uint32_t)argc; i++) {
+ testutil_check(__wt_snprintf_len_set(p,
+ sizeof(args) - (size_t)(p - args), &size, " %s",
+ argv[i]));
+ p += size;
+ }
+ while ((ch = __wt_getopt(progname, argc, argv,
+ "d:h:i:m:n:pS:T:t:v")) != EOF)
+ switch (ch) {
+ case 'd':
+ datasize = (uint32_t)atoi(__wt_optarg);
+ if (datasize > LARGE_WRITE_SIZE ||
+ datasize < MIN_DATA_SIZE) {
+ fprintf(stderr,
+ "-d value is larger than maximum %"
+ PRId32 "\n",
+ LARGE_WRITE_SIZE);
+ return (EXIT_FAILURE);
+ }
+ break;
+ case 'h':
+ working_dir = __wt_optarg;
+ break;
+ case 'i':
+ interval = (uint32_t)atoi(__wt_optarg);
+ break;
+ case 'm':
+ method = __wt_optarg;
+ if (!WT_STREQ(method, "fsync") &&
+ !WT_STREQ(method, "dsync") &&
+ !WT_STREQ(method, "none")) {
+ fprintf(stderr,
+ "-m option requires fsync|dsync|none\n");
+ return (EXIT_FAILURE);
+ }
+ break;
+ case 'n':
+ ncycles = (uint32_t)atoi(__wt_optarg);
+ break;
+ case 'p':
+ populate_only = true;
+ break;
+ case 'S':
+ p = __wt_optarg;
+ while ((arg = strtok_r(p, ",", &p)) != NULL) {
+ if (WT_STREQ(arg, "all"))
+ LF_SET(SCHEMA_ALL);
+ else if (WT_STREQ(arg, "create"))
+ LF_SET(SCHEMA_CREATE);
+ else if (WT_STREQ(arg, "create_check"))
+ LF_SET(SCHEMA_CREATE_CHECK);
+ else if (WT_STREQ(arg, "data_check"))
+ LF_SET(SCHEMA_DATA_CHECK);
+ else if (WT_STREQ(arg, "drop"))
+ LF_SET(SCHEMA_DROP);
+ else if (WT_STREQ(arg, "drop_check"))
+ LF_SET(SCHEMA_DROP_CHECK);
+ else if (WT_STREQ(arg, "none"))
+ flags = 0;
+ else if (WT_STREQ(arg, "rename"))
+ LF_SET(SCHEMA_RENAME);
+ else if (WT_STREQ(arg, "verbose"))
+ LF_SET(SCHEMA_VERBOSE);
+ else {
+ fprintf(stderr,
+ "Unknown -S arg '%s'\n", arg);
+ usage();
+ }
+ }
+ break;
+ case 'T':
+ rand_th = false;
+ nth = (uint32_t)atoi(__wt_optarg);
+ break;
+ case 't':
+ rand_time = false;
+ timeout = (uint32_t)atoi(__wt_optarg);
+ break;
+ case 'v':
+ verify_only = true;
+ break;
+ default:
+ usage();
+ }
+ argc -= __wt_optind;
+ if (argc != 0)
+ usage();
+
+ testutil_work_dir_from_path(home, sizeof(home), working_dir);
+ /*
+ * If the user wants to verify they need to tell us how many threads
+ * there were so we know what records we can expect.
+ */
+ if (verify_only && rand_th) {
+ fprintf(stderr,
+ "Verify option requires specifying number of threads\n");
+ return (EXIT_FAILURE);
+ }
+ if ((LF_ISSET(SCHEMA_RENAME|SCHEMA_DROP|SCHEMA_CREATE_CHECK|
+ SCHEMA_DATA_CHECK) &&
+ !LF_ISSET(SCHEMA_CREATE)) ||
+ (LF_ISSET(SCHEMA_DROP_CHECK) &&
+ !LF_ISSET(SCHEMA_DROP))) {
+ fprintf(stderr, "Schema operations incompatible\n");
+ usage();
+ }
+ printf("CONFIG:%s\n", args);
+ if (!verify_only) {
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "rm -rf %s", home));
+ if ((status = system(buf)) < 0)
+ testutil_die(status, "system: %s", buf);
+ testutil_make_work_dir(home);
+
+ __wt_random_init_seed(NULL, &rnd);
+ if (rand_time) {
+ timeout = __wt_random(&rnd) % MAX_TIME;
+ if (timeout < MIN_TIME)
+ timeout = MIN_TIME;
+ }
+ if (rand_th) {
+ nth = __wt_random(&rnd) % MAX_TH;
+ if (nth < MIN_TH)
+ nth = MIN_TH;
+ }
+ printf("Parent: Create %" PRIu32
+ " threads; sleep %" PRIu32 " seconds\n", nth, timeout);
+
+ if (!populate_only) {
+ /*
+ * Fork a child to insert as many items. We will
+ * then randomly suspend the child, run recovery and
+ * make sure all items we wrote exist after recovery
+ * runs.
+ */
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = handler;
+ testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
+ if ((pid = fork()) < 0)
+ testutil_die(errno, "fork");
+ }
+ if (pid == 0) { /* child, or populate_only */
+ fill_db(nth, datasize, method, flags);
+ return (EXIT_SUCCESS);
+ }
+
+ /* parent */
+ /*
+ * Sleep for the configured amount of time before killing
+ * the child. Start the timeout from the time we notice that
+ * the table has been created. That allows the test to run
+ * correctly on really slow machines.
+ */
+ testutil_check(__wt_snprintf(
+ buf, sizeof(buf), "%s/%s", home, fs_main));
+ while (stat(buf, &sb) != 0 || sb.st_size < 4096)
+ testutil_sleep_wait(1, pid);
+ testutil_sleep_wait(timeout, pid);
+
+ /*
+ * Begin our cycles of suspend, copy, recover.
+ */
+ for (i = 0; i < ncycles; i++) {
+ printf("Beginning cycle %" PRIu32 "/%" PRIu32 "\n",
+ i + 1, ncycles);
+ if (i != 0)
+ testutil_sleep_wait(interval, pid);
+ printf("Suspend child\n");
+ if (kill(pid, SIGSTOP) != 0)
+ testutil_die(errno, "kill");
+ printf("Check DB\n");
+ fflush(stdout);
+ if (!check_db(nth, datasize, true, flags))
+ return (EXIT_FAILURE);
+ if (kill(pid, SIGCONT) != 0)
+ testutil_die(errno, "kill");
+ printf("\n");
+ }
+
+ printf("Kill child\n");
+ sa.sa_handler = SIG_DFL;
+ testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
+ if (kill(pid, SIGKILL) != 0)
+ testutil_die(errno, "kill");
+ if (waitpid(pid, &status, 0) == -1)
+ testutil_die(errno, "waitpid");
+ }
+ if (verify_only && !check_db(nth, datasize, false, flags)) {
+ printf("FAIL\n");
+ return (EXIT_FAILURE);
+ }
+ printf("SUCCESS\n");
+ return (EXIT_SUCCESS);
+}
diff --git a/src/third_party/wiredtiger/test/csuite/random_directio/smoke.sh b/src/third_party/wiredtiger/test/csuite/random_directio/smoke.sh
new file mode 100755
index 00000000000..0ba243bd0f8
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/random_directio/smoke.sh
@@ -0,0 +1,34 @@
+#! /bin/sh
+
+set -e
+
+# Smoke-test random_directio as part of running "make check".
+
+RUN_TEST_CMD="$TEST_WRAPPER ./test_random_directio"
+
+# Disabled for now until we fix issues encountered via the test
+exit 0
+
+# Replace for more complete testing
+#TEST_THREADS="1 5 10"
+TEST_THREADS="5"
+
+# Replace for more complete testing
+#TEST_METHODS="none dsync fsync"
+TEST_METHODS="none"
+
+for threads in $TEST_THREADS; do
+ for method in $TEST_METHODS; do
+ RUN_TEST="$RUN_TEST_CMD -t 5 -m $method"
+ $RUN_TEST -T $threads || exit 1
+ $RUN_TEST -T $threads -S create,drop,verbose || exit 1
+
+ # Here are successively tougher schema tests that do not yet
+ # reliably pass. 'verbose' can be added to any.
+ #$RUN_TEST -T $threads -S create,create_check || exit 1
+ #$RUN_TEST -T $threads -S create,drop,drop_check || exit 1
+ #$RUN_TEST -T $threads -S create,rename || exit 1
+ #$RUN_TEST -T $threads -S create,rename,drop_check || exit 1
+ #$RUN_TEST -T $threads -S all,verbose || exit 1
+ done
+done
diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
new file mode 100644
index 00000000000..79832199bf7
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
@@ -0,0 +1,1355 @@
+/*-
+ * Public Domain 2014-2018 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "test_util.h"
+
+#include <sys/wait.h>
+#include <signal.h>
+
+static char home[1024]; /* Program working dir */
+
+/*
+ * Create three tables that we will write the same data to and verify that
+ * all the types of usage have the expected data in them after a crash and
+ * recovery. We want:
+ * 1. A table that is logged and is not involved in timestamps. This table
+ * simulates a user local table.
+ * 2. A table that is logged and involved in timestamps. This simulates
+ * the oplog.
+ * 3. A table that is not logged and involved in timestamps. This simulates
+ * a typical collection file.
+ *
+ * We also have most threads perform schema operations such as create/drop.
+ *
+ * We also create several files that are not WiredTiger tables. The checkpoint
+ * thread creates a file indicating that a checkpoint has completed. The parent
+ * process uses this to know when at least one checkpoint is done and it can
+ * start the timer to abort.
+ *
+ * Each worker thread creates its own records file that records the data it
+ * inserted and it records the timestamp that was used for that insertion.
+ */
+#define INVALID_KEY UINT64_MAX
+#define MAX_CKPT_INVL 2 /* Maximum interval between checkpoints */
+/* Set large, some slow I/O systems take tens of seconds to fsync. */
+#define MAX_STARTUP 30 /* Seconds to start up and set stable */
+#define MAX_TH 12
+#define MAX_TIME 40
+#define MAX_VAL 1024
+#define MIN_TH 5
+#define MIN_TIME 10
+#define PREPARE_FREQ 5
+#define PREPARE_YIELD (PREPARE_FREQ * 10)
+#define RECORDS_FILE "records-%" PRIu32
+#define STABLE_PERIOD 100
+
+static const char * const uri = "table:wt";
+static const char * const uri_local = "table:local";
+static const char * const uri_oplog = "table:oplog";
+static const char * const uri_collection = "table:collection";
+
+static const char * const ckpt_file = "checkpoint_done";
+
+static bool compat, inmem, stable_set, use_ts, use_txn;
+static volatile uint64_t global_ts = 1;
+static volatile uint64_t uid = 1;
+typedef struct {
+ uint64_t ts;
+ const char *op;
+} THREAD_TS;
+static volatile THREAD_TS th_ts[MAX_TH];
+
+#define ENV_CONFIG_COMPAT ",compatibility=(release=\"2.9\")"
+#define ENV_CONFIG_DEF \
+ "create,log=(archive=false,file_max=10M,enabled)"
+#define ENV_CONFIG_TXNSYNC \
+ "create,log=(archive=false,file_max=10M,enabled)," \
+ "transaction_sync=(enabled,method=none)"
+#define ENV_CONFIG_REC "log=(archive=false,recover=on)"
+
+typedef struct {
+ uint64_t absent_key; /* Last absent key */
+ uint64_t exist_key; /* First existing key after miss */
+ uint64_t first_key; /* First key in range */
+ uint64_t first_miss; /* First missing key */
+ uint64_t last_key; /* Last key in range */
+} REPORT;
+
+typedef struct {
+ WT_CONNECTION *conn;
+ uint64_t start;
+ uint32_t info;
+ const char *op;
+} THREAD_DATA;
+
+#define NOOP "noop"
+#define BULK "bulk"
+#define BULK_UNQ "bulk_unique"
+#define CREATE "create"
+#define CREATE_UNQ "create_unique"
+#define CURSOR "cursor"
+#define DROP "drop"
+#define REBALANCE "rebalance"
+#define UPGRADE "upgrade"
+#define VERIFY "verify"
+
+static void sig_handler(int)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+static void usage(void)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+static void
+usage(void)
+{
+ fprintf(stderr,
+ "usage: %s [-h dir] [-T threads] [-t time] [-Cmvxz]\n", progname);
+ exit(EXIT_FAILURE);
+}
+
+static const char * const config = NULL;
+
+/*
+ * subtest_error_handler --
+ * Error event handler.
+ */
+static int
+subtest_error_handler(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *message)
+{
+ (void)(handler);
+ (void)(session);
+ (void)(error);
+
+ /* Filter out errors about bulk load usage - they are annoying */
+ if (strstr(message, "bulk-load is only supported on newly") == NULL)
+ fprintf(stderr, "%s", message);
+ return (0);
+}
+
+static WT_EVENT_HANDLER event_handler = {
+ subtest_error_handler,
+ NULL, /* Message handler */
+ NULL, /* Progress handler */
+ NULL /* Close handler */
+};
+
+/*
+ * The following are various schema-related functions to have some threads
+ * performing during the test. The goal is to make sure that after a random
+ * abort, the database is left in a recoverable state. Yield during the
+ * schema operations to increase chance of abort during them.
+ *
+ * TODO: Currently only verifies insert data, it would be ideal to modify the
+ * schema operations so that we can verify the state of the schema too.
+ */
+
+static void
+dump_ts(uint64_t nth)
+{
+ uint64_t i;
+
+ for (i = 0; i < nth; ++i)
+ fprintf(stderr, "THREAD %" PRIu64 ": ts: %" PRIu64
+ " op %s\n", i, th_ts[i].ts, th_ts[i].op);
+}
+
+/*
+ * test_bulk --
+ * Test creating a bulk cursor.
+ */
+static void
+test_bulk(THREAD_DATA *td)
+{
+ WT_CURSOR *c;
+ WT_SESSION *session;
+ int ret;
+ bool create;
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
+ create = false;
+ if ((ret = session->create(session, uri, config)) != 0)
+ if (ret != EEXIST && ret != EBUSY)
+ testutil_die(ret, "session.create");
+
+ if (ret == 0) {
+ create = true;
+ if ((ret = session->open_cursor(
+ session, uri, NULL, "bulk", &c)) == 0) {
+ __wt_yield();
+ testutil_check(c->close(c));
+ } else if (ret != ENOENT && ret != EBUSY && ret != EINVAL)
+ testutil_die(ret, "session.open_cursor bulk");
+ }
+
+ if (use_txn) {
+ /* If create fails, rollback else will commit.*/
+ if (!create)
+ ret = session->rollback_transaction(session, NULL);
+ else
+ ret = session->commit_transaction(session, NULL);
+
+ if (ret == EINVAL) {
+ fprintf(stderr, "BULK: EINVAL on %s. ABORT\n",
+ create ? "commit" : "rollback");
+ testutil_die(ret, "session.commit bulk");
+ }
+ }
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * test_bulk_unique --
+ * Test creating a bulk cursor with a unique name.
+ */
+static void
+test_bulk_unique(THREAD_DATA *td, int force)
+{
+ WT_CURSOR *c;
+ WT_SESSION *session;
+ uint64_t my_uid;
+ int ret;
+ char new_uri[64];
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ /* Generate a unique object name. */
+ my_uid = __wt_atomic_addv64(&uid, 1);
+ testutil_check(__wt_snprintf(
+ new_uri, sizeof(new_uri), "%s.%u", uri, my_uid));
+
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
+ testutil_check(session->create(session, new_uri, config));
+
+ __wt_yield();
+ /*
+ * Opening a bulk cursor may have raced with a forced checkpoint
+ * which created a checkpoint of the empty file, and triggers an EINVAL.
+ */
+ if ((ret = session->open_cursor(
+ session, new_uri, NULL, "bulk", &c)) == 0)
+ testutil_check(c->close(c));
+ else if (ret != EINVAL)
+ testutil_die(ret,
+ "session.open_cursor bulk unique: %s, new_uri");
+
+ while ((ret = session->drop(
+ session, new_uri, force ? "force" : NULL)) != 0)
+ if (ret != EBUSY)
+ testutil_die(ret, "session.drop: %s", new_uri);
+
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit bulk unique");
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * test_cursor --
+ * Open a cursor on a data source.
+ */
+static void
+test_cursor(THREAD_DATA *td)
+{
+ WT_CURSOR *cursor;
+ WT_SESSION *session;
+ int ret;
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
+ if ((ret =
+ session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) {
+ if (ret != ENOENT && ret != EBUSY)
+ testutil_die(ret, "session.open_cursor");
+ } else {
+ __wt_yield();
+ testutil_check(cursor->close(cursor));
+ }
+
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit cursor");
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * test_create --
+ * Create a table.
+ */
+static void
+test_create(THREAD_DATA *td)
+{
+ WT_SESSION *session;
+ int ret;
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
+ if ((ret = session->create(session, uri, config)) != 0)
+ if (ret != EEXIST && ret != EBUSY)
+ testutil_die(ret, "session.create");
+ __wt_yield();
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit create");
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * test_create_unique --
+ * Create a uniquely named table.
+ */
+static void
+test_create_unique(THREAD_DATA *td, int force)
+{
+ WT_SESSION *session;
+ uint64_t my_uid;
+ int ret;
+ char new_uri[64];
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ /* Generate a unique object name. */
+ my_uid = __wt_atomic_addv64(&uid, 1);
+ testutil_check(__wt_snprintf(
+ new_uri, sizeof(new_uri), "%s.%u", uri, my_uid));
+
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
+ testutil_check(session->create(session, new_uri, config));
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit create unique");
+
+ __wt_yield();
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
+ while ((ret = session->drop(
+ session, new_uri, force ? "force" : NULL)) != 0)
+ if (ret != EBUSY)
+ testutil_die(ret, "session.drop: %s", new_uri);
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit create unique");
+
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * test_drop --
+ * Test dropping a table.
+ */
+static void
+test_drop(THREAD_DATA *td, int force)
+{
+ WT_SESSION *session;
+ int ret;
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
+ if ((ret = session->drop(session, uri, force ? "force" : NULL)) != 0)
+ if (ret != ENOENT && ret != EBUSY)
+ testutil_die(ret, "session.drop");
+
+ if (use_txn) {
+ /*
+ * As the operations are being performed concurrently,
+ * return value can be ENOENT or EBUSY will set
+ * error to transaction opened by session. In these
+ * cases the transaction has to be aborted.
+ */
+ if (ret != ENOENT && ret != EBUSY)
+ ret = session->commit_transaction(session, NULL);
+ else
+ ret = session->rollback_transaction(session, NULL);
+ if (ret == EINVAL)
+ testutil_die(ret, "session.commit drop");
+ }
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * test_rebalance --
+ * Rebalance a tree.
+ */
+static void
+test_rebalance(THREAD_DATA *td)
+{
+ WT_SESSION *session;
+ int ret;
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ if ((ret = session->rebalance(session, uri, NULL)) != 0)
+ if (ret != ENOENT && ret != EBUSY)
+ testutil_die(ret, "session.rebalance");
+
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * test_upgrade --
+ * Upgrade a tree.
+ */
+static void
+test_upgrade(THREAD_DATA *td)
+{
+ WT_SESSION *session;
+ int ret;
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ if ((ret = session->upgrade(session, uri, NULL)) != 0)
+ if (ret != ENOENT && ret != EBUSY)
+ testutil_die(ret, "session.upgrade");
+
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * test_verify --
+ * Verify a tree.
+ */
+static void
+test_verify(THREAD_DATA *td)
+{
+ WT_SESSION *session;
+ int ret;
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+
+ if ((ret = session->verify(session, uri, NULL)) != 0)
+ if (ret != ENOENT && ret != EBUSY)
+ testutil_die(ret, "session.verify");
+
+ testutil_check(session->close(session, NULL));
+}
+
+/*
+ * thread_ts_run --
+ * Runner function for a timestamp thread.
+ */
+static WT_THREAD_RET
+thread_ts_run(void *arg)
+{
+ WT_SESSION *session;
+ THREAD_DATA *td;
+ uint64_t i, last_ts, oldest_ts, this_ts;
+ char tscfg[64];
+
+ td = (THREAD_DATA *)arg;
+ last_ts = 0;
+
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+ /*
+ * Every N records we will record our stable timestamp into the stable
+ * table. That will define our threshold where we expect to find records
+ * after recovery.
+ */
+ for (;;) {
+ oldest_ts = UINT64_MAX;
+ /*
+ * For the timestamp thread, the info field contains the number
+ * of worker threads.
+ */
+ for (i = 0; i < td->info; ++i) {
+ /*
+ * We need to let all threads get started, so if we find
+ * any thread still with a zero timestamp we go to
+ * sleep.
+ */
+ this_ts = th_ts[i].ts;
+ if (this_ts == 0)
+ goto ts_wait;
+ else if (this_ts < oldest_ts)
+ oldest_ts = this_ts;
+ }
+
+ if (oldest_ts != UINT64_MAX &&
+ oldest_ts - last_ts > STABLE_PERIOD) {
+ /*
+ * Set both the oldest and stable timestamp so that we
+ * don't need to maintain read availability at older
+ * timestamps.
+ */
+ testutil_check(__wt_snprintf(
+ tscfg, sizeof(tscfg),
+ "oldest_timestamp=%" PRIx64
+ ",stable_timestamp=%" PRIx64,
+ oldest_ts, oldest_ts));
+ testutil_check(
+ td->conn->set_timestamp(td->conn, tscfg));
+ last_ts = oldest_ts;
+ if (!stable_set) {
+ stable_set = true;
+ printf("SET STABLE: %" PRIx64 " %" PRIu64 "\n",
+ oldest_ts, oldest_ts);
+ }
+ } else
+ts_wait: __wt_sleep(0, 1000);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * thread_ckpt_run --
+ * Runner function for the checkpoint thread.
+ */
+static WT_THREAD_RET
+thread_ckpt_run(void *arg)
+{
+ struct timespec now, start;
+ FILE *fp;
+ WT_RAND_STATE rnd;
+ WT_SESSION *session;
+ THREAD_DATA *td;
+ uint64_t ts;
+ uint32_t sleep_time;
+ int i;
+ bool first_ckpt;
+
+ __wt_random_init(&rnd);
+
+ td = (THREAD_DATA *)arg;
+ /*
+ * Keep a separate file with the records we wrote for checking.
+ */
+ (void)unlink(ckpt_file);
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+ first_ckpt = true;
+ ts = 0;
+ /*
+ * Keep writing checkpoints until killed by parent.
+ */
+ __wt_epoch(NULL, &start);
+ i = 0;
+ while (true) {
+ sleep_time = __wt_random(&rnd) % MAX_CKPT_INVL;
+ sleep(sleep_time);
+ if (use_ts) {
+ ts = global_ts;
+ /*
+ * If we're using timestamps wait for the stable
+ * timestamp to get set the first time.
+ */
+ if (!stable_set) {
+ __wt_epoch(NULL, &now);
+ if (WT_TIMEDIFF_SEC(now, start) >= 1)
+ printf("CKPT: !stable_set time %"
+ PRIu64 "\n",
+ WT_TIMEDIFF_SEC(now, start));
+ if (WT_TIMEDIFF_SEC(now, start) >
+ MAX_STARTUP) {
+ fprintf(stderr,
+ "After %d seconds stable still not "
+ "set. Aborting.\n", MAX_STARTUP);
+ /*
+ * For the checkpoint thread the info
+ * contains the number of threads.
+ */
+ dump_ts(td->info);
+ abort();
+ }
+ continue;
+ }
+ }
+ /*
+ * Since this is the default, send in this string even if
+ * running without timestamps.
+ */
+ testutil_check(session->checkpoint(
+ session, "use_timestamp=true"));
+ printf("Checkpoint %d complete. Minimum ts %" PRIu64 "\n",
+ ++i, ts);
+ fflush(stdout);
+ /*
+ * Create the checkpoint file so that the parent process knows
+ * at least one checkpoint has finished and can start its
+ * timer. Start the timer for stable after the first checkpoint
+ * completes because a slow I/O lag during the checkpoint can
+ * cause a false positive for a timeout.
+ */
+ if (first_ckpt) {
+ testutil_checksys((fp = fopen(ckpt_file, "w")) == NULL);
+ first_ckpt = false;
+ testutil_checksys(fclose(fp) != 0);
+ }
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * thread_run --
+ * Runner function for the worker threads.
+ */
+static WT_THREAD_RET
+thread_run(void *arg)
+{
+ FILE *fp;
+ WT_CURSOR *cur_coll, *cur_local, *cur_oplog;
+ WT_ITEM data;
+ WT_RAND_STATE rnd;
+ WT_SESSION *oplog_session, *session;
+ THREAD_DATA *td;
+ uint64_t i, stable_ts;
+ char cbuf[MAX_VAL], lbuf[MAX_VAL], obuf[MAX_VAL];
+ char kname[64], tscfg[64];
+ bool use_prep;
+
+ __wt_random_init(&rnd);
+ memset(cbuf, 0, sizeof(cbuf));
+ memset(lbuf, 0, sizeof(lbuf));
+ memset(obuf, 0, sizeof(obuf));
+ memset(kname, 0, sizeof(kname));
+
+ td = (THREAD_DATA *)arg;
+ /*
+ * Set up the separate file for checking.
+ */
+ testutil_check(__wt_snprintf(
+ cbuf, sizeof(cbuf), RECORDS_FILE, td->info));
+ (void)unlink(cbuf);
+ testutil_checksys((fp = fopen(cbuf, "w")) == NULL);
+ /*
+ * Set to line buffering. But that is advisory only. We've seen
+ * cases where the result files end up with partial lines.
+ */
+ __wt_stream_set_line_buffer(fp);
+
+ /*
+ * Have half the threads use prepared transactions if timestamps
+ * are in use.
+ */
+ use_prep = (use_ts && td->info % 2 == 0) ? true : false;
+ /*
+ * We may have two sessions so that the oplog session can have its own
+ * transaction in parallel with the collection session for threads
+ * that are going to be using prepared transactions. We need this
+ * because prepared transactions cannot have any operations that modify
+ * a table that is logged. But we also want to test mixed logged and
+ * not-logged transactions.
+ */
+ testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
+ /*
+ * Open a cursor to each table.
+ */
+ testutil_check(session->open_cursor(session,
+ uri_collection, NULL, NULL, &cur_coll));
+ testutil_check(session->open_cursor(session,
+ uri_local, NULL, NULL, &cur_local));
+ oplog_session = NULL;
+ if (use_prep) {
+ testutil_check(td->conn->open_session(
+ td->conn, NULL, NULL, &oplog_session));
+ testutil_check(session->open_cursor(oplog_session,
+ uri_oplog, NULL, NULL, &cur_oplog));
+ } else
+ testutil_check(session->open_cursor(session,
+ uri_oplog, NULL, NULL, &cur_oplog));
+
+ /*
+ * Write our portion of the key space until we're killed.
+ */
+ printf("Thread %" PRIu32 " starts at %" PRIu64 "\n",
+ td->info, td->start);
+ stable_ts = 0;
+ for (i = td->start;; ++i) {
+ /*
+ * Allow some threads to skip schema operations so that they
+ * are generating sufficient dirty data.
+ */
+ WT_PUBLISH(th_ts[td->info].op, NOOP);
+ if (td->info != 0 && td->info != 1)
+ /*
+ * Do a schema operation about 50% of the time by having
+ * a case for only about half the possible mod values.
+ */
+ switch (__wt_random(&rnd) % 20) {
+ case 0:
+ WT_PUBLISH(th_ts[td->info].op, BULK);
+ test_bulk(td);
+ break;
+ case 1:
+ WT_PUBLISH(th_ts[td->info].op, BULK_UNQ);
+ test_bulk_unique(td, __wt_random(&rnd) & 1);
+ break;
+ case 2:
+ WT_PUBLISH(th_ts[td->info].op, CREATE);
+ test_create(td);
+ break;
+ case 3:
+ WT_PUBLISH(th_ts[td->info].op, CREATE_UNQ);
+ test_create_unique(td, __wt_random(&rnd) & 1);
+ break;
+ case 4:
+ WT_PUBLISH(th_ts[td->info].op, CURSOR);
+ test_cursor(td);
+ break;
+ case 5:
+ WT_PUBLISH(th_ts[td->info].op, DROP);
+ test_drop(td, __wt_random(&rnd) & 1);
+ break;
+ case 6:
+ WT_PUBLISH(th_ts[td->info].op, REBALANCE);
+ test_rebalance(td);
+ break;
+ case 7:
+ WT_PUBLISH(th_ts[td->info].op, UPGRADE);
+ test_upgrade(td);
+ break;
+ case 8:
+ WT_PUBLISH(th_ts[td->info].op, VERIFY);
+ test_verify(td);
+ break;
+ }
+ if (use_ts)
+ stable_ts = __wt_atomic_addv64(&global_ts, 1);
+ testutil_check(__wt_snprintf(
+ kname, sizeof(kname), "%" PRIu64, i));
+
+ testutil_check(session->begin_transaction(session, NULL));
+ if (use_prep)
+ testutil_check(oplog_session->begin_transaction(
+ oplog_session, NULL));
+ cur_coll->set_key(cur_coll, kname);
+ cur_local->set_key(cur_local, kname);
+ cur_oplog->set_key(cur_oplog, kname);
+ /*
+ * Put an informative string into the value so that it
+ * can be viewed well in a binary dump.
+ */
+ testutil_check(__wt_snprintf(cbuf, sizeof(cbuf),
+ "COLL: thread:%" PRIu64 " ts:%" PRIu64 " key: %" PRIu64,
+ td->info, stable_ts, i));
+ testutil_check(__wt_snprintf(lbuf, sizeof(lbuf),
+ "LOCAL: thread:%" PRIu64 " ts:%" PRIu64 " key: %" PRIu64,
+ td->info, stable_ts, i));
+ testutil_check(__wt_snprintf(obuf, sizeof(obuf),
+ "OPLOG: thread:%" PRIu64 " ts:%" PRIu64 " key: %" PRIu64,
+ td->info, stable_ts, i));
+ data.size = __wt_random(&rnd) % MAX_VAL;
+ data.data = cbuf;
+ cur_coll->set_value(cur_coll, &data);
+ testutil_check(cur_coll->insert(cur_coll));
+ data.size = __wt_random(&rnd) % MAX_VAL;
+ data.data = obuf;
+ cur_oplog->set_value(cur_oplog, &data);
+ testutil_check(cur_oplog->insert(cur_oplog));
+ if (use_ts) {
+ /*
+ * Run with prepare every once in a while. And also
+ * yield after prepare sometimes too. This is only done
+ * on the regular session.
+ */
+ if (use_prep && i % PREPARE_FREQ == 0) {
+ testutil_check(__wt_snprintf(
+ tscfg, sizeof(tscfg),
+ "prepare_timestamp=%" PRIx64, stable_ts));
+ testutil_check(session->prepare_transaction(
+ session, tscfg));
+ if (i % PREPARE_YIELD == 0)
+ __wt_yield();
+ }
+ testutil_check(__wt_snprintf(tscfg, sizeof(tscfg),
+ "commit_timestamp=%" PRIx64, stable_ts));
+ testutil_check(
+ session->commit_transaction(session, tscfg));
+ if (use_prep)
+ testutil_check(
+ oplog_session->commit_transaction(
+ oplog_session, tscfg));
+ /*
+ * Update the thread's last-committed timestamp.
+ * Don't let the compiler re-order this statement,
+ * if we were to race with the timestamp thread, it
+ * might see our thread update before the commit.
+ */
+ WT_PUBLISH(th_ts[td->info].ts, stable_ts);
+ } else {
+ testutil_check(
+ session->commit_transaction(session, NULL));
+ if (use_prep)
+ testutil_check(
+ oplog_session->commit_transaction(
+ oplog_session, NULL));
+ }
+ /*
+ * Insert into the local table outside the timestamp txn.
+ */
+ data.size = __wt_random(&rnd) % MAX_VAL;
+ data.data = lbuf;
+ cur_local->set_value(cur_local, &data);
+ testutil_check(cur_local->insert(cur_local));
+
+ /*
+ * Save the timestamp and key separately for checking later.
+ */
+ if (fprintf(fp,
+ "%" PRIu64 " %" PRIu64 "\n", stable_ts, i) < 0)
+ testutil_die(EIO, "fprintf");
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Child process creates the database and table, and then creates worker
+ * threads to add data until it is killed by the parent.
+ */
+static void run_workload(uint32_t)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+static void
+run_workload(uint32_t nth)
+{
+ WT_CONNECTION *conn;
+ WT_SESSION *session;
+ THREAD_DATA *td;
+ wt_thread_t *thr;
+ uint32_t ckpt_id, i, ts_id;
+ char envconf[512];
+
+ thr = dcalloc(nth+2, sizeof(*thr));
+ td = dcalloc(nth+2, sizeof(THREAD_DATA));
+ stable_set = false;
+ if (chdir(home) != 0)
+ testutil_die(errno, "Child chdir: %s", home);
+ if (inmem)
+ strcpy(envconf, ENV_CONFIG_DEF);
+ else
+ strcpy(envconf, ENV_CONFIG_TXNSYNC);
+ if (compat)
+ strcat(envconf, ENV_CONFIG_COMPAT);
+
+ testutil_check(wiredtiger_open(NULL, &event_handler, envconf, &conn));
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ /*
+ * Create all the tables.
+ */
+ testutil_check(session->create(session, uri_collection,
+ "key_format=S,value_format=u,log=(enabled=false)"));
+ testutil_check(session->create(session,
+ uri_local, "key_format=S,value_format=u"));
+ testutil_check(session->create(session,
+ uri_oplog, "key_format=S,value_format=u"));
+ /*
+ * Don't log the stable timestamp table so that we know what timestamp
+ * was stored at the checkpoint.
+ */
+ testutil_check(session->close(session, NULL));
+
+ /*
+ * The checkpoint thread and the timestamp threads are added at the end.
+ */
+ ckpt_id = nth;
+ td[ckpt_id].conn = conn;
+ td[ckpt_id].info = nth;
+ printf("Create checkpoint thread\n");
+ testutil_check(__wt_thread_create(
+ NULL, &thr[ckpt_id], thread_ckpt_run, &td[ckpt_id]));
+ ts_id = nth + 1;
+ if (use_ts) {
+ td[ts_id].conn = conn;
+ td[ts_id].info = nth;
+ printf("Create timestamp thread\n");
+ testutil_check(__wt_thread_create(
+ NULL, &thr[ts_id], thread_ts_run, &td[ts_id]));
+ }
+ printf("Create %" PRIu32 " writer threads\n", nth);
+ for (i = 0; i < nth; ++i) {
+ td[i].conn = conn;
+ td[i].start = WT_BILLION * (uint64_t)i;
+ td[i].info = i;
+ testutil_check(__wt_thread_create(
+ NULL, &thr[i], thread_run, &td[i]));
+ }
+ /*
+ * The threads never exit, so the child will just wait here until
+ * it is killed.
+ */
+ fflush(stdout);
+ for (i = 0; i <= ts_id; ++i)
+ testutil_check(__wt_thread_join(NULL, &thr[i]));
+ /*
+ * NOTREACHED
+ */
+ free(thr);
+ free(td);
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * Determines whether this is a timestamp build or not
+ */
+static bool
+timestamp_build(void)
+{
+#ifdef HAVE_TIMESTAMPS
+ return (true);
+#else
+ return (false);
+#endif
+}
+
+extern int __wt_optind;
+extern char *__wt_optarg;
+
+/*
+ * Initialize a report structure. Since zero is a valid key we
+ * cannot just clear it.
+ */
+static void
+initialize_rep(REPORT *r)
+{
+ r->first_key = r->first_miss = INVALID_KEY;
+ r->absent_key = r->exist_key = r->last_key = INVALID_KEY;
+}
+
+/*
+ * Print out information if we detect missing records in the
+ * middle of the data of a report structure.
+ */
+static void
+print_missing(REPORT *r, const char *fname, const char *msg)
+{
+ if (r->exist_key != INVALID_KEY)
+ printf("%s: %s error %" PRIu64
+ " absent records %" PRIu64 "-%" PRIu64
+ ". Then keys %" PRIu64 "-%" PRIu64 " exist."
+ " Key range %" PRIu64 "-%" PRIu64 "\n",
+ fname, msg,
+ (r->exist_key - r->first_miss) - 1,
+ r->first_miss, r->exist_key - 1,
+ r->exist_key, r->last_key,
+ r->first_key, r->last_key);
+}
+
+/*
+ * Signal handler to catch if the child died unexpectedly.
+ */
+static void
+sig_handler(int sig)
+{
+ pid_t pid;
+
+ WT_UNUSED(sig);
+ pid = wait(NULL);
+ /*
+ * The core file will indicate why the child exited. Choose EINVAL here.
+ */
+ testutil_die(EINVAL,
+ "Child process %" PRIu64 " abnormally exited", (uint64_t)pid);
+}
+
+int
+main(int argc, char *argv[])
+{
+ struct sigaction sa;
+ struct stat sb;
+ FILE *fp;
+ REPORT c_rep[MAX_TH], l_rep[MAX_TH], o_rep[MAX_TH];
+ WT_CONNECTION *conn;
+ WT_CURSOR *cur_coll, *cur_local, *cur_oplog;
+ WT_RAND_STATE rnd;
+ WT_SESSION *session;
+ pid_t pid;
+ uint64_t absent_coll, absent_local, absent_oplog, count, key, last_key;
+ uint64_t stable_fp, stable_val;
+ uint32_t i;
+ int ret;
+ char fname[64], kname[64];
+ bool fatal;
+ uint32_t nth, timeout;
+ int ch, status;
+ const char *working_dir;
+ char buf[512], statname[1024];
+ bool rand_th, rand_time, verify_only;
+
+ /* We have nothing to do if this is not a timestamp build */
+ if (!timestamp_build())
+ return (EXIT_SUCCESS);
+
+ (void)testutil_set_progname(argv);
+
+ compat = inmem = false;
+ use_ts = true;
+ /*
+ * Setting this to false forces us to use internal library code.
+ * Allow an override but default to using that code.
+ */
+ use_txn = false;
+ nth = MIN_TH;
+ rand_th = rand_time = true;
+ timeout = MIN_TIME;
+ verify_only = false;
+ working_dir = "WT_TEST.schema-abort";
+
+ while ((ch = __wt_getopt(progname, argc, argv, "Ch:mT:t:vxz")) != EOF)
+ switch (ch) {
+ case 'C':
+ compat = true;
+ break;
+ case 'h':
+ working_dir = __wt_optarg;
+ break;
+ case 'm':
+ inmem = true;
+ break;
+ case 'T':
+ rand_th = false;
+ nth = (uint32_t)atoi(__wt_optarg);
+ break;
+ case 't':
+ rand_time = false;
+ timeout = (uint32_t)atoi(__wt_optarg);
+ break;
+ case 'v':
+ verify_only = true;
+ break;
+ case 'x':
+ use_txn = true;
+ break;
+ case 'z':
+ use_ts = false;
+ break;
+ default:
+ usage();
+ }
+ argc -= __wt_optind;
+ if (argc != 0)
+ usage();
+
+ testutil_work_dir_from_path(home, sizeof(home), working_dir);
+ /*
+ * If the user wants to verify they need to tell us how many threads
+ * there were so we can find the old record files.
+ */
+ if (verify_only && rand_th) {
+ fprintf(stderr,
+ "Verify option requires specifying number of threads\n");
+ exit (EXIT_FAILURE);
+ }
+ if (!verify_only) {
+ testutil_make_work_dir(home);
+
+ __wt_random_init_seed(NULL, &rnd);
+ if (rand_time) {
+ timeout = __wt_random(&rnd) % MAX_TIME;
+ if (timeout < MIN_TIME)
+ timeout = MIN_TIME;
+ }
+ if (rand_th) {
+ nth = __wt_random(&rnd) % MAX_TH;
+ if (nth < MIN_TH)
+ nth = MIN_TH;
+ }
+
+ printf("Parent: compatibility: %s, "
+ "in-mem log sync: %s, timestamp in use: %s\n",
+ compat ? "true" : "false",
+ inmem ? "true" : "false",
+ use_ts ? "true" : "false");
+ printf("Parent: Create %" PRIu32
+ " threads; sleep %" PRIu32 " seconds\n", nth, timeout);
+ printf("CONFIG: %s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n",
+ progname,
+ compat ? " -C" : "",
+ inmem ? " -m" : "",
+ !use_ts ? " -z" : "",
+ working_dir, nth, timeout);
+ /*
+ * Fork a child to insert as many items. We will then randomly
+ * kill the child, run recovery and make sure all items we wrote
+ * exist after recovery runs.
+ */
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = sig_handler;
+ testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
+ testutil_checksys((pid = fork()) < 0);
+
+ if (pid == 0) { /* child */
+ run_workload(nth);
+ return (EXIT_SUCCESS);
+ }
+
+ /* parent */
+ /*
+ * Sleep for the configured amount of time before killing
+ * the child. Start the timeout from the time we notice that
+ * the file has been created. That allows the test to run
+ * correctly on really slow machines.
+ */
+ testutil_check(__wt_snprintf(
+ statname, sizeof(statname), "%s/%s", home, ckpt_file));
+ while (stat(statname, &sb) != 0)
+ sleep(1);
+ sleep(timeout);
+ sa.sa_handler = SIG_DFL;
+ testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
+
+ /*
+ * !!! It should be plenty long enough to make sure more than
+ * one log file exists. If wanted, that check would be added
+ * here.
+ */
+ printf("Kill child\n");
+ testutil_checksys(kill(pid, SIGKILL) != 0);
+ testutil_checksys(waitpid(pid, &status, 0) == -1);
+ }
+ /*
+ * !!! If we wanted to take a copy of the directory before recovery,
+ * this is the place to do it. Don't do it all the time because
+ * it can use a lot of disk space, which can cause test machine
+ * issues.
+ */
+ if (chdir(home) != 0)
+ testutil_die(errno, "parent chdir: %s", home);
+ /*
+ * The tables can get very large, so while we'd ideally like to
+ * copy the entire database, we only copy the log files for now.
+ * Otherwise it can take far too long to run the test, particularly
+ * in automated testing.
+ */
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "rm -rf ../%s.SAVE && mkdir ../%s.SAVE && "
+ "cp -p * ../%s.SAVE",
+ home, home, home));
+ if ((status = system(buf)) < 0)
+ testutil_die(status, "system: %s", buf);
+ printf("Open database, run recovery and verify content\n");
+
+ /*
+ * Open the connection which forces recovery to be run.
+ */
+ testutil_check(wiredtiger_open(
+ NULL, &event_handler, ENV_CONFIG_REC, &conn));
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ /*
+ * Open a cursor on all the tables.
+ */
+ testutil_check(session->open_cursor(session,
+ uri_collection, NULL, NULL, &cur_coll));
+ testutil_check(session->open_cursor(session,
+ uri_local, NULL, NULL, &cur_local));
+ testutil_check(session->open_cursor(session,
+ uri_oplog, NULL, NULL, &cur_oplog));
+
+ /*
+ * Find the biggest stable timestamp value that was saved.
+ */
+ stable_val = 0;
+ if (use_ts) {
+ testutil_check(
+ conn->query_timestamp(conn, buf, "get=recovery"));
+ sscanf(buf, "%" SCNx64, &stable_val);
+ printf("Got stable_val %" PRIu64 "\n", stable_val);
+ }
+
+ count = 0;
+ absent_coll = absent_local = absent_oplog = 0;
+ fatal = false;
+ for (i = 0; i < nth; ++i) {
+ initialize_rep(&c_rep[i]);
+ initialize_rep(&l_rep[i]);
+ initialize_rep(&o_rep[i]);
+ testutil_check(__wt_snprintf(
+ fname, sizeof(fname), RECORDS_FILE, i));
+ if ((fp = fopen(fname, "r")) == NULL)
+ testutil_die(errno, "fopen: %s", fname);
+
+ /*
+ * For every key in the saved file, verify that the key exists
+ * in the table after recovery. If we're doing in-memory
+ * log buffering we never expect a record missing in the middle,
+ * but records may be missing at the end. If we did
+ * write-no-sync, we expect every key to have been recovered.
+ */
+ for (last_key = INVALID_KEY;; ++count, last_key = key) {
+ ret = fscanf(fp, "%" SCNu64 "%" SCNu64 "\n",
+ &stable_fp, &key);
+ if (last_key == INVALID_KEY) {
+ c_rep[i].first_key = key;
+ l_rep[i].first_key = key;
+ o_rep[i].first_key = key;
+ }
+ if (ret != EOF && ret != 2) {
+ /*
+ * If we find a partial line, consider it
+ * like an EOF.
+ */
+ if (ret == 1 || ret == 0)
+ break;
+ testutil_die(errno, "fscanf");
+ }
+ if (ret == EOF)
+ break;
+ /*
+ * If we're unlucky, the last line may be a partially
+ * written key at the end that can result in a false
+ * negative error for a missing record. Detect it.
+ */
+ if (last_key != INVALID_KEY && key != last_key + 1) {
+ printf("%s: Ignore partial record %" PRIu64
+ " last valid key %" PRIu64 "\n",
+ fname, key, last_key);
+ break;
+ }
+ testutil_check(__wt_snprintf(
+ kname, sizeof(kname), "%" PRIu64, key));
+ cur_coll->set_key(cur_coll, kname);
+ cur_local->set_key(cur_local, kname);
+ cur_oplog->set_key(cur_oplog, kname);
+ /*
+ * The collection table should always only have the
+ * data as of the checkpoint.
+ */
+ if ((ret = cur_coll->search(cur_coll)) != 0) {
+ if (ret != WT_NOTFOUND)
+ testutil_die(ret, "search");
+ /*
+ * If we don't find a record, the stable
+ * timestamp written to our file better be
+ * larger than the saved one.
+ */
+ if (!inmem &&
+ stable_fp != 0 && stable_fp <= stable_val) {
+ printf("%s: COLLECTION no record with "
+ "key %" PRIu64 " record ts %" PRIu64
+ " <= stable ts %" PRIu64 "\n",
+ fname, key, stable_fp, stable_val);
+ absent_coll++;
+ }
+ if (c_rep[i].first_miss == INVALID_KEY)
+ c_rep[i].first_miss = key;
+ c_rep[i].absent_key = key;
+ } else if (c_rep[i].absent_key != INVALID_KEY &&
+ c_rep[i].exist_key == INVALID_KEY) {
+ /*
+ * If we get here we found a record that exists
+ * after absent records, a hole in our data.
+ */
+ c_rep[i].exist_key = key;
+ fatal = true;
+ } else if (!inmem &&
+ stable_fp != 0 && stable_fp > stable_val) {
+ /*
+ * If we found a record, the stable timestamp
+ * written to our file better be no larger
+ * than the checkpoint one.
+ */
+ printf("%s: COLLECTION record with "
+ "key %" PRIu64 " record ts %" PRIu64
+ " > stable ts %" PRIu64 "\n",
+ fname, key, stable_fp, stable_val);
+ fatal = true;
+ }
+ /*
+ * The local table should always have all data.
+ */
+ if ((ret = cur_local->search(cur_local)) != 0) {
+ if (ret != WT_NOTFOUND)
+ testutil_die(ret, "search");
+ if (!inmem)
+ printf("%s: LOCAL no record with key %"
+ PRIu64 "\n", fname, key);
+ absent_local++;
+ if (l_rep[i].first_miss == INVALID_KEY)
+ l_rep[i].first_miss = key;
+ l_rep[i].absent_key = key;
+ } else if (l_rep[i].absent_key != INVALID_KEY &&
+ l_rep[i].exist_key == INVALID_KEY) {
+ /*
+ * We should never find an existing key after
+ * we have detected one missing.
+ */
+ l_rep[i].exist_key = key;
+ fatal = true;
+ }
+ /*
+ * The oplog table should always have all data.
+ */
+ if ((ret = cur_oplog->search(cur_oplog)) != 0) {
+ if (ret != WT_NOTFOUND)
+ testutil_die(ret, "search");
+ if (!inmem)
+ printf("%s: OPLOG no record with key %"
+ PRIu64 "\n", fname, key);
+ absent_oplog++;
+ if (o_rep[i].first_miss == INVALID_KEY)
+ o_rep[i].first_miss = key;
+ o_rep[i].absent_key = key;
+ } else if (o_rep[i].absent_key != INVALID_KEY &&
+ o_rep[i].exist_key == INVALID_KEY) {
+ /*
+ * We should never find an existing key after
+ * we have detected one missing.
+ */
+ o_rep[i].exist_key = key;
+ fatal = true;
+ }
+ }
+ c_rep[i].last_key = last_key;
+ l_rep[i].last_key = last_key;
+ o_rep[i].last_key = last_key;
+ testutil_checksys(fclose(fp) != 0);
+ print_missing(&c_rep[i], fname, "COLLECTION");
+ print_missing(&l_rep[i], fname, "LOCAL");
+ print_missing(&o_rep[i], fname, "OPLOG");
+ }
+ testutil_check(conn->close(conn, NULL));
+ if (!inmem && absent_coll) {
+ printf("COLLECTION: %" PRIu64
+ " record(s) absent from %" PRIu64 "\n",
+ absent_coll, count);
+ fatal = true;
+ }
+ if (!inmem && absent_local) {
+ printf("LOCAL: %" PRIu64 " record(s) absent from %" PRIu64 "\n",
+ absent_local, count);
+ fatal = true;
+ }
+ if (!inmem && absent_oplog) {
+ printf("OPLOG: %" PRIu64 " record(s) absent from %" PRIu64 "\n",
+ absent_oplog, count);
+ fatal = true;
+ }
+ if (fatal)
+ return (EXIT_FAILURE);
+ printf("%" PRIu64 " records verified\n", count);
+ return (EXIT_SUCCESS);
+}
diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh
new file mode 100755
index 00000000000..41d702d40cd
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/schema_abort/smoke.sh
@@ -0,0 +1,12 @@
+#! /bin/sh
+
+set -e
+
+# Smoke-test schema-abort as part of running "make check".
+
+$TEST_WRAPPER ./test_schema_abort -t 10 -T 5
+$TEST_WRAPPER ./test_schema_abort -m -t 10 -T 5
+$TEST_WRAPPER ./test_schema_abort -C -t 10 -T 5
+$TEST_WRAPPER ./test_schema_abort -C -m -t 10 -T 5
+$TEST_WRAPPER ./test_schema_abort -m -t 10 -T 5 -z
+$TEST_WRAPPER ./test_schema_abort -m -t 10 -T 5 -x
diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
index 8a1781eae45..5aec69cf034 100644
--- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
@@ -68,9 +68,10 @@ static char home[1024]; /* Program working dir */
#define RECORDS_FILE "records-%" PRIu32
#define STABLE_PERIOD 100
-static const char * const uri_local = "table:local";
-static const char * const uri_oplog = "table:oplog";
-static const char * const uri_collection = "table:collection";
+static const char * table_pfx = "table";
+static const char * const uri_local = "local";
+static const char * const uri_oplog = "oplog";
+static const char * const uri_collection = "collection";
static const char * const ckpt_file = "checkpoint_done";
@@ -210,7 +211,7 @@ thread_ckpt_run(void *arg)
session, "use_timestamp=true"));
testutil_check(td->conn->query_timestamp(
td->conn, buf, "get=last_checkpoint"));
- sscanf(buf, "%" SCNx64, &stable);
+ testutil_assert(sscanf(buf, "%" SCNx64, &stable) == 1);
printf("Checkpoint %d complete at stable %"
PRIu64 ".\n", i, stable);
fflush(stdout);
@@ -243,7 +244,7 @@ thread_run(void *arg)
THREAD_DATA *td;
uint64_t i, stable_ts;
char cbuf[MAX_VAL], lbuf[MAX_VAL], obuf[MAX_VAL];
- char kname[64], tscfg[64];
+ char kname[64], tscfg[64], uri[128];
bool use_prep;
__wt_random_init(&rnd);
@@ -267,10 +268,12 @@ thread_run(void *arg)
__wt_stream_set_line_buffer(fp);
/*
- * Have half the threads use prepared transactions if timestamps
- * are in use.
+ * Have 10% of the threads use prepared transactions if timestamps
+ * are in use. Thread numbers start at 0 so we're always guaranteed
+ * that at least one thread is using prepared transactions.
*/
- use_prep = (use_ts && td->info % 2 == 0) ? true : false;
+ use_prep = (use_ts && td->info % 10 == 0) ? true : false;
+
/*
* We may have two sessions so that the oplog session can have its own
* transaction in parallel with the collection session for threads
@@ -283,19 +286,25 @@ thread_run(void *arg)
/*
* Open a cursor to each table.
*/
+ testutil_check(__wt_snprintf(
+ uri, sizeof(uri), "%s:%s", table_pfx, uri_collection));
testutil_check(session->open_cursor(session,
- uri_collection, NULL, NULL, &cur_coll));
+ uri, NULL, NULL, &cur_coll));
+ testutil_check(__wt_snprintf(
+ uri, sizeof(uri), "%s:%s", table_pfx, uri_local));
testutil_check(session->open_cursor(session,
- uri_local, NULL, NULL, &cur_local));
+ uri, NULL, NULL, &cur_local));
+ testutil_check(__wt_snprintf(
+ uri, sizeof(uri), "%s:%s", table_pfx, uri_oplog));
oplog_session = NULL;
if (use_prep) {
testutil_check(td->conn->open_session(
td->conn, NULL, NULL, &oplog_session));
testutil_check(session->open_cursor(oplog_session,
- uri_oplog, NULL, NULL, &cur_oplog));
+ uri, NULL, NULL, &cur_oplog));
} else
testutil_check(session->open_cursor(session,
- uri_oplog, NULL, NULL, &cur_oplog));
+ uri, NULL, NULL, &cur_oplog));
/*
* Write our portion of the key space until we're killed.
@@ -313,6 +322,17 @@ thread_run(void *arg)
if (use_prep)
testutil_check(oplog_session->begin_transaction(
oplog_session, NULL));
+ /*
+ * If not using prepared transactions set the timestamp now
+ * before performing the operation. If we are using prepared
+ * transactions, it must be set after the prepare.
+ */
+ if (use_ts && !use_prep) {
+ testutil_check(__wt_snprintf(tscfg, sizeof(tscfg),
+ "commit_timestamp=%" PRIx64, stable_ts));
+ testutil_check(
+ session->timestamp_transaction(session, tscfg));
+ }
cur_coll->set_key(cur_coll, kname);
cur_local->set_key(cur_local, kname);
cur_oplog->set_key(cur_oplog, kname);
@@ -352,10 +372,21 @@ thread_run(void *arg)
if (i % PREPARE_YIELD == 0)
__wt_yield();
}
- testutil_check(__wt_snprintf(tscfg, sizeof(tscfg),
- "commit_timestamp=%" PRIx64, stable_ts));
- testutil_check(
- session->commit_transaction(session, tscfg));
+ /*
+ * If we did not set the timestamp above via
+ * timestamp_transaction send it now on commit.
+ */
+ if (use_ts && !use_prep)
+ testutil_check(
+ session->commit_transaction(session, NULL));
+ else {
+ testutil_check(
+ __wt_snprintf(tscfg, sizeof(tscfg),
+ "commit_timestamp=%" PRIx64, stable_ts));
+ testutil_check(
+ session->commit_transaction(session,
+ tscfg));
+ }
if (use_prep)
testutil_check(
oplog_session->commit_transaction(
@@ -407,7 +438,7 @@ run_workload(uint32_t nth)
THREAD_DATA *td;
wt_thread_t *thr;
uint32_t ckpt_id, i, ts_id;
- char envconf[512];
+ char envconf[512], uri[128];
thr = dcalloc(nth+2, sizeof(*thr));
td = dcalloc(nth+2, sizeof(THREAD_DATA));
@@ -425,12 +456,18 @@ run_workload(uint32_t nth)
/*
* Create all the tables.
*/
- testutil_check(session->create(session, uri_collection,
+ testutil_check(__wt_snprintf(
+ uri, sizeof(uri), "%s:%s", table_pfx, uri_collection));
+ testutil_check(session->create(session, uri,
"key_format=S,value_format=u,log=(enabled=false)"));
+ testutil_check(__wt_snprintf(
+ uri, sizeof(uri), "%s:%s", table_pfx, uri_local));
testutil_check(session->create(session,
- uri_local, "key_format=S,value_format=u"));
+ uri, "key_format=S,value_format=u"));
+ testutil_check(__wt_snprintf(
+ uri, sizeof(uri), "%s:%s", table_pfx, uri_oplog));
testutil_check(session->create(session,
- uri_oplog, "key_format=S,value_format=u"));
+ uri, "key_format=S,value_format=u"));
/*
* Don't log the stable timestamp table so that we know what timestamp
* was stored at the checkpoint.
@@ -468,7 +505,7 @@ run_workload(uint32_t nth)
*/
fflush(stdout);
for (i = 0; i <= ts_id; ++i)
- testutil_check(__wt_thread_join(NULL, thr[i]));
+ testutil_check(__wt_thread_join(NULL, &thr[i]));
/*
* NOTREACHED
*/
@@ -574,7 +611,7 @@ main(int argc, char *argv[])
verify_only = false;
working_dir = "WT_TEST.timestamp-abort";
- while ((ch = __wt_getopt(progname, argc, argv, "Ch:mT:t:vz")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "Ch:LmT:t:vz")) != EOF)
switch (ch) {
case 'C':
compat = true;
@@ -582,6 +619,9 @@ main(int argc, char *argv[])
case 'h':
working_dir = __wt_optarg;
break;
+ case 'L':
+ table_pfx = "lsm";
+ break;
case 'm':
inmem = true;
break;
@@ -603,7 +643,6 @@ main(int argc, char *argv[])
usage();
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
usage();
@@ -714,12 +753,18 @@ main(int argc, char *argv[])
/*
* Open a cursor on all the tables.
*/
+ testutil_check(__wt_snprintf(
+ buf, sizeof(buf), "%s:%s", table_pfx, uri_collection));
testutil_check(session->open_cursor(session,
- uri_collection, NULL, NULL, &cur_coll));
+ buf, NULL, NULL, &cur_coll));
+ testutil_check(__wt_snprintf(
+ buf, sizeof(buf), "%s:%s", table_pfx, uri_local));
testutil_check(session->open_cursor(session,
- uri_local, NULL, NULL, &cur_local));
+ buf, NULL, NULL, &cur_local));
+ testutil_check(__wt_snprintf(
+ buf, sizeof(buf), "%s:%s", table_pfx, uri_oplog));
testutil_check(session->open_cursor(session,
- uri_oplog, NULL, NULL, &cur_oplog));
+ buf, NULL, NULL, &cur_oplog));
/*
* Find the biggest stable timestamp value that was saved.
@@ -728,7 +773,7 @@ main(int argc, char *argv[])
if (use_ts) {
testutil_check(
conn->query_timestamp(conn, buf, "get=recovery"));
- sscanf(buf, "%" SCNx64, &stable_val);
+ testutil_assert(sscanf(buf, "%" SCNx64, &stable_val) == 1);
printf("Got stable_val %" PRIu64 "\n", stable_val);
}
diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
index f49ea10e5e1..661261eb1bb 100755
--- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
+++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/smoke.sh
@@ -5,6 +5,8 @@ set -e
# Smoke-test timestamp-abort as part of running "make check".
$TEST_WRAPPER ./test_timestamp_abort -t 10 -T 5
+#$TEST_WRAPPER ./test_timestamp_abort -t 10 -T 5 -L
$TEST_WRAPPER ./test_timestamp_abort -m -t 10 -T 5
+#$TEST_WRAPPER ./test_timestamp_abort -m -t 10 -T 5 -L
$TEST_WRAPPER ./test_timestamp_abort -C -t 10 -T 5
$TEST_WRAPPER ./test_timestamp_abort -C -m -t 10 -T 5
diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
index be28a8d9500..1ed1b6e8157 100644
--- a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
+++ b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
@@ -262,7 +262,6 @@ main(int argc, char *argv[])
usage();
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
usage();
diff --git a/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c b/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c
index 5e6ebc0ab1a..bb9f293edaa 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c
@@ -445,9 +445,9 @@ run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status)
}
/*
-* subtest_error_handler --
-* Error event handler.
-*/
+ * subtest_error_handler --
+ * Error event handler.
+ */
static int
subtest_error_handler(WT_EVENT_HANDLER *handler,
WT_SESSION *session, int error, const char *message)
diff --git a/src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c b/src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c
index 1efe22c1816..2dd0f86db20 100644
--- a/src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c
@@ -101,8 +101,7 @@ main(int argc, char *argv[])
"key_format=Q,value_format=u,"
"leaf_item_max=64M,leaf_page_max=32k,memory_page_max=1M"));
- testutil_check(
- session->open_cursor(session, uri, NULL, NULL, &c));
+ testutil_check(session->open_cursor(session, uri, NULL, NULL, &c));
/* Value is initialized with 'v' and has not significance to it. */
large_doc = dmalloc(DATASIZE);
@@ -137,6 +136,8 @@ main(int argc, char *argv[])
while (++j < MODIFY_COUNT) {
for (i = 0; i < NUM_DOCS; i++) {
/* Position the cursor. */
+ testutil_check(
+ session2->begin_transaction(session2, NULL));
c->set_key(c, i);
modify_entry.data.data =
"abcdefghijklmnopqrstuvwxyz";
@@ -146,6 +147,8 @@ main(int argc, char *argv[])
(void)alarm(1);
testutil_check(c->modify(c, &modify_entry, 1));
(void)alarm(0);
+ testutil_check(
+ session2->commit_transaction(session2, NULL));
}
/*
* Modify operations are done similar to append sequence.
diff --git a/src/third_party/wiredtiger/test/csuite/wt4117_checksum/main.c b/src/third_party/wiredtiger/test/csuite/wt4117_checksum/main.c
index 9a786e5d222..fff7b55bbf9 100644
--- a/src/third_party/wiredtiger/test/csuite/wt4117_checksum/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt4117_checksum/main.c
@@ -44,65 +44,51 @@ static void
run(void)
{
size_t len;
- uint32_t crc32c;
+ uint32_t crc32c, (*func)(const void *, size_t);
uint8_t *data;
/* Allocate aligned memory for the data. */
data = dcalloc(100, sizeof(uint8_t));
+ /* Get a pointer to the CRC32C function. */
+ func = wiredtiger_crc32c_func();
+
/*
* Some simple known checksums.
*/
len = 1;
- crc32c = wiredtiger_checksum_crc32c(data, len);
+ crc32c = func(data, len);
check(crc32c, (uint32_t)0x527d5351, len, "nul x1");
len = 2;
- crc32c = wiredtiger_checksum_crc32c(data, len);
+ crc32c = func(data, len);
check(crc32c, (uint32_t)0xf16177d2, len, "nul x2");
len = 3;
- crc32c = wiredtiger_checksum_crc32c(data, len);
+ crc32c = func(data, len);
check(crc32c, (uint32_t)0x6064a37a, len, "nul x3");
len = 4;
- crc32c = wiredtiger_checksum_crc32c(data, len);
+ crc32c = func(data, len);
check(crc32c, (uint32_t)0x48674bc7, len, "nul x4");
len = strlen("123456789");
memcpy(data, "123456789", len);
- crc32c = wiredtiger_checksum_crc32c(data, len);
+ crc32c = func(data, len);
check(crc32c, (uint32_t)0xe3069283, len, "known string #1");
len = strlen("The quick brown fox jumps over the lazy dog");
memcpy(data, "The quick brown fox jumps over the lazy dog", len);
- crc32c = wiredtiger_checksum_crc32c(data, len);
+ crc32c = func(data, len);
check(crc32c, (uint32_t)0x22620404, len, "known string #2");
free(data);
}
int
-main(int argc, char *argv[])
+main(void)
{
- TEST_OPTS *opts, _opts;
-
- opts = &_opts;
- memset(opts, 0, sizeof(*opts));
- testutil_check(testutil_parse_opts(argc, argv, opts));
- testutil_make_work_dir(opts->home);
-
- /*
- * The external API should work before the library configures itself,
- * run before and after calling wiredtiger_open().
- */
- run();
-
- testutil_check(
- wiredtiger_open(opts->home, NULL, "create", &opts->conn));
-
run();
- testutil_cleanup(opts);
return (EXIT_SUCCESS);
}
diff --git a/src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c b/src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c
new file mode 100644
index 00000000000..abce637e5c6
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c
@@ -0,0 +1,755 @@
+/*-
+ * Public Domain 2014-2018 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+#include <sys/wait.h>
+#include <signal.h>
+
+#define CKPT_DISTANCE 1
+#define CORRUPT "file:zzz-corrupt.SS"
+#define KEY "key"
+#define VALUE "value,value,value"
+
+#define DB0 "CKPT0"
+#define DB1 "CKPT1"
+#define DB2 "CKPT2"
+#define SAVE "SAVE"
+#define TEST "TEST"
+
+/*
+ * NOTE: This assumes the default page size of 4096. If that changes these
+ * sizes need to change along with it.
+ */
+#define APP_MD_SIZE 4096
+#define APP_BUF_SIZE (3 * 1024)
+#define APP_STR "long app metadata. "
+
+static uint64_t data_val;
+static const char *home;
+static bool test_abort = false;
+static bool test_out_of_sync = false;
+static WT_SESSION *wt_session;
+
+static int
+handle_message(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *message)
+{
+ (void)(handler);
+
+ (void)fprintf(stderr, "%s: %s\n",
+ message, session->strerror(session, error));
+ if (test_abort) {
+ fprintf(stderr, "Got unexpected error. Aborting\n");
+ abort();
+ }
+ return (0);
+}
+
+static WT_EVENT_HANDLER event_handler = {
+ handle_message,
+ NULL,
+ NULL,
+ NULL
+};
+
+typedef struct table_info {
+ const char *name;
+ const char *kvformat;
+ bool verified;
+} TABLE_INFO;
+
+/*
+ * byte_str --
+ * A byte-string version to find a sub-string. The metadata we read
+ * contains a lot of zeroes so we cannot use string-based functions.
+ */
+static uint8_t *
+byte_str(uint8_t *buf, size_t bufsize, const char *str)
+{
+ size_t buflen, slen;
+ uint8_t *end, *p, *s;
+ int c;
+
+ p = buf;
+ end = buf + bufsize;
+ s = NULL;
+ c = (int)str[0];
+ buflen = bufsize;
+ slen = strlen(str);
+ /*
+ * Find the first character and then compare.
+ */
+ while ((s = memchr(p, c, buflen)) != NULL) {
+ /*
+ * If we don't have enough buffer left to compare we do not
+ * have a match.
+ */
+ buflen = (size_t)(end - s);
+ if (buflen < slen)
+ return (NULL);
+ if (memcmp(s, str, slen) == 0)
+ return (s);
+ /*
+ * This one didn't match, increment in the buffer and find the
+ * next one.
+ */
+ ++s;
+ --buflen;
+ p = s;
+ }
+ return (NULL);
+}
+
+/*
+ * cursor_insert --
+ * Insert some data into a table.
+ */
+static void
+cursor_insert(const char *uri, uint64_t i)
+{
+ WT_CURSOR *cursor;
+ WT_ITEM vu;
+ char keybuf[100], valuebuf[100];
+ bool recno;
+
+ memset(&vu, 0, sizeof(vu));
+
+ /* Open a cursor. */
+ testutil_check(wt_session->open_cursor(
+ wt_session, uri, NULL, NULL, &cursor));
+ /* Operations change based on the key/value formats. */
+ recno = strcmp(cursor->key_format, "r") == 0;
+ if (recno)
+ cursor->set_key(cursor, i);
+ else {
+ testutil_check(__wt_snprintf(keybuf, sizeof(keybuf),
+ "%s-%" PRIu64, KEY, i));
+ cursor->set_key(cursor, keybuf);
+ }
+ strcpy(valuebuf, VALUE);
+ cursor->set_value(cursor, valuebuf);
+ testutil_check(cursor->insert(cursor));
+ testutil_check(cursor->close(cursor));
+}
+
+/*
+ * create_data --
+ * Create a table and insert a piece of data.
+ */
+static void
+create_data(TABLE_INFO *t)
+{
+ size_t len;
+ uint64_t i;
+ char buf[APP_BUF_SIZE], cfg[APP_MD_SIZE];
+
+ memset(buf, 0, sizeof(buf));
+ memset(cfg, 0, sizeof(cfg));
+
+ /*
+ * Create an app-specific metadata string that fills most of page
+ * so that each table in the metadata has its own page.
+ */
+ len = strlen(APP_STR);
+ for (i = 0; i + len < APP_BUF_SIZE; i += len)
+ testutil_check(__wt_snprintf(
+ &buf[i], APP_BUF_SIZE - i, "%s", APP_STR));
+ testutil_check(__wt_snprintf(cfg, sizeof(cfg),
+ "%s,app_metadata=\"%s\"", t->kvformat, buf));
+ testutil_check(wt_session->create(wt_session, t->name, cfg));
+ data_val = 1;
+ cursor_insert(t->name, data_val);
+}
+
+/*
+ * corrupt_metadata --
+ * Corrupt the metadata by scribbling on the "corrupt" URI string.
+ */
+static void
+corrupt_metadata(void)
+{
+ FILE *fp;
+ struct stat sb;
+ long off;
+ size_t meta_size;
+ bool corrupted;
+ uint8_t *buf, *corrupt;
+ char path[256];
+
+ /*
+ * Open the file, read its contents. Find the string "corrupt" and
+ * modify one byte at that offset. That will cause a checksum error
+ * when WiredTiger next reads it.
+ */
+ testutil_check(__wt_snprintf(
+ path, sizeof(path), "%s/%s", home, WT_METAFILE));
+ if ((fp = fopen(path, "r+")) == NULL)
+ testutil_die(errno, "fopen: %s", path);
+ testutil_check(fstat(fileno(fp), &sb));
+ meta_size = (size_t)sb.st_size;
+ buf = dcalloc(meta_size, 1);
+ if (fread(buf, 1, meta_size, fp) != meta_size)
+ testutil_die(errno, "fread: %" WT_SIZET_FMT, meta_size);
+ corrupted = false;
+ /*
+ * Corrupt all occurrences of the string in the file.
+ */
+ while ((corrupt = byte_str(buf, meta_size, CORRUPT)) != NULL) {
+ corrupted = true;
+ testutil_assert(*(char *)corrupt != 'X');
+ *(char *)corrupt = 'X';
+ off = (long)(corrupt - buf);
+ if (fseek(fp, off, SEEK_SET) != 0)
+ testutil_die(errno, "fseek: %ld", off);
+ if (fwrite("X", 1, 1, fp) != 1)
+ testutil_die(errno, "fwrite");
+ }
+ if (!corrupted)
+ testutil_die(errno, "corrupt string did not occur");
+ if (fclose(fp) != 0)
+ testutil_die(errno, "fclose");
+ free(buf);
+}
+
+/*
+ * file_exists --
+ * Return if the file exists.
+ */
+static int
+file_exists(const char *path)
+{
+ struct stat sb;
+
+ return (stat(path, &sb) == 0);
+}
+
+/*
+ * reset_verified --
+ * Reset the verified field in the table array.
+ */
+static void
+reset_verified(TABLE_INFO *tables)
+{
+ TABLE_INFO *t;
+
+ for (t = tables; t->name != NULL; t++)
+ t->verified = false;
+}
+
+/*
+ * verify_metadata --
+ * Verify all the tables expected are in the metadata. We expect all but
+ * the "corrupt" table name.
+ */
+static void
+verify_metadata(WT_CONNECTION *conn, TABLE_INFO *tables)
+{
+ TABLE_INFO *t;
+ WT_CURSOR *cursor;
+ int ret;
+ const char *kv;
+
+ /*
+ * Open a metadata cursor.
+ */
+ testutil_check(conn->open_session(conn, NULL, NULL, &wt_session));
+ testutil_check(wt_session->open_cursor(
+ wt_session, "metadata:", NULL, NULL, &cursor));
+ reset_verified(tables);
+
+ /*
+ * We have to walk the cursor and walk the tables to match up that
+ * the expected tables are in the metadata. It is not efficient, but
+ * the list of tables is small. Walk the cursor once and the array
+ * of tables each time.
+ */
+ while ((ret = cursor->next(cursor)) == 0) {
+ testutil_check(cursor->get_key(cursor, &kv));
+ for (t = tables; t->name != NULL; t++) {
+ if (strcmp(t->name, kv) == 0) {
+ testutil_assert(t->verified == false);
+ t->verified = true;
+ break;
+ }
+ }
+ }
+ testutil_assert(ret == WT_NOTFOUND);
+ testutil_check(cursor->close(cursor));
+ /*
+ * Any tables that were salvaged, make sure we can read the data.
+ * The corrupt table should never be salvaged.
+ */
+ for (t = tables; t->name != NULL; t++) {
+ if (strcmp(t->name, CORRUPT) == 0 && !test_out_of_sync)
+ testutil_assert(t->verified == false);
+ else if (t->verified != true)
+ printf("%s not seen in metadata\n", t->name);
+ else {
+ testutil_check(wt_session->open_cursor(
+ wt_session, t->name, NULL, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ testutil_check(cursor->get_value(cursor, &kv));
+ testutil_assert(strcmp(kv, VALUE) == 0);
+ }
+ testutil_assert(ret == WT_NOTFOUND);
+ testutil_check(cursor->close(cursor));
+ printf("%s metadata salvaged and data verified\n",
+ t->name);
+ }
+ }
+}
+
+/*
+ * copy_database --
+ * Copy the database to the specified suffix. In addition, make a copy
+ * of the metadata and turtle files in that new directory.
+ */
+static void
+copy_database(const char *sfx)
+{
+ WT_DECL_RET;
+ char buf[1024];
+
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "rm -rf ./%s.%s; mkdir ./%s.%s; "
+ "cp -p %s/* ./%s.%s",
+ home, sfx, home, sfx, home, home, sfx));
+ printf("copy: %s\n", buf);
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+
+ /*
+ * Now, in the copied directory make a save copy of the
+ * metadata and turtle files to move around and restore
+ * as needed during testing.
+ */
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "cp -p %s.%s/%s %s.%s/%s.%s",
+ home, sfx, WT_METADATA_TURTLE,
+ home, sfx, WT_METADATA_TURTLE, SAVE));
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "cp -p %s.%s/%s %s.%s/%s.%s",
+ home, sfx, WT_METAFILE,
+ home, sfx, WT_METAFILE, SAVE));
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+}
+
+/*
+ * move_data_ahead --
+ * Update the tables with new data and take a checkpoint twice.
+ * WiredTiger keeps the previous checkpoint so we do it twice so that
+ * the old checkpoint address no longer exists.
+ */
+static void
+move_data_ahead(TABLE_INFO *table_data)
+{
+ TABLE_INFO *t;
+ uint64_t i;
+
+ i = 0;
+ while (i < CKPT_DISTANCE) {
+ ++data_val;
+ for (t = table_data; t->name != NULL; t++)
+ cursor_insert(t->name, data_val);
+ ++i;
+ fprintf(stderr, "MOVE DATA: inserted %" PRIu64 ". CKPT.\n",
+ data_val);
+ testutil_check(wt_session->checkpoint(wt_session, NULL));
+ }
+}
+
+/*
+ * make_database_copies --
+ * Make copies of the database so that we can test various mix and match
+ * of turtle files and metadata files. We take some checkpoints and
+ * update the data too.
+ */
+static void
+make_database_copies(TABLE_INFO *table_data)
+{
+ /*
+ * If we're running an out-of-sync test, then we want to make copies
+ * of the turtle and metadata file, then checkpoint and again save a
+ * copy of the turtle file and the metadata file. Then we add more data
+ * and checkpoint again at least twice. Using the original and current
+ * files we can test various out of sync scenarios.
+ */
+ /*
+ * Take a checkpoint and make a copy.
+ */
+ testutil_check(wt_session->checkpoint(wt_session, NULL));
+ copy_database(DB0);
+
+ move_data_ahead(table_data);
+ copy_database(DB1);
+
+ move_data_ahead(table_data);
+ copy_database(DB2);
+}
+
+/*
+ * wt_open_corrupt --
+ * Call wiredtiger_open and expect a corruption error.
+ */
+static void wt_open_corrupt(const char *)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+static void
+wt_open_corrupt(const char *sfx)
+{
+ WT_CONNECTION *conn;
+ int ret;
+ char buf[1024];
+
+#ifdef HAVE_ATTACH
+ WT_UNUSED(buf);
+ WT_UNUSED(conn);
+ WT_UNUSED(ret);
+ WT_UNUSED(sfx);
+#else
+ conn = NULL;
+ if (sfx != NULL)
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "%s.%s", home, sfx));
+ else
+ testutil_check(__wt_snprintf(buf, sizeof(buf), "%s", home));
+ ret = wiredtiger_open(buf, &event_handler, NULL, &conn);
+ /*
+ * Not all out of sync combinations lead to corruption. We keep
+ * the previous checkpoint in the file so some combinations of
+ * future or old turtle files and metadata files will succeed.
+ */
+ if (ret != WT_TRY_SALVAGE && ret != 0)
+ fprintf(stderr,
+ "OPEN_CORRUPT: wiredtiger_open returned %d\n", ret);
+ testutil_assert(ret == WT_TRY_SALVAGE || ret == 0);
+#endif
+ exit (EXIT_SUCCESS);
+}
+
+static int
+open_with_error(const char *sfx)
+{
+ pid_t pid;
+ int status;
+
+ /*
+ * Call wiredtiger_open. We expect to see a corruption panic so we
+ * run this in a forked process. In diagnostic mode, the panic will
+ * cause an abort and core dump. So we want to catch that and
+ * continue running with salvage.
+ */
+ printf("=== open corrupt in child ===\n");
+ if ((pid = fork()) < 0)
+ testutil_die(errno, "fork");
+ if (pid == 0) { /* child */
+ wt_open_corrupt(sfx);
+ return (EXIT_SUCCESS);
+ }
+ /* parent */
+ if (waitpid(pid, &status, 0) == -1)
+ testutil_die(errno, "waitpid");
+ return (EXIT_SUCCESS);
+}
+
+static void
+open_with_salvage(const char *sfx, TABLE_INFO *table_data)
+{
+ WT_CONNECTION *conn;
+ char buf[1024];
+
+ printf("=== wt_open with salvage ===\n");
+ /*
+ * Then call wiredtiger_open with the salvage configuration setting.
+ * That should succeed. We should be able to then verify the contents
+ * of the metadata file.
+ */
+ test_abort = true;
+ if (sfx != NULL)
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "%s.%s", home, sfx));
+ else
+ testutil_check(__wt_snprintf(buf, sizeof(buf), "%s", home));
+ testutil_check(wiredtiger_open(buf,
+ &event_handler, "salvage=true", &conn));
+ testutil_assert(conn != NULL);
+ if (sfx != NULL)
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "%s.%s/%s", home, sfx, WT_METAFILE_SLVG));
+ else
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "%s/%s", home, WT_METAFILE_SLVG));
+ testutil_assert(file_exists(buf));
+
+ /*
+ * Confirm we salvaged the metadata file by looking for the saved
+ * copy of the original metadata.
+ */
+ printf("verify with salvaged connection\n");
+ verify_metadata(conn, &table_data[0]);
+ testutil_check(conn->close(conn, NULL));
+}
+
+static void
+open_normal(const char *sfx, TABLE_INFO *table_data)
+{
+ WT_CONNECTION *conn;
+ char buf[1024];
+
+ printf("=== wt_open normal ===\n");
+ if (sfx != NULL)
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "%s.%s", home, sfx));
+ else
+ testutil_check(__wt_snprintf(buf, sizeof(buf), "%s", home));
+ testutil_check(wiredtiger_open(buf, &event_handler, NULL, &conn));
+ verify_metadata(conn, &table_data[0]);
+ testutil_check(conn->close(conn, NULL));
+}
+
+static void
+run_all_verification(const char *sfx, TABLE_INFO *t)
+{
+ testutil_check(open_with_error(sfx));
+ open_with_salvage(sfx, t);
+ open_normal(sfx, t);
+}
+
+static void
+setup_database(const char *src, const char *turtle_dir, const char *meta_dir)
+{
+ int ret;
+ char buf[1024];
+
+ /*
+ * Remove the test home directory and copy the source to it.
+ * Then copy the saved turtle and/or metadata file from the
+ * given args.
+ */
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "rm -rf ./%s.%s; mkdir ./%s.%s; "
+ "cp -p %s.%s/* ./%s.%s",
+ home, TEST, home, TEST, home, src, home, TEST));
+ printf("copy: %s\n", buf);
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+
+ /* Copy turtle if given. */
+ if (turtle_dir != NULL) {
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "cp -p %s.%s/%s.%s %s.%s/%s",
+ home, turtle_dir, WT_METADATA_TURTLE, SAVE,
+ home, TEST, WT_METADATA_TURTLE));
+ printf("copy: %s\n", buf);
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+ }
+ /* Copy metadata if given. */
+ if (meta_dir != NULL) {
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "cp -p %s.%s/%s.%s %s.%s/%s",
+ home, meta_dir, WT_METAFILE, SAVE,
+ home, TEST, WT_METAFILE));
+ printf("copy: %s\n", buf);
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+ }
+}
+
+static void
+out_of_sync(TABLE_INFO *table_data)
+{
+ /*
+ * We have five directories:
+ * - The main database directory that we just corrupted/salvaged.
+ * - A .SAVE copy of the main directory that is coherent prior to
+ * corrupting. Essentially a copy of the second checkpoint dir.
+ * - A copy of the main directory before the first checkpoint. DB0
+ * - A copy of the main directory after the first checkpoint. DB1
+ * - A copy of the main directory after the second checkpoint. DB2
+ *
+ * We want to make a copy of a source directory and then copy a
+ * turtle or metadata file from another directory. Then detect the
+ * error, run with salvage and confirm.
+ */
+ /*
+ * Run in DB0, bring in future metadata from DB1.
+ */
+ test_out_of_sync = true;
+ printf(
+ "#\n# OUT OF SYNC: %s with future metadata from %s\n#\n", DB0, DB1);
+ setup_database(DB0, NULL, DB1);
+ run_all_verification(TEST, table_data);
+
+ /*
+ * Run in DB0, bring in future turtle file from DB1.
+ */
+ printf(
+ "#\n# OUT OF SYNC: %s with future turtle from %s\n#\n", DB0, DB1);
+ setup_database(DB0, DB1, NULL);
+ run_all_verification(TEST, table_data);
+
+ /*
+ * Run in DB1, bring in old metadata file from DB0.
+ */
+ printf("#\n# OUT OF SYNC: %s with old metadata from %s\n#\n", DB1, DB0);
+ setup_database(DB1, NULL, DB0);
+ run_all_verification(TEST, table_data);
+
+ /*
+ * Run in DB1, bring in old turtle file from DB0.
+ */
+ printf("#\n# OUT OF SYNC: %s with old turtle from %s\n#\n", DB1, DB0);
+ setup_database(DB1, DB0, NULL);
+ run_all_verification(TEST, table_data);
+
+ /*
+ * Run in DB1, bring in future metadata file from DB2.
+ */
+ printf(
+ "#\n# OUT OF SYNC: %s with future metadata from %s\n#\n", DB1, DB2);
+ setup_database(DB1, NULL, DB2);
+ run_all_verification(TEST, table_data);
+
+ /*
+ * Run in DB1, bring in future turtle file from DB2.
+ */
+ printf(
+ "#\n# OUT OF SYNC: %s with future turtle from %s\n#\n", DB1, DB2);
+ setup_database(DB1, DB2, NULL);
+ run_all_verification(TEST, table_data);
+
+ /*
+ * Run in DB2, bring in old metadata file from DB1.
+ */
+ printf("#\n# OUT OF SYNC: %s with old metadata from %s\n#\n", DB2, DB1);
+ setup_database(DB2, NULL, DB1);
+ run_all_verification(TEST, table_data);
+
+ /*
+ * Run in DB2, bring in old turtle file from DB1.
+ */
+ printf("#\n# OUT OF SYNC: %s with old turtle from %s\n#\n", DB2, DB1);
+ setup_database(DB2, DB1, NULL);
+ run_all_verification(TEST, table_data);
+}
+
+int
+main(int argc, char *argv[])
+{
+ /*
+ * Add a bunch of tables so that some of the metadata ends up on
+ * other pages and a good number of tables are available after
+ * salvage completes.
+ */
+ TABLE_INFO table_data[] = {
+ { "file:aaa-file.SS", "key_format=S,value_format=S", false },
+ { "file:bbb-file.rS", "key_format=r,value_format=S", false },
+ { "lsm:ccc-lsm.SS", "key_format=S,value_format=S", false },
+ { "table:ddd-table.SS", "key_format=S,value_format=S", false },
+ { "table:eee-table.rS", "key_format=r,value_format=S", false },
+ { "file:fff-file.SS", "key_format=S,value_format=S", false },
+ { "file:ggg-file.rS", "key_format=r,value_format=S", false },
+ { "lsm:hhh-lsm.SS", "key_format=S,value_format=S", false },
+ { "table:iii-table.SS", "key_format=S,value_format=S", false },
+ { "table:jjj-table.rS", "key_format=r,value_format=S", false },
+ { CORRUPT, "key_format=S,value_format=S", false },
+ { NULL, NULL, false }
+ };
+ TABLE_INFO *t;
+ TEST_OPTS *opts, _opts;
+ int ret;
+ char buf[1024];
+
+ opts = &_opts;
+ memset(opts, 0, sizeof(*opts));
+ testutil_check(testutil_parse_opts(argc, argv, opts));
+ /*
+ * Set a global. We use this everywhere.
+ */
+ home = opts->home;
+ testutil_make_work_dir(home);
+
+ testutil_check(
+ wiredtiger_open(home, &event_handler, "create", &opts->conn));
+
+ testutil_check(opts->conn->open_session(
+ opts->conn, NULL, NULL, &wt_session));
+ /*
+ * Create a bunch of different tables.
+ */
+ for (t = table_data; t->name != NULL; t++)
+ create_data(t);
+
+ /*
+ * Take some checkpoints and add more data for out of sync testing.
+ */
+ make_database_copies(table_data);
+ testutil_check(opts->conn->close(opts->conn, NULL));
+ opts->conn = NULL;
+
+ /*
+ * Make copy of original directory.
+ */
+ copy_database(SAVE);
+ /*
+ * Damage/corrupt WiredTiger.wt.
+ */
+ printf("corrupt metadata\n");
+ corrupt_metadata();
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "cp -p %s/WiredTiger.wt ./%s.SAVE/WiredTiger.wt.CORRUPT",
+ home, home));
+ printf("copy: %s\n", buf);
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+ run_all_verification(NULL, &table_data[0]);
+
+ out_of_sync(&table_data[0]);
+
+ /*
+ * We need to set up the string before we clean up
+ * the structure. Then after the clean up we will
+ * run this command.
+ */
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "rm -rf core* %s*", home));
+ testutil_cleanup(opts);
+
+ /*
+ * We've created a lot of extra directories and possibly some core
+ * files from child process aborts. Manually clean them up.
+ */
+ printf("cleanup and remove: %s\n", buf);
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+
+ return (EXIT_SUCCESS);
+}
diff --git a/src/third_party/wiredtiger/test/cursor_order/cursor_order.c b/src/third_party/wiredtiger/test/cursor_order/cursor_order.c
index b2e847f880c..2cdbe4cb840 100644
--- a/src/third_party/wiredtiger/test/cursor_order/cursor_order.c
+++ b/src/third_party/wiredtiger/test/cursor_order/cursor_order.c
@@ -132,7 +132,6 @@ main(int argc, char *argv[])
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
return (usage());
diff --git a/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c b/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c
index 2ff4b5ed5fb..d110d513bfb 100644
--- a/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c
+++ b/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c
@@ -120,7 +120,7 @@ ops_start(SHARED_CONFIG *cfg)
/* Wait for the threads. */
for (i = 0; i < cfg->reverse_scanners + cfg->append_inserters; ++i)
- testutil_check(__wt_thread_join(NULL, tids[i]));
+ testutil_check(__wt_thread_join(NULL, &tids[i]));
(void)gettimeofday(&stop, NULL);
seconds = (stop.tv_sec - start.tv_sec) +
diff --git a/src/third_party/wiredtiger/test/fops/fops.c b/src/third_party/wiredtiger/test/fops/fops.c
index 99c333be4a0..96a60acbfa5 100644
--- a/src/third_party/wiredtiger/test/fops/fops.c
+++ b/src/third_party/wiredtiger/test/fops/fops.c
@@ -69,7 +69,7 @@ fop_start(u_int nthreads)
/* Wait for the threads. */
for (i = 0; i < nthreads; ++i)
- testutil_check(__wt_thread_join(NULL, tids[i]));
+ testutil_check(__wt_thread_join(NULL, &tids[i]));
(void)gettimeofday(&stop, NULL);
seconds = (stop.tv_sec - start.tv_sec) +
diff --git a/src/third_party/wiredtiger/test/fops/t.c b/src/third_party/wiredtiger/test/fops/t.c
index e748918a08a..d82b2c3f3f1 100644
--- a/src/third_party/wiredtiger/test/fops/t.c
+++ b/src/third_party/wiredtiger/test/fops/t.c
@@ -112,7 +112,6 @@ main(int argc, char *argv[])
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
return (usage());
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 9df743cf056..33ef6864b64 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -615,10 +615,16 @@ config_lsm_reset(void)
/*
* LSM doesn't currently play nicely with timestamps, don't choose the
- * pair unless forced to. Remove this code with WT-4067.
+ * pair unless forced to. If we turn off timestamps, make sure we turn
+ * off prepare as well, it requires timestamps. Remove this code with
+ * WT-4067.
+ *
*/
- if (!config_is_perm("transaction_timestamps"))
+ if (!config_is_perm("prepare") &&
+ !config_is_perm("transaction_timestamps")) {
+ config_single("prepare=off", 0);
config_single("transaction_timestamps=off", 0);
+ }
}
/*
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h
index 51dc906465a..70e8165e97d 100644
--- a/src/third_party/wiredtiger/test/format/config.h
+++ b/src/third_party/wiredtiger/test/format/config.h
@@ -331,41 +331,45 @@ static CONFIG c[] = {
C_IGNORE, 0, 0, UINT_MAX, &g.c_timer, NULL },
{ "timing_stress_checkpoint",
- "configure slow checkpoints", /* 2% */
+ "stress checkpoints", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_checkpoint, NULL },
{ "timing_stress_lookaside_sweep",
- "configure slow lookaside sweep", /* 2% */
+ "stress lookaside sweep", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_lookaside_sweep, NULL },
{ "timing_stress_split_1",
- "configure slow splits (#1)", /* 2% */
+ "stress splits (#1)", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_split_1, NULL },
{ "timing_stress_split_2",
- "configure slow splits (#2)", /* 2% */
+ "stress splits (#2)", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_split_2, NULL },
{ "timing_stress_split_3",
- "configure slow splits (#3)", /* 2% */
+ "stress splits (#3)", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_split_3, NULL },
{ "timing_stress_split_4",
- "configure slow splits (#4)", /* 2% */
+ "stress splits (#4)", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_split_4, NULL },
{ "timing_stress_split_5",
- "configure slow splits (#5)", /* 2% */
+ "stress splits (#5)", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_split_5, NULL },
{ "timing_stress_split_6",
- "configure slow splits (#6)", /* 2% */
+ "stress splits (#6)", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_split_6, NULL },
{ "timing_stress_split_7",
- "configure slow splits (#7)", /* 2% */
+ "stress splits (#7)", /* 2% */
C_BOOL, 2, 0, 0, &g.c_timing_stress_split_7, NULL },
+ { "timing_stress_split_8",
+ "stress splits (#8)", /* 2% */
+ C_BOOL, 2, 0, 0, &g.c_timing_stress_split_8, NULL },
+
{ "transaction_timestamps", /* 10% */
"enable transaction timestamp support",
C_BOOL, 10, 0, 0, &g.c_txn_timestamps, NULL },
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 0eca6657dd9..a83aa0d2dcb 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -122,7 +122,16 @@ typedef struct {
WT_RAND_STATE rnd; /* Global RNG state */
- pthread_rwlock_t prepare_lock; /* Prepare running */
+ /*
+ * Prepare will return an error if the prepare timestamp is less than
+ * any active read timestamp. Lock across allocating prepare and read
+ * timestamps.
+ *
+ * We get the last committed timestamp periodically in order to update
+ * the oldest timestamp, that requires locking out transactional ops
+ * that set a timestamp.
+ */
+ pthread_rwlock_t ts_lock;
uint64_t timestamp; /* Counter for timestamps */
@@ -221,6 +230,7 @@ typedef struct {
uint32_t c_timing_stress_split_5;
uint32_t c_timing_stress_split_6;
uint32_t c_timing_stress_split_7;
+ uint32_t c_timing_stress_split_8;
uint32_t c_truncate;
uint32_t c_txn_freq;
uint32_t c_txn_timestamps;
@@ -281,9 +291,6 @@ typedef struct {
WT_RAND_STATE rnd; /* thread RNG state */
- uint64_t commit_timestamp; /* last committed timestamp */
- uint64_t read_timestamp; /* read timestamp */
-
volatile bool quit; /* thread should quit */
uint64_t ops; /* total operations */
@@ -380,3 +387,27 @@ mmrand(WT_RAND_STATE *rnd, u_int min, u_int max)
v += min;
return (v);
}
+
+static inline void
+random_sleep(WT_RAND_STATE *rnd, u_int max_seconds)
+{
+ uint64_t i, micro_seconds;
+
+ /*
+ * We need a fast way to choose a sleep time. We want to sleep a short
+ * period most of the time, but occasionally wait longer. Divide the
+ * maximum period of time into 10 buckets (where bucket 0 doesn't sleep
+ * at all), and roll dice, advancing to the next bucket 50% of the time.
+ * That means we'll hit the maximum roughly every 1K calls.
+ */
+ for (i = 0;;)
+ if (rng(rnd) & 0x1 || ++i > 9)
+ break;
+
+ if (i == 0)
+ __wt_yield();
+ else {
+ micro_seconds = (uint64_t)max_seconds * WT_MILLION;
+ __wt_sleep(0, i * (micro_seconds / 10));
+ }
+}
diff --git a/src/third_party/wiredtiger/test/format/lrt.c b/src/third_party/wiredtiger/test/format/lrt.c
index 9d99933ef64..31c5de93870 100644
--- a/src/third_party/wiredtiger/test/format/lrt.c
+++ b/src/third_party/wiredtiger/test/format/lrt.c
@@ -110,8 +110,15 @@ lrt(void *arg)
*/
testutil_check(session->snapshot(session, "name=test"));
__wt_sleep(1, 0);
- testutil_check(session->begin_transaction(
- session, "snapshot=test"));
+ /*
+ * Keep trying to start a new transaction if it's
+ * timing out - we know there aren't any resources
+ * pinned so it should succeed eventually.
+ */
+ while ((ret = session->begin_transaction(
+ session, "snapshot=test")) == WT_CACHE_FULL)
+ ;
+ testutil_check(ret);
testutil_check(session->snapshot(
session, "drop=(all)"));
testutil_check(session->commit_transaction(
@@ -123,8 +130,10 @@ lrt(void *arg)
* positioned. As soon as the cursor loses its position
* a new snapshot will be allocated.
*/
- testutil_check(session->begin_transaction(
- session, "isolation=snapshot"));
+ while ((ret = session->begin_transaction(
+ session, "snapshot=snapshot")) == WT_CACHE_FULL)
+ ;
+ testutil_check(ret);
/* Read a record at the end of the table. */
do {
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index 7d08dbd8bd8..f92f438a4f1 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -207,7 +207,7 @@ wts_ops(int lastrun)
case TINFO_COMPLETE:
tinfo->state = TINFO_JOINED;
testutil_check(
- __wt_thread_join(NULL, tinfo->tid));
+ __wt_thread_join(NULL, &tinfo->tid));
break;
case TINFO_JOINED:
break;
@@ -252,17 +252,17 @@ wts_ops(int lastrun)
/* Wait for the other threads. */
g.workers_finished = true;
if (g.c_alter)
- testutil_check(__wt_thread_join(NULL, alter_tid));
+ testutil_check(__wt_thread_join(NULL, &alter_tid));
if (g.c_backups)
- testutil_check(__wt_thread_join(NULL, backup_tid));
+ testutil_check(__wt_thread_join(NULL, &backup_tid));
if (g.c_checkpoint_flag == CHECKPOINT_ON)
- testutil_check(__wt_thread_join(NULL, checkpoint_tid));
+ testutil_check(__wt_thread_join(NULL, &checkpoint_tid));
if (g.c_compact)
- testutil_check(__wt_thread_join(NULL, compact_tid));
+ testutil_check(__wt_thread_join(NULL, &compact_tid));
if (!SINGLETHREADED && g.c_long_running_txn)
- testutil_check(__wt_thread_join(NULL, lrt_tid));
+ testutil_check(__wt_thread_join(NULL, &lrt_tid));
if (g.c_txn_timestamps)
- testutil_check(__wt_thread_join(NULL, timestamp_tid));
+ testutil_check(__wt_thread_join(NULL, &timestamp_tid));
g.workers_finished = false;
if (g.logging != 0) {
@@ -353,8 +353,15 @@ snap_check(WT_CURSOR *cursor,
testutil_assert(start->keyno != 0);
}
- /* Check for subsequent changes to this record. */
+ /*
+ * Check for subsequent changes to this record. If we find a
+ * read, don't treat it was a subsequent change, that way we
+ * verify the results of the change as well as the results of
+ * the read.
+ */
for (p = start + 1; p < stop; ++p) {
+ if (p->op == READ)
+ continue;
if (p->keyno == start->keyno)
break;
@@ -456,10 +463,10 @@ snap_check(WT_CURSOR *cursor,
print_item_data(
"expected", start->vdata, start->vsize);
if (ret == WT_NOTFOUND)
- fprintf(stderr, "\t found {deleted}\n");
+ fprintf(stderr, "found {deleted}\n");
else
print_item_data(
- " found", value->data, value->size);
+ "found", value->data, value->size);
testutil_die(ret,
"snapshot-isolation: %.*s search mismatch",
@@ -476,10 +483,10 @@ snap_check(WT_CURSOR *cursor,
print_item_data(
"expected", start->vdata, start->vsize);
if (ret == WT_NOTFOUND)
- fprintf(stderr, "\t found {deleted}\n");
+ fprintf(stderr, "found {deleted}\n");
else
print_item_data(
- " found", value->data, value->size);
+ "found", value->data, value->size);
testutil_die(ret,
"snapshot-isolation: %" PRIu64 " search mismatch",
@@ -497,12 +504,10 @@ snap_check(WT_CURSOR *cursor,
static void
begin_transaction(TINFO *tinfo, WT_SESSION *session, u_int *iso_configp)
{
+ WT_DECL_RET;
u_int v;
+ char buf[64];
const char *config;
- char config_buf[64];
- bool locked;
-
- locked = false;
if ((v = g.c_isolation_flag) == ISOLATION_RANDOM)
v = mmrand(&tinfo->rnd, 1, 3);
@@ -523,64 +528,35 @@ begin_transaction(TINFO *tinfo, WT_SESSION *session, u_int *iso_configp)
}
*iso_configp = v;
- testutil_check(session->begin_transaction(session, config));
+ /*
+ * Keep trying to start a new transaction if it's timing out - we know
+ * there aren't any resources pinned so it should succeed eventually.
+ */
+ while ((ret =
+ session->begin_transaction(session, config)) == WT_CACHE_FULL)
+ ;
+ testutil_check(ret);
if (v == ISOLATION_SNAPSHOT && g.c_txn_timestamps) {
- /* Avoid starting a new reader when a prepare is in progress. */
- if (g.c_prepare) {
- testutil_check(pthread_rwlock_rdlock(&g.prepare_lock));
- locked = true;
- }
-
/*
- * Set the thread's read timestamp to the current value before
- * allocating a new read timestamp. This guarantees the oldest
- * timestamp won't move past the allocated timestamp before the
- * transaction uses it.
+ * Prepare returns an error if the prepare timestamp is less
+ * than any active read timestamp, single-thread transaction
+ * prepare and begin.
+ *
+ * Lock out the oldest timestamp update.
*/
- tinfo->read_timestamp = g.timestamp;
- tinfo->read_timestamp = __wt_atomic_addv64(&g.timestamp, 1);
- testutil_check(__wt_snprintf(
- config_buf, sizeof(config_buf),
- "read_timestamp=%" PRIx64, tinfo->read_timestamp));
-
- testutil_check(
- session->timestamp_transaction(session, config_buf));
+ testutil_check(pthread_rwlock_wrlock(&g.ts_lock));
- /*
- * It's OK for the oldest timestamp to move past a running
- * query, clear the thread's read timestamp, it no longer needs
- * to be pinned.
- */
- tinfo->read_timestamp = 0;
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "read_timestamp=%" PRIx64,
+ __wt_atomic_addv64(&g.timestamp, 1)));
+ testutil_check(session->timestamp_transaction(session, buf));
- if (locked)
- testutil_check(pthread_rwlock_unlock(&g.prepare_lock));
+ testutil_check(pthread_rwlock_unlock(&g.ts_lock));
}
}
/*
- * set_commit_timestamp --
- * Return the next commit timestamp.
- */
-static uint64_t
-set_commit_timestamp(TINFO *tinfo)
-{
- /*
- * If the thread's commit timestamp hasn't been set yet, update it with
- * the current value to prevent the oldest timestamp moving past our
- * allocated timestamp before the commit completes. The sequence where
- * it's already set is after prepare, in which case we can't let the
- * oldest timestamp move past either the prepare or commit timestamps.
- *
- * Note the barrier included in the atomic call ensures proper ordering.
- */
- if (tinfo->commit_timestamp == 0)
- tinfo->commit_timestamp = g.timestamp;
- return (__wt_atomic_addv64(&g.timestamp, 1));
-}
-
-/*
* commit_transaction --
* Commit a transaction.
*/
@@ -588,25 +564,20 @@ static void
commit_transaction(TINFO *tinfo, WT_SESSION *session)
{
uint64_t ts;
- char config_buf[64];
+ char buf[64];
++tinfo->commit;
if (g.c_txn_timestamps) {
- ts = set_commit_timestamp(tinfo);
+ /* Lock out the oldest timestamp update. */
+ testutil_check(pthread_rwlock_wrlock(&g.ts_lock));
+
+ ts = __wt_atomic_addv64(&g.timestamp, 1);
testutil_check(__wt_snprintf(
- config_buf, sizeof(config_buf),
- "commit_timestamp=%" PRIx64, ts));
- testutil_check(
- session->timestamp_transaction(session, config_buf));
- /*
- * Clear the thread's active timestamp: it no longer needs to
- * be pinned. Don't let the compiler re-order this statement,
- * if we were to race with the timestamp thread, it might see
- * our thread update before the commit_timestamp is set for the
- * transaction.
- */
- WT_PUBLISH(tinfo->commit_timestamp, 0);
+ buf, sizeof(buf), "commit_timestamp=%" PRIx64, ts));
+ testutil_check(session->timestamp_transaction(session, buf));
+
+ testutil_check(pthread_rwlock_unlock(&g.ts_lock));
}
testutil_check(session->commit_transaction(session, NULL));
}
@@ -621,15 +592,6 @@ rollback_transaction(TINFO *tinfo, WT_SESSION *session)
++tinfo->rollback;
testutil_check(session->rollback_transaction(session, NULL));
-
- /*
- * Clear the thread's active timestamp: it no longer needs to be pinned.
- * Don't let the compiler re-order this statement, if we were to race
- * with the timestamp thread, it might see our thread update before the
- * transaction commit completes.
- */
- if (g.c_txn_timestamps)
- WT_PUBLISH(tinfo->commit_timestamp, 0);
}
/*
@@ -641,34 +603,28 @@ prepare_transaction(TINFO *tinfo, WT_SESSION *session)
{
WT_DECL_RET;
uint64_t ts;
- char config_buf[64];
+ char buf[64];
- /* Skip if no timestamp has yet been set. */
- if (g.timestamp == 0)
- return (0);
++tinfo->prepare;
/*
- * Synchronize prepare call with begin transaction to prevent a new
- * reader creeping in.
- */
- testutil_check(pthread_rwlock_wrlock(&g.prepare_lock));
-
- /*
* Prepare timestamps must be less than or equal to the eventual commit
* timestamp. Set the prepare timestamp to whatever the global value is
* now. The subsequent commit will increment it, ensuring correctness.
*
- * Prepare will return error if the prepare timestamp is less than any
- * active read timestamp.
+ * Prepare returns an error if the prepare timestamp is less than any
+ * active read timestamp, single-thread transaction prepare and begin.
+ *
+ * Lock out the oldest timestamp update.
*/
- ts = set_commit_timestamp(tinfo);
- testutil_check(__wt_snprintf(
- config_buf, sizeof(config_buf), "prepare_timestamp=%" PRIx64, ts));
- ret = session->prepare_transaction(session, config_buf);
+ testutil_check(pthread_rwlock_wrlock(&g.ts_lock));
- testutil_check(pthread_rwlock_unlock(&g.prepare_lock));
+ ts = __wt_atomic_addv64(&g.timestamp, 1);
+ testutil_check(__wt_snprintf(
+ buf, sizeof(buf), "prepare_timestamp=%" PRIx64, ts));
+ ret = session->prepare_transaction(session, buf);
+ testutil_check(pthread_rwlock_unlock(&g.ts_lock));
return (ret);
}
@@ -923,10 +879,11 @@ ops(void *arg)
break;
case MODIFY:
/*
- * Change modify into update if in a read-uncommitted
- * transaction, modify isn't supported in that case.
+ * Change modify into update if not in a transaction
+ * or in a read-uncommitted transaction, modify isn't
+ * supported in those cases.
*/
- if (iso_config == ISOLATION_READ_UNCOMMITTED)
+ if (!intxn || iso_config == ISOLATION_READ_UNCOMMITTED)
goto update_instead_of_chosen_op;
++tinfo->update;
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index d7b9add1f14..a1e9736ab60 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -104,7 +104,6 @@ main(int argc, char *argv[])
default:
usage();
}
- argc -= __wt_optind;
argv += __wt_optind;
/* Initialize the global RNG. */
@@ -170,7 +169,7 @@ main(int argc, char *argv[])
testutil_check(pthread_rwlock_init(&g.append_lock, NULL));
testutil_check(pthread_rwlock_init(&g.backup_lock, NULL));
testutil_check(pthread_rwlock_init(&g.death_lock, NULL));
- testutil_check(pthread_rwlock_init(&g.prepare_lock, NULL));
+ testutil_check(pthread_rwlock_init(&g.ts_lock, NULL));
printf("%s: process %" PRIdMAX "\n", progname, (intmax_t)getpid());
while (++g.run_cnt <= g.c_runs || g.c_runs == 0 ) {
@@ -268,7 +267,7 @@ main(int argc, char *argv[])
testutil_check(pthread_rwlock_destroy(&g.append_lock));
testutil_check(pthread_rwlock_destroy(&g.backup_lock));
testutil_check(pthread_rwlock_destroy(&g.death_lock));
- testutil_check(pthread_rwlock_destroy(&g.prepare_lock));
+ testutil_check(pthread_rwlock_destroy(&g.ts_lock));
config_clear();
diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c
index cf75c98129a..23b198c05af 100644
--- a/src/third_party/wiredtiger/test/format/util.c
+++ b/src/third_party/wiredtiger/test/format/util.c
@@ -588,72 +588,47 @@ WT_THREAD_RET
timestamp(void *arg)
{
WT_CONNECTION *conn;
+ WT_DECL_RET;
WT_SESSION *session;
- TINFO **tinfo_list, *tinfo;
- time_t last, now;
- uint64_t oldest_timestamp, this_ts, usecs;
- uint32_t i;
- char config_buf[64];
-
- tinfo_list = arg;
+ char buf[64];
+ bool done;
+ (void)(arg);
conn = g.wts_conn;
- testutil_check(conn->open_session(conn, NULL, NULL, &session));
- __wt_seconds((WT_SESSION_IMPL *)session, &last);
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
- /*
- * Update the oldest timestamp every 100 transactions, but at least
- * once every 15 seconds.
- */
- while (!g.workers_finished) {
- /*
- * Find the lowest in-use timestamp. The timestamp thread starts
- * before the operational threads, wait for them.
- */
- oldest_timestamp = g.timestamp;
- for (i = 0; i < g.c_threads; ++i) {
- tinfo = tinfo_list[i];
- this_ts = tinfo->commit_timestamp;
- if (this_ts != 0 && this_ts < oldest_timestamp)
- oldest_timestamp = this_ts;
- this_ts = tinfo->read_timestamp;
- if (this_ts != 0 && this_ts < oldest_timestamp)
- oldest_timestamp = this_ts;
- }
+ testutil_check(
+ __wt_snprintf(buf, sizeof(buf), "%s", "oldest_timestamp="));
+ /* Update the oldest timestamp at least once every 15 seconds. */
+ done = false;
+ do {
/*
- * Don't try to update until we've committed some transactions
- * with timestamps.
+ * Do a final bump of the oldest timestamp as part of shutting
+ * down the worker threads, otherwise recent operations can
+ * prevent verify from running.
*/
- if (oldest_timestamp == 0) {
- __wt_sleep(1, 0);
- continue;
- }
+ if (g.workers_finished)
+ done = true;
+ else
+ random_sleep(&g.rnd, 15);
/*
- * If less than 100 transactions out of date, wait up to 15
- * seconds before updating.
+ * Lock out transaction timestamp operations. The lock acts as a
+ * barrier ensuring we've checked if the workers have finished,
+ * we don't want that line reordered.
*/
- WT_READ_BARRIER();
- testutil_assert(oldest_timestamp <= g.timestamp);
- if (g.timestamp - oldest_timestamp < 100) {
- __wt_seconds((WT_SESSION_IMPL *)session, &now);
- if (difftime(now, last) < 15) {
- __wt_sleep(1, 0);
- continue;
- }
- }
+ testutil_check(pthread_rwlock_wrlock(&g.ts_lock));
- testutil_check(__wt_snprintf(
- config_buf, sizeof(config_buf),
- "oldest_timestamp=%" PRIx64, oldest_timestamp));
- testutil_check(conn->set_timestamp(conn, config_buf));
- __wt_seconds((WT_SESSION_IMPL *)session, &last);
+ ret = conn->query_timestamp(conn,
+ buf + strlen("oldest_timestamp="), "get=all_committed");
+ testutil_assert(ret == 0 || ret == WT_NOTFOUND);
+ if (ret == 0)
+ testutil_check(conn->set_timestamp(conn, buf));
- usecs = mmrand(NULL, 5, 40);
- __wt_sleep(0, usecs);
- }
+ testutil_check(pthread_rwlock_unlock(&g.ts_lock));
+ } while (!done);
testutil_check(session->close(session, NULL));
return (WT_THREAD_RET_VALUE);
diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c
index dd87adeae56..30b910435d4 100644
--- a/src/third_party/wiredtiger/test/format/wts.c
+++ b/src/third_party/wiredtiger/test/format/wts.c
@@ -262,6 +262,8 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp)
CONFIG_APPEND(p, ",split_6");
if (g.c_timing_stress_split_7)
CONFIG_APPEND(p, ",split_7");
+ if (g.c_timing_stress_split_8)
+ CONFIG_APPEND(p, ",split_8");
CONFIG_APPEND(p, "]");
/* Extensions. */
@@ -540,7 +542,6 @@ wts_verify(const char *tag)
WT_CONNECTION *conn;
WT_DECL_RET;
WT_SESSION *session;
- char config_buf[64];
if (g.c_verify == 0)
return;
@@ -553,17 +554,6 @@ wts_verify(const char *tag)
(void)g.wt_api->msg_printf(g.wt_api, session,
"=============== verify start ===============");
- if (g.c_txn_timestamps && g.timestamp > 0) {
- /*
- * Bump the oldest timestamp, otherwise recent operations can
- * prevent verify from running.
- */
- testutil_check(__wt_snprintf(
- config_buf, sizeof(config_buf),
- "oldest_timestamp=%" PRIx64, g.timestamp));
- testutil_check(conn->set_timestamp(conn, config_buf));
- }
-
/*
* Verify can return EBUSY if the handle isn't available. Don't yield
* and retry, in the case of LSM, the handle may not be available for
diff --git a/src/third_party/wiredtiger/test/huge/huge.c b/src/third_party/wiredtiger/test/huge/huge.c
index 11d6bbdc312..18bf873ff0b 100644
--- a/src/third_party/wiredtiger/test/huge/huge.c
+++ b/src/third_party/wiredtiger/test/huge/huge.c
@@ -171,7 +171,6 @@ main(int argc, char *argv[])
usage();
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
usage();
diff --git a/src/third_party/wiredtiger/test/manydbs/manydbs.c b/src/third_party/wiredtiger/test/manydbs/manydbs.c
index a6574d21d72..daf19302828 100644
--- a/src/third_party/wiredtiger/test/manydbs/manydbs.c
+++ b/src/third_party/wiredtiger/test/manydbs/manydbs.c
@@ -148,7 +148,6 @@ main(int argc, char *argv[])
usage();
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
usage();
diff --git a/src/third_party/wiredtiger/test/readonly/readonly.c b/src/third_party/wiredtiger/test/readonly/readonly.c
index b5342831320..7a84f662029 100644
--- a/src/third_party/wiredtiger/test/readonly/readonly.c
+++ b/src/third_party/wiredtiger/test/readonly/readonly.c
@@ -194,7 +194,6 @@ main(int argc, char *argv[])
usage();
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
usage();
diff --git a/src/third_party/wiredtiger/test/salvage/salvage.c b/src/third_party/wiredtiger/test/salvage/salvage.c
index 9c8a90d37b9..3517405c6ac 100644
--- a/src/third_party/wiredtiger/test/salvage/salvage.c
+++ b/src/third_party/wiredtiger/test/salvage/salvage.c
@@ -97,7 +97,6 @@ main(int argc, char *argv[])
return (usage());
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
return (usage());
@@ -670,7 +669,7 @@ empty(int cnt)
if (page_type == WT_PAGE_COL_FIX)
for (i = 0; i < cnt; ++i)
- fputs("\\00\n", res_fp);
+ CHECK(fputs("\\00\n", res_fp));
}
/*
diff --git a/src/third_party/wiredtiger/test/suite/run.py b/src/third_party/wiredtiger/test/suite/run.py
index 6b668ad3e07..74fa259c3c4 100644
--- a/src/third_party/wiredtiger/test/suite/run.py
+++ b/src/third_party/wiredtiger/test/suite/run.py
@@ -238,7 +238,7 @@ if __name__ == '__main__':
tests = unittest.TestSuite()
# Turn numbers and ranges into test module names
- preserve = timestamp = debug = dryRun = gdbSub = longtest = False
+ preserve = timestamp = debug = dryRun = gdbSub = lldbSub = longtest = False
parallel = 0
configfile = None
configwrite = False
@@ -269,6 +269,9 @@ if __name__ == '__main__':
if option == '-gdb' or option == 'g':
gdbSub = True
continue
+ if option == '-lldb':
+ lldbSub = True
+ continue
if option == '-help' or option == 'h':
usage()
sys.exit(0)
@@ -323,7 +326,7 @@ if __name__ == '__main__':
# All global variables should be set before any test classes are loaded.
# That way, verbose printing can be done at the class definition level.
- wttest.WiredTigerTestCase.globalSetup(preserve, timestamp, gdbSub,
+ wttest.WiredTigerTestCase.globalSetup(preserve, timestamp, gdbSub, lldbSub,
verbose, wt_builddir, dirarg,
longtest)
diff --git a/src/third_party/wiredtiger/test/suite/suite_subprocess.py b/src/third_party/wiredtiger/test/suite/suite_subprocess.py
index c2e9d99f691..4b0f6823e06 100644..100755
--- a/src/third_party/wiredtiger/test/suite/suite_subprocess.py
+++ b/src/third_party/wiredtiger/test/suite/suite_subprocess.py
@@ -26,7 +26,7 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
-import os, subprocess, sys
+import os, re, subprocess, sys
from run import wt_builddir
from wttest import WiredTigerTestCase
@@ -79,23 +79,51 @@ class suite_subprocess:
print '********************************'
self.fail('ERROR found in output file: ' + filename)
+ # If the string is of the form '/.../', then return just the embedded
+ # pattern, otherwise, return None
+ def convert_to_pattern(self, s):
+ if len(s) >= 2 and s[0] == '/' and s[-1] == '/':
+ return s[1:-1]
+ else:
+ return None
+
def check_file_content(self, filename, expect):
with open(filename, 'r') as f:
got = f.read(len(expect) + 100)
self.assertEqual(got, expect, filename + ': does not contain expected:\n\'' + expect + '\', but contains:\n\'' + got + '\'.')
- def check_file_contains(self, filename, expect):
+ def check_file_contains_one_of(self, filename, expectlist):
"""
Check that the file contains the expected string in the first 100K bytes
"""
maxbytes = 1024*100
with open(filename, 'r') as f:
got = f.read(maxbytes)
- if not (expect in got):
+ found = False
+ for expect in expectlist:
+ pat = self.convert_to_pattern(expect)
+ if pat == None:
+ if expect in got:
+ found = True
+ break
+ else:
+ if re.search(pat, got):
+ found = True
+ break
+ if not found:
+ if len(expectlist) == 1:
+ expect = '\'' + expectlist[0] + '\''
+ else:
+ expect = str(expectlist)
+ gotstr = '\'' + \
+ (got if len(got) < 1000 else (got[0:1000] + '...')) + '\''
if len(got) >= maxbytes:
- self.fail(filename + ': does not contain expected \'' + expect + '\', or output is too large')
+ self.fail(filename + ': does not contain expected ' + expect + ', or output is too large, got ' + gotstr)
else:
- self.fail(filename + ': does not contain expected \'' + expect + '\'')
+ self.fail(filename + ': does not contain expected ' + expect + ', got ' + gotstr)
+
+ def check_file_contains(self, filename, expect):
+ self.check_file_contains_one_of(filename, [expect])
def check_empty_file(self, filename):
"""
@@ -165,6 +193,8 @@ class suite_subprocess:
procargs = [ wtexe ]
if self._gdbSubprocess:
procargs = [ "gdb", "--args" ] + procargs
+ elif self._lldbSubprocess:
+ procargs = [ "lldb", "--" ] + procargs
procargs.extend(args)
if self._gdbSubprocess:
infilepart = ""
@@ -177,6 +207,17 @@ class suite_subprocess:
">" + wtoutname + " 2>" + wterrname
print "*********************************************"
returncode = subprocess.call(procargs)
+ elif self._lldbSubprocess:
+ infilepart = ""
+ if infilename != None:
+ infilepart = "<" + infilename + " "
+ print str(procargs)
+ print "*********************************************"
+ print "**** Run 'wt' via: run " + \
+ " ".join(procargs[3:]) + infilepart + \
+ ">" + wtoutname + " 2>" + wterrname
+ print "*********************************************"
+ returncode = subprocess.call(procargs)
elif infilename:
with open(infilename, "r") as wtin:
returncode = subprocess.call(
diff --git a/src/third_party/wiredtiger/test/suite/test_backup02.py b/src/third_party/wiredtiger/test/suite/test_backup02.py
index f4f5219440b..aac69e811a2 100644
--- a/src/third_party/wiredtiger/test/suite/test_backup02.py
+++ b/src/third_party/wiredtiger/test/suite/test_backup02.py
@@ -29,15 +29,17 @@
import Queue
import threading, time, wiredtiger, wttest
from wtthread import backup_thread, checkpoint_thread, op_thread
-from wtscenario import make_scenarios
# test_backup02.py
# Run background checkpoints and backups repeatedly while doing inserts
# in another thread
class test_backup02(wttest.WiredTigerTestCase):
- scenarios = make_scenarios([
- ('table', dict(uri='table:test',fmt='L',dsize=100,nops=200,nthreads=1,time=30)),
- ])
+ uri = 'table:test_backup02'
+ fmt = 'L'
+ dsize = 100
+ nops = 200
+ nthreads = 1
+ time = 60 if wttest.islongtest() else 10
def test_backup02(self):
done = threading.Event()
diff --git a/src/third_party/wiredtiger/test/suite/test_backup03.py b/src/third_party/wiredtiger/test/suite/test_backup03.py
index da94dfe5f17..a710b0e38d1 100644
--- a/src/third_party/wiredtiger/test/suite/test_backup03.py
+++ b/src/third_party/wiredtiger/test/suite/test_backup03.py
@@ -48,12 +48,15 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess):
# The way it works is we create 4 objects, only one of which is large, then
# we do a hot backup of one or more of the objects and compare the original
# to the backup to confirm the backup is correct.
+ #
+ # Set the chunk size small to avoid needing a lot of data when generating a
+ # complex LSM tree.
pfx = 'test_backup'
objs = [ # Objects
- ('table:' + pfx + '.1', SimpleDataSet, 0),
- ( 'lsm:' + pfx + '.2', SimpleDataSet, 1),
- ('table:' + pfx + '.3', ComplexDataSet, 2),
- ('table:' + pfx + '.4', ComplexLSMDataSet, 3),
+ ('table:' + pfx + '.1', SimpleDataSet, 0, ''),
+ ( 'lsm:' + pfx + '.2', SimpleDataSet, 1, ''),
+ ('table:' + pfx + '.3', ComplexDataSet, 2, ''),
+ ('table:' + pfx + '.4', ComplexLSMDataSet, 3, 'lsm=(chunk_size=512k)'),
]
list = [
( 'backup_1', dict(big=0,list=[0])), # Target objects individually
@@ -82,10 +85,10 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess):
def populate(self):
for i in self.objs:
if self.big == i[2]:
- rows = 200000 # Big object
+ rows = 50000 # Big object
else:
rows = 1000 # Small object
- i[1](self, i[0], rows).populate()
+ i[1](self, i[0], rows, cgconfig = i[3]).populate()
# Backup needs a checkpoint
self.session.checkpoint(None)
diff --git a/src/third_party/wiredtiger/test/suite/test_backup09.py b/src/third_party/wiredtiger/test/suite/test_backup09.py
new file mode 100644
index 00000000000..fc8f782a5dd
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_backup09.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_backup09.py
+# Verify opening a backup cursor forces a log file switch.
+#
+
+import os, shutil, stat
+import helper, wiredtiger, wttest
+from wtscenario import make_scenarios
+
+class test_backup09(wttest.WiredTigerTestCase):
+ # Have log writes go directly to the OS to avoid log_flush calls before
+ # performing file copies not technically part of the backup cursor.
+ conn_config = 'config_base=false,create,' \
+ 'log=(enabled),transaction_sync=(enabled,method=none)'
+ uri = 'table:coll1'
+ backup_dir = 'backup.dir'
+
+ types = [
+ # checkpoint: whether to explicitly checkpoint some data before opening
+ # the backup cursor.
+ #
+ # all_log_files: whether to copy all files in the source directory, or
+ # only the files returned from the backup cursor. Copying the
+ # additional log files will result in more operations being recovered.
+ ('checkpoint', dict(checkpoint=True, all_log_files=False)),
+ ('no_checkpoint', dict(checkpoint=False, all_log_files=False)),
+ ('all_log_files', dict(checkpoint=True, all_log_files=True)),
+ ]
+ scenarios = make_scenarios(types)
+
+ def data_and_start_backup(self):
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+
+ cursor = self.session.open_cursor(self.uri)
+ doc_id = 0
+
+ for i in range(10):
+ doc_id += 1
+ cursor[doc_id] = doc_id
+
+ if self.checkpoint:
+ self.session.checkpoint()
+
+ for i in range(10):
+ doc_id += 1
+ cursor[doc_id] = doc_id
+
+ last_doc_in_backup = doc_id
+ self.assertEqual(1, len(filter(lambda x:
+ x.startswith('WiredTigerLog.'), os.listdir('.'))))
+ backup_cursor = self.session.open_cursor('backup:')
+ self.assertEqual(2, len(filter(lambda x:
+ x.startswith('WiredTigerLog.'), os.listdir('.'))))
+
+ for i in range(10):
+ doc_id += 1
+ cursor[doc_id] = doc_id
+
+ cursor.close()
+ return backup_cursor, last_doc_in_backup, doc_id
+
+ def copy_and_restore(self, backup_cursor, last_doc_in_backup, last_doc_in_data):
+ log_files_to_copy = 0
+ os.mkdir(self.backup_dir)
+ if self.all_log_files:
+ helper.copy_wiredtiger_home('.', self.backup_dir)
+ log_files_copied = filter(lambda x: x.startswith('WiredTigerLog.'), os.listdir(self.backup_dir))
+ self.assertEqual(len(log_files_copied), 2)
+ else:
+ while True:
+ ret = backup_cursor.next()
+ if ret != 0:
+ break
+ shutil.copy(backup_cursor.get_key(), self.backup_dir)
+ if backup_cursor.get_key().startswith('WiredTigerLog.'):
+ log_files_to_copy += 1
+
+ self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
+ self.assertEqual(log_files_to_copy, 1)
+
+ backup_conn = self.wiredtiger_open(self.backup_dir, self.conn_config)
+ if self.all_log_files:
+ self.captureout.checkAdditionalPattern(self, 'Both WiredTiger.turtle and WiredTiger.backup exist.*')
+
+ session = backup_conn.open_session()
+ cursor = session.open_cursor(self.uri)
+
+ if self.all_log_files:
+ doc_cnt = 0
+ for key, val in cursor:
+ doc_cnt += 1
+ self.assertLessEqual(key, last_doc_in_data)
+
+ self.assertEqual(doc_cnt, last_doc_in_data)
+ else:
+ doc_cnt = 0
+ for key, val in cursor:
+ doc_cnt += 1
+ self.assertLessEqual(key, last_doc_in_backup)
+
+ self.assertEqual(doc_cnt, last_doc_in_backup)
+
+ def test_backup_rotates_log(self):
+ if os.name == "nt" and self.all_log_files:
+ self.skipTest('Unix specific test skipped on Windows')
+
+ # Add some data, open a backup cursor, and add some more data. Return
+ # the value of the last document that should appear on a restore.
+ backup_cursor, last_doc_in_backup, last_doc_in_data = \
+ self.data_and_start_backup()
+
+ # Copy the files returned via the backup cursor and bring up WiredTiger
+ # on the destination. Verify no document later than last_doc exists.
+ self.copy_and_restore(
+ backup_cursor, last_doc_in_backup, last_doc_in_data)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_bug010.py b/src/third_party/wiredtiger/test/suite/test_bug010.py
index 01d604e5a07..67373bf3890 100644
--- a/src/third_party/wiredtiger/test/suite/test_bug010.py
+++ b/src/third_party/wiredtiger/test/suite/test_bug010.py
@@ -37,7 +37,7 @@ import threading, time
class test_bug010(wttest.WiredTigerTestCase):
name = 'test_bug010'
uri = 'table:' + name
- num_tables = 1000
+ num_tables = 2000 if wttest.islongtest() else 200
# Disable checkpoint sync, to make checkpoints faster and
# increase the likelihood of triggering the symptom
diff --git a/src/third_party/wiredtiger/test/suite/test_bug019.py b/src/third_party/wiredtiger/test/suite/test_bug019.py
index 202ca6b6b60..0ef80f3536d 100644
--- a/src/third_party/wiredtiger/test/suite/test_bug019.py
+++ b/src/third_party/wiredtiger/test/suite/test_bug019.py
@@ -35,7 +35,7 @@ from wtdataset import SimpleDataSet
class test_bug019(wttest.WiredTigerTestCase):
conn_config = 'log=(enabled,file_max=100K)'
uri = "table:bug019"
- entries = 100000
+ entries = 5000
# Modify rows so we write log records. We're writing a lot more than a
# single log file, so we know the underlying library will churn through
@@ -43,7 +43,9 @@ class test_bug019(wttest.WiredTigerTestCase):
def populate(self, nentries):
c = self.session.open_cursor(self.uri, None, None)
for i in range(0, nentries):
- c[i] = i
+ # Make the values about 200 bytes. That's about 1MB of data for
+ # 5000 records, generating 10 log files used plus more for overhead.
+ c[i] = "abcde" * 40
c.close()
# Wait for a log file to be pre-allocated. Avoid timing problems, but
@@ -60,7 +62,7 @@ class test_bug019(wttest.WiredTigerTestCase):
# Windows systems due to an issue with the directory list code.
def test_bug019(self):
# Create a table just to write something into the log.
- self.session.create(self.uri, 'key_format=i,value_format=i')
+ self.session.create(self.uri, 'key_format=i,value_format=S')
self.populate(self.entries)
self.session.checkpoint()
diff --git a/src/third_party/wiredtiger/test/suite/test_bug021.py b/src/third_party/wiredtiger/test/suite/test_bug021.py
new file mode 100644
index 00000000000..3a7caa3b8d2
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_bug021.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_bug021.py
+# Fixed-length column store implicit record operations test.
+
+import wiredtiger, wttest
+
+# Fixed-length column store implicit record operations test.
+class test_bug021(wttest.WiredTigerTestCase):
+ uri = 'file:test_bug021'
+
+ def create_implicit(self, initial, middle, trailing):
+ self.session.create(self.uri, 'key_format=r,value_format=8t')
+ cursor = self.session.open_cursor(self.uri, None)
+
+ # Create a set of initial implicit records, followed by a set of real
+ # records, followed by a set of trailing implicit records.
+ expected = [0] * (initial + middle + trailing + 2)
+ expected[0] = None
+ r = 0
+ for i in range(initial):
+ r += 1
+ expected[r] = 0x00
+ for i in range(middle):
+ r += 1
+ cursor[r] = expected[r] = 0xab
+ r += trailing
+ cursor[r + 1] = expected[r + 1] = 0xab
+ return (cursor, expected)
+
+ def check(self, expected):
+ c = self.session.open_cursor(self.uri, None)
+ actual = [None] * len(expected)
+ for k, v in c:
+ actual[k] = v
+ c.close()
+
+ if actual != expected:
+ print 'expected: ', expected
+ print ' actual: ', actual
+ self.assertEqual(expected, actual)
+
+ def test_implicit_record_cursor_insert_next(self):
+ cursor, current = self.create_implicit(0, 50, 20)
+ self.check(current)
+
+ # Check cursor next/operation inside trailing implicit keys.
+ cursor.set_key(62)
+ self.assertEquals(cursor.search(), 0)
+ self.assertEquals(cursor.next(), 0)
+ self.assertEquals(cursor.next(), 0)
+ cursor.set_value(3)
+ self.assertEquals(cursor.insert(), 0)
+ current[62 + 2] = 3
+ self.check(current)
+
+ # Check cursor prev/operation inside trailing implicit keys.
+ cursor.set_key(68)
+ self.assertEquals(cursor.search(), 0)
+ self.assertEquals(cursor.prev(), 0)
+ self.assertEquals(cursor.prev(), 0)
+ cursor.set_value(7)
+ self.assertEquals(cursor.insert(), 0)
+ current[68 - 2] = 7
+
+ def test_implicit_record_cursor_insert_prev(self):
+ cursor, current = self.create_implicit(20, 50, 0)
+ self.check(current)
+
+ # Check cursor next/operation inside leading implicit keys.
+ cursor.set_key(2)
+ self.assertEquals(cursor.search(), 0)
+ self.assertEquals(cursor.next(), 0)
+ self.assertEquals(cursor.next(), 0)
+ cursor.set_value(3)
+ self.assertEquals(cursor.insert(), 0)
+ current[2 + 2] = 3
+ self.check(current)
+
+ # Check cursor prev/operation inside leading implicit keys.
+ cursor.set_key(18)
+ self.assertEquals(cursor.search(), 0)
+ self.assertEquals(cursor.prev(), 0)
+ self.assertEquals(cursor.prev(), 0)
+ cursor.set_value(7)
+ self.assertEquals(cursor.insert(), 0)
+ current[18 - 2] = 7
+ self.check(current)
+
+ def test_implicit_record_cursor_remove_next(self):
+ cursor, current = self.create_implicit(0, 50, 20)
+ self.check(current)
+
+ # Check cursor next/operation inside trailing implicit keys.
+ cursor.set_key(62)
+ self.assertEquals(cursor.search(), 0)
+ for i in range(1, 5):
+ self.assertEquals(cursor.next(), 0)
+ self.assertEquals(cursor.remove(), 0)
+ current[62 + i] = 0
+ self.check(current)
+
+ # Check cursor prev/operation inside trailing implicit keys.
+ cursor.set_key(68)
+ self.assertEquals(cursor.search(), 0)
+ for i in range(1, 5):
+ self.assertEquals(cursor.prev(), 0)
+ self.assertEquals(cursor.remove(), 0)
+ current[68 - i] = 0
+ self.check(current)
+
+ def test_implicit_record_cursor_remove_prev(self):
+ cursor, current = self.create_implicit(20, 50, 0)
+ self.check(current)
+
+ # Check cursor next/operation inside leading implicit keys.
+ cursor.set_key(2)
+ self.assertEquals(cursor.search(), 0)
+ for i in range(1, 5):
+ self.assertEquals(cursor.next(), 0)
+ self.assertEquals(cursor.remove(), 0)
+ current[2 + i] = 0
+ self.check(current)
+
+ # Check cursor prev/operation inside leading implicit keys.
+ cursor.set_key(18)
+ self.assertEquals(cursor.search(), 0)
+ for i in range(1, 5):
+ current[18 - i] = 0
+ self.assertEquals(cursor.prev(), 0)
+ self.assertEquals(cursor.remove(), 0)
+ current[18 - i] = 0
+ self.check(current)
+
+ def test_implicit_record_cursor_update_next(self):
+ cursor, current = self.create_implicit(0, 50, 20)
+ self.check(current)
+
+ # Check cursor next/operation inside trailing implicit keys.
+ cursor.set_key(62)
+ self.assertEquals(cursor.search(), 0)
+ for i in range(1, 5):
+ self.assertEquals(cursor.next(), 0)
+ cursor.set_value(i)
+ self.session.breakpoint()
+ self.assertEquals(cursor.update(), 0)
+ current[62 + i] = i
+ self.check(current)
+
+ # Check cursor prev/operation inside trailing implicit keys.
+ cursor.set_key(68)
+ self.assertEquals(cursor.search(), 0)
+ for i in range(1, 5):
+ self.assertEquals(cursor.prev(), 0)
+ cursor.set_value(i)
+ self.assertEquals(cursor.update(), 0)
+ current[68 - i] = i
+ self.check(current)
+
+ def test_implicit_record_cursor_update_prev(self):
+ cursor, current = self.create_implicit(20, 50, 0)
+ self.check(current)
+
+ # Check cursor next/operation inside leading implicit keys.
+ cursor.set_key(2)
+ self.assertEquals(cursor.search(), 0)
+ for i in range(1, 5):
+ self.assertEquals(cursor.next(), 0)
+ cursor.set_value(i)
+ self.assertEquals(cursor.update(), 0)
+ current[2 + i] = i
+ self.check(current)
+
+ # Check cursor prev/operation inside leading implicit keys.
+ cursor.set_key(18)
+ self.assertEquals(cursor.search(), 0)
+ for i in range(1, 5):
+ current[18 - i] = 0
+ self.assertEquals(cursor.prev(), 0)
+ cursor.set_value(i)
+ self.assertEquals(cursor.update(), 0)
+ current[18 - i] = i
+ self.check(current)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_config07.py b/src/third_party/wiredtiger/test/suite/test_config07.py
new file mode 100644
index 00000000000..ce9d8082424
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_config07.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, time
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+# test_config07.py
+# Test that log files extend as configured and as documented.
+class test_config07(wttest.WiredTigerTestCase):
+ uri = "table:test"
+ entries = 5000
+ K = 1024
+ log_size = K * K
+
+ extend_len = [
+ ('default', dict(log_extend_len='()', expected_log_size = log_size)),
+ ('empty', dict(log_extend_len='(log=)', expected_log_size = log_size)),
+ ('disable', dict(log_extend_len='(log=0)', expected_log_size = 128)),
+ ('100K', dict(log_extend_len='(log=100K)', expected_log_size = 100 * K)),
+ ('too_small', dict(log_extend_len='(log=20K)', expected_log_size = None)),
+ ('too_large', dict(log_extend_len='(log=20G)', expected_log_size = None)),
+ ('small_in_allowed range', dict(log_extend_len='(log=200K)',
+ expected_log_size = 200 * K)),
+ ('large_in_allowed_range', dict(log_extend_len='(log=900K)',
+ expected_log_size = 900 * K)),
+ ('larger_than_log_file_size', dict(log_extend_len='(log=20M)',
+ expected_log_size = log_size)),
+ ('with_data_file_extend_conf', dict(log_extend_len='(log=100K,data=16M)',
+ expected_log_size = 100 * K)),
+ ]
+
+ scenarios = make_scenarios(extend_len)
+
+ def populate(self):
+ cur = self.session.open_cursor(self.uri, None, None)
+ for i in range(0, self.entries):
+ # Make the values about 200 bytes. That's about 1MB of data for
+ # 5000 records, generating 10 log files used plus more for overhead.
+ cur[i] = "abcde" * 40
+ cur.close()
+
+ def checkLogFileSize(self, size):
+ # Wait for a log file to be preallocated. Avoid timing problems, but
+ # assert that a file is created within 1 minute.
+ for i in range(1,60):
+ logs = fnmatch.filter(os.listdir('.'), "*Prep*")
+ if logs:
+ f = logs[-1]
+ file_size = os.stat(f).st_size
+ self.assertEqual(size, file_size)
+ break
+ time.sleep(1)
+ self.assertTrue(logs)
+
+ def test_log_extend(self):
+ self.conn.close()
+ msg = '/invalid log extend length/'
+
+ config = 'log=(enabled,file_max=1M),file_extend=' + self.log_extend_len
+ configarg = 'create,statistics=(fast)' + ',' + config
+
+ # Expect an error when an invalid log extend size is provided.
+ if self.expected_log_size is None:
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.wiredtiger_open('.', configarg), msg)
+ return
+
+ self.conn = self.wiredtiger_open('.', configarg)
+ self.session = self.conn.open_session(None)
+
+ # Create a table, insert data in it to trigger log file writes.
+ self.session.create(self.uri, 'key_format=i,value_format=S')
+ self.populate()
+ self.session.checkpoint()
+
+ self.checkLogFileSize(self.expected_log_size)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_cursor12.py b/src/third_party/wiredtiger/test/suite/test_cursor12.py
index 50204274b94..96bf48ef83c 100644
--- a/src/third_party/wiredtiger/test/suite/test_cursor12.py
+++ b/src/third_party/wiredtiger/test/suite/test_cursor12.py
@@ -194,12 +194,14 @@ class test_cursor12(wttest.WiredTigerTestCase):
self.assertEquals(c.update(), 0)
c.reset()
+ self.session.begin_transaction()
c.set_key(ds.key(row))
mods = []
for j in i['mods']:
mod = wiredtiger.Modify(j[0], j[1], j[2])
mods.append(mod)
self.assertEquals(c.modify(mods), 0)
+ self.session.commit_transaction()
c.reset()
c.set_key(ds.key(row))
@@ -288,6 +290,7 @@ class test_cursor12(wttest.WiredTigerTestCase):
ds.populate()
c = self.session.open_cursor(self.uri, None)
+ self.session.begin_transaction()
c.set_key(ds.key(10))
orig = 'abcdefghijklmnopqrstuvwxyz'
c.set_value(orig)
@@ -299,6 +302,7 @@ class test_cursor12(wttest.WiredTigerTestCase):
mod = wiredtiger.Modify(new, 10, 5)
mods.append(mod)
self.assertEquals(c.modify(mods), 0)
+ self.session.commit_transaction()
c.set_key(ds.key(10))
self.assertEquals(c.search(), 0)
@@ -314,12 +318,14 @@ class test_cursor12(wttest.WiredTigerTestCase):
c.set_key(ds.key(10))
self.assertEquals(c.remove(), 0)
+ self.session.begin_transaction()
mods = []
mod = wiredtiger.Modify('ABCD', 3, 3)
mods.append(mod)
c.set_key(ds.key(10))
self.assertEqual(c.modify(mods), wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
# Check that modify returns not-found when an insert is not yet committed
# and after it's aborted.
@@ -347,6 +353,7 @@ class test_cursor12(wttest.WiredTigerTestCase):
# Test that another transaction cannot modify our uncommitted record.
xs = self.conn.open_session()
xc = xs.open_cursor(self.uri, None)
+ xs.begin_transaction()
xc.set_key(ds.key(30))
xc.set_value(ds.value(30))
mods = []
@@ -354,16 +361,19 @@ class test_cursor12(wttest.WiredTigerTestCase):
mods.append(mod)
xc.set_key(ds.key(30))
self.assertEqual(xc.modify(mods), wiredtiger.WT_NOTFOUND)
+ xs.rollback_transaction()
# Rollback our transaction.
self.session.rollback_transaction()
# Test that we can't modify our aborted insert.
+ self.session.begin_transaction()
mods = []
mod = wiredtiger.Modify('ABCD', 3, 3)
mods.append(mod)
c.set_key(ds.key(30))
self.assertEqual(c.modify(mods), wiredtiger.WT_NOTFOUND)
+ self.session.rollback_transaction()
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_cursor15.py b/src/third_party/wiredtiger/test/suite/test_cursor15.py
new file mode 100644
index 00000000000..ca9c65589f2
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_cursor15.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_cursor15.py
+# Cursors: read_once configuration
+#
+
+import wttest
+from wiredtiger import stat
+
+class test_cursor15(wttest.WiredTigerTestCase):
+ tablename = 'test_read_once'
+ uri = 'table:' + tablename
+
+ conn_config = 'cache_size=1M,statistics=(all)'
+
+ def test_cursor15(self):
+ # This test is configured to use 1MB of cache. It will insert 20
+ # documents, each 100KB. Manipulate the table to result in one page per
+ # document.
+ self.session.create(self.uri,
+ 'key_format=i,value_format=S,leaf_page_max=108K,leaf_value_max=108K')
+
+ cursor = self.session.open_cursor(self.uri, None, None)
+ for key in range(0, 20):
+ cursor[key] = '1' * (100 * 1024)
+ cursor.close()
+
+ # Restart the database to clear the cache and reset statistics.
+ self.reopen_conn()
+
+ # We don't restart the database between runs to exercise that read_once
+ # plays nice with cursor caching. Note, there are no reliable statistics
+ # to check that read_once behaves as expected. The future may introduce a
+ # statistic for WT_READ_WONT_NEED being exercised. That may work as a
+ # suitable side-effect to observe here.
+ for cursor_conf in ["read_once=true", None]:
+ # Table scan ~2MB of data when only given 1MB of cache.
+ cursor = self.session.open_cursor(self.uri, None, cursor_conf)
+ for key, value in cursor:
+ pass
+ cursor.close()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_intpack.py b/src/third_party/wiredtiger/test/suite/test_intpack.py
index 68c9f7b5566..4ba094265c7 100644
--- a/src/third_party/wiredtiger/test/suite/test_intpack.py
+++ b/src/third_party/wiredtiger/test/suite/test_intpack.py
@@ -126,6 +126,9 @@ class PackTester:
class test_intpack(wttest.WiredTigerTestCase):
name = 'test_intpack'
+ # It's useful to test a larger range but avoid the CPU overhead normally
+ base_range = 66000 if wttest.islongtest() else 5000
+
# We have to be a bit verbose here with naming, scenario names are
# case insensitive and must be unique.
@@ -152,7 +155,7 @@ class test_intpack(wttest.WiredTigerTestCase):
pt = PackTester(self.formatcode, self.low, self.high, self.assertEquals)
self.assertEquals(2 ** self.nbits, self.high - self.low + 1)
pt.initialize(self.session)
- pt.check_range(-66000, 66000)
+ pt.check_range(-self.base_range, self.base_range)
if self.nbits >= 32:
e32 = 2 ** 32
pt.check_range(e32 - 1000, e32 + 1000)
diff --git a/src/third_party/wiredtiger/test/suite/test_las01.py b/src/third_party/wiredtiger/test/suite/test_las01.py
index 8b6132f8418..f47df3be9ac 100644
--- a/src/third_party/wiredtiger/test/suite/test_las01.py
+++ b/src/third_party/wiredtiger/test/suite/test_las01.py
@@ -139,8 +139,10 @@ class test_las01(wttest.WiredTigerTestCase):
session2 = self.conn.open_session()
session2.begin_transaction('isolation=snapshot')
# Apply two modify operations - replacing the first two items with 'A'
+ self.session.begin_transaction()
self.large_modifies(self.session, uri, 0, ds, nrows)
self.large_modifies(self.session, uri, 1, ds, nrows)
+ self.session.commit_transaction()
# Check to see the value after recovery
self.durable_check(bigvalue3, uri, ds, nrows)
session2.rollback_transaction()
diff --git a/src/third_party/wiredtiger/test/suite/test_las03.py b/src/third_party/wiredtiger/test/suite/test_las03.py
new file mode 100644
index 00000000000..6934bd9a741
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_las03.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from helper import copy_wiredtiger_home
+import wiredtiger, wttest
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+
+def timestamp_str(t):
+ return '%x' % t
+
+# test_las03.py
+# Ensure checkpoints don't read too unnecessary lookaside entries.
+class test_las03(wttest.WiredTigerTestCase):
+ # Force a small cache.
+ def conn_config(self):
+ return 'cache_size=50MB,statistics=(fast)'
+
+ def get_stat(self, stat):
+ stat_cursor = self.session.open_cursor('statistics:')
+ val = stat_cursor[stat][2]
+ stat_cursor.close()
+ return val
+
+ def large_updates(self, session, uri, value, ds, nrows, nops):
+ # Update a large number of records, we'll hang if the lookaside table
+ # isn't doing its thing.
+ cursor = session.open_cursor(uri)
+ for i in range(nrows + 1, nrows + nops + 1):
+ session.begin_transaction()
+ cursor[ds.key(i)] = value
+ session.commit_transaction('commit_timestamp=' + timestamp_str(i))
+ cursor.close()
+
+ def test_checkpoint_las_reads(self):
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ # Create a small table.
+ uri = "table:test_las03"
+ nrows = 100
+ ds = SimpleDataSet(self, uri, nrows, key_format="S", value_format='u')
+ ds.populate()
+ bigvalue = "aaaaa" * 100
+
+ # Initially load huge data
+ cursor = self.session.open_cursor(uri)
+ for i in range(1, 10000):
+ cursor[ds.key(nrows + i)] = bigvalue
+ cursor.close()
+ self.session.checkpoint()
+
+ # Check to see LAS working with old timestamp
+ bigvalue2 = "ddddd" * 100
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(1))
+ las_writes_start = self.get_stat(stat.conn.cache_write_lookaside)
+ self.large_updates(self.session, uri, bigvalue2, ds, nrows, 10000)
+
+ # If the test sizing is correct, the history will overflow the cache
+ self.session.checkpoint()
+ las_writes = self.get_stat(stat.conn.cache_write_lookaside) - las_writes_start
+ self.assertGreaterEqual(las_writes, 0)
+
+ for ts in range(2, 4):
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(ts))
+
+ # Now just update one record and checkpoint again
+ self.large_updates(self.session, uri, bigvalue2, ds, nrows, 1)
+
+ las_reads_start = self.get_stat(stat.conn.cache_read_lookaside)
+ self.session.checkpoint()
+ las_reads = self.get_stat(stat.conn.cache_read_lookaside) - las_reads_start
+
+ # Since we're dealing with eviction concurrent with checkpoints
+ # and skewing is controlled by a heuristic, we can't put too tight
+ # a bound on this.
+ self.assertLessEqual(las_reads, 100)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare_lookaside01.py b/src/third_party/wiredtiger/test/suite/test_prepare_lookaside01.py
new file mode 100644
index 00000000000..22af3ca1bb8
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_prepare_lookaside01.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from helper import copy_wiredtiger_home
+import wiredtiger, wttest
+from wtdataset import SimpleDataSet
+
+def timestamp_str(t):
+ return '%x' % t
+
+# test_prepare_lookaside01.py
+# test to ensure lookaside eviction is working for prepared transactions.
+class test_prepare_lookaside01(wttest.WiredTigerTestCase):
+ # Force a small cache.
+ def conn_config(self):
+ return 'cache_size=50MB'
+
+ def prepare_updates(self, uri, ds, nrows, nsessions, nkeys):
+ # Update a large number of records in their individual transactions.
+ # This will force eviction and start lookaside eviction of committed
+ # updates.
+ #
+ # Follow this by updating a number of records in prepared transactions
+ # under multiple sessions. We'll hang if lookaside table isn't doing its
+ # thing. If we do all updates in a single session, then hang will be due
+ # to uncommitted updates, instead of prepared updates.
+ #
+ # Do another set of updates in that many transactions. This forces the
+ # pages that have been evicted to lookaside to be re-read and brought in
+ # memory. Hence testing if we can read prepared updates from lookaside.
+
+ # Start with setting a stable timestamp to pin history in cache
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(1))
+
+ # Commit some updates to get eviction and lookaside fired up
+ bigvalue1 = "bbbbb" * 100
+ cursor = self.session.open_cursor(uri)
+ for i in range(1, nsessions * nkeys):
+ self.session.begin_transaction()
+ cursor.set_key(ds.key(nrows + i))
+ cursor.set_value(bigvalue1)
+ self.assertEquals(cursor.update(), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(i))
+
+ # Have prepared updates in multiple sessions. This should ensure writing
+ # prepared updates to the lookaside
+ sessions = [0] * nsessions
+ cursors = [0] * nsessions
+ bigvalue2 = "ccccc" * 100
+ for j in range (0, nsessions):
+ sessions[j] = self.conn.open_session()
+ sessions[j].begin_transaction("isolation=snapshot")
+ cursors[j] = sessions[j].open_cursor(uri)
+ # Each session will update many consecutive keys.
+ start = (j * nkeys)
+ end = start + nkeys
+ for i in range(start, end):
+ cursors[j].set_key(ds.key(nrows + i))
+ cursors[j].set_value(bigvalue2)
+ self.assertEquals(cursors[j].update(), 0)
+ sessions[j].prepare_transaction('prepare_timestamp=' + timestamp_str(2))
+
+ # Commit more regular updates. To do this, the pages that were just
+ # evicted need to be read back. This ensures reading prepared updates
+ # from the lookaside
+ bigvalue3 = "ddddd" * 100
+ cursor = self.session.open_cursor(uri)
+ for i in range(1, nsessions * nkeys):
+ self.session.begin_transaction()
+ cursor.set_key(ds.key(nrows + i))
+ cursor.set_value(bigvalue3)
+ self.assertEquals(cursor.update(), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(i + 3))
+ cursor.close()
+
+ # Close all cursors and sessions, this will cause prepared updates to be
+ # rollback-ed
+ for j in range (1, nsessions):
+ cursors[j].close()
+ sessions[j].close()
+
+ def test_prepare_lookaside(self):
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ # Create a small table.
+ uri = "table:test_prepare_lookaside01"
+ nrows = 100
+ ds = SimpleDataSet(self, uri, nrows, key_format="S", value_format='u')
+ ds.populate()
+ bigvalue = "aaaaa" * 100
+
+ # Initially load huge data
+ cursor = self.session.open_cursor(uri)
+ for i in range(1, 10000):
+ cursor.set_key(ds.key(nrows + i))
+ cursor.set_value(bigvalue)
+ self.assertEquals(cursor.insert(), 0)
+ cursor.close()
+ self.session.checkpoint()
+
+ # Check if lookaside is working properly with prepare transactions.
+ # We put prepared updates in multiple sessions so that we do not hang
+ # because of cache being full with uncommitted updates.
+ # TODO: Increase the nsessions below to start testing lookaside eviction
+ # of prepared updates.
+ nsessions = 1
+ nkeys = 4000
+ self.prepare_updates(uri, ds, nrows, nsessions, nkeys)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_schema08.py b/src/third_party/wiredtiger/test/suite/test_schema08.py
new file mode 100644
index 00000000000..e7b44219ba3
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_schema08.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, sys
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+# test_schema08.py
+# Test schema operations on recovery.
+# Test all schema operations alter, create, drop, rename.
+# After doing the operation, create a backup copy of the directory,
+# walk the log recording each LSN, truncate the backup copy of the
+# log walking backward from the LSNs and then run recovery.
+class test_schema08(wttest.WiredTigerTestCase, suite_subprocess):
+ # We want to copy, truncate and run recovery so keep the log
+ # file small and don't pre-allocate any. We expect a small log.
+ conn_config = 'log=(enabled,archive=false,file_max=100k,prealloc=false)'
+ types = [
+ ('file', dict(uri='file:', use_cg=False, use_index=False)),
+ ('lsm', dict(uri='lsm:', use_cg=False, use_index=False)),
+ ('table-cg', dict(uri='table:', use_cg=True, use_index=False)),
+ ('table-index', dict(uri='table:', use_cg=False, use_index=True)),
+ ('table-simple', dict(uri='table:', use_cg=False, use_index=False)),
+ ]
+ ops = [
+ ('none', dict(schema_ops='none')),
+ ('alter', dict(schema_ops='alter')),
+ ('drop', dict(schema_ops='drop')),
+ ('rename', dict(schema_ops='rename')),
+ ]
+ ckpt = [
+ ('no_ckpt', dict(ckpt=False)),
+ ('with_ckpt', dict(ckpt=True)),
+ ]
+ scenarios = make_scenarios(types, ops, ckpt)
+ count = 0
+ lsns = []
+ backup_pfx = "BACKUP."
+
+ def do_alter(self, uri, suburi):
+ alter_param = 'cache_resident=true'
+ self.session.alter(uri, alter_param)
+ if suburi != None:
+ self.session.alter(suburi, alter_param)
+
+ def do_ops(self, uri, suburi):
+ if (self.schema_ops == 'none'):
+ return
+ if (self.schema_ops == 'alter'):
+ self.do_alter(uri, suburi)
+ elif (self.schema_ops == 'drop'):
+ self.session.drop(uri, None)
+ elif (self.schema_ops == 'rename'):
+ newuri = self.uri + "new-table"
+ self.session.rename(uri, newuri, None)
+
+ # Count actual log records in the log. Log cursors walk the individual
+ # operations of a transaction as well as the entire record. Skip counting
+ # any individual commit operations and only count entire records.
+ def find_logrecs(self):
+ self.count = 0
+ self.session.log_flush('sync=on')
+ c = self.session.open_cursor('log:', None, None)
+ self.lsns.append(0)
+ while c.next() == 0:
+ # lsn.file, lsn.offset, opcount
+ keys = c.get_key()
+ # We don't expect to need more than one log file. We only store
+ # the offsets in a list so assert lsn.file is 1.
+ self.assertTrue(keys[0] == 1)
+
+ # Only count whole records, which is when opcount is zero.
+ # If opcount is not zero it is an operation of a commit.
+ # Skip LSN 128, that is a system record and its existence
+ # is assumed within the system.
+ if keys[2] == 0 and keys[1] != 128:
+ self.count += 1
+ self.lsns.append(keys[1])
+ c.close()
+ self.pr("Find " + str(self.count) + " logrecs LSNS: ")
+ self.pr(str(self.lsns))
+
+ def make_backups(self):
+ # With the connection still open, copy files to the new directory.
+ # Make an initial copy as well as a copy for each LSN we save.
+ # Truncate the log to the appropriate offset as we make each copy.
+ olddir = "."
+ log1 = 'WiredTigerLog.0000000001'
+ for lsn in self.lsns:
+ newdir = self.backup_pfx + str(lsn)
+ shutil.rmtree(newdir, ignore_errors=True)
+ os.mkdir(newdir)
+ for fname in os.listdir(olddir):
+ fullname = os.path.join(olddir, fname)
+ # Skip lock file on Windows since it is locked
+ if os.path.isfile(fullname) and \
+ "WiredTiger.lock" not in fullname and \
+ "Tmplog" not in fullname and \
+ "Preplog" not in fullname:
+ shutil.copy(fullname, newdir)
+ # Truncate the file to the LSN offset.
+ # NOTE: This removes the record at that offset
+ # resulting in recovery running to just before
+ # that record.
+ if lsn != 0:
+ logf = os.path.join(newdir + '/' + log1)
+ f = open(logf, "r+")
+ f.truncate(lsn)
+ f.close()
+ # print "New size " + logf + ": " + str(os.path.getsize(logf))
+
+ def run_recovery(self, uri, suburi):
+ # With the connection still open, copy files to the new directory.
+ # Make an initial copy as well as a copy for each LSN we save.
+ # Truncate the log to the appropriate offset as we make each copy.
+ olddir = "."
+ for lsn in self.lsns:
+ newdir = self.backup_pfx + str(lsn)
+ outfile = newdir + '.txt'
+ self.runWt(['-R', '-h', newdir, 'list', '-v'], outfilename=outfile)
+
+ # Test that creating and dropping tables does not write individual
+ # log records.
+ def test_schema08_create(self):
+ self.count = 0
+ self.lsns = []
+ uri = self.uri + 'table0'
+ create_params = 'key_format=i,value_format=S,'
+
+ cgparam = ''
+ suburi = None
+ if self.use_cg or self.use_index:
+ cgparam = 'columns=(k,v),'
+ if self.use_cg:
+ cgparam += 'colgroups=(g0),'
+
+ # Create main table.
+ self.session.create(uri, create_params + cgparam)
+
+ # Checkpoint after the main table creation if wanted.
+ if self.ckpt:
+ self.session.checkpoint()
+
+ # Add in column group or index tables.
+ if self.use_cg:
+ # Create.
+ cgparam = 'columns=(v),'
+ suburi = 'colgroup:table0:g0'
+ self.session.create(suburi, cgparam)
+
+ if self.use_index:
+ # Create.
+ suburi = 'index:table0:i0'
+ self.session.create(suburi, cgparam)
+
+ self.do_ops(uri, suburi)
+ self.find_logrecs()
+ # print "Found " + str(self.count) + " log records"
+ self.make_backups()
+ self.run_recovery(uri, suburi)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_sweep01.py b/src/third_party/wiredtiger/test/suite/test_sweep01.py
index 4939c270635..43272a6a1c5 100644
--- a/src/third_party/wiredtiger/test/suite/test_sweep01.py
+++ b/src/third_party/wiredtiger/test/suite/test_sweep01.py
@@ -43,7 +43,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
numfiles = 30
numkv = 1000
conn_config = 'file_manager=(close_handle_minimum=0,' + \
- 'close_idle_time=6,close_scan_interval=2),' + \
+ 'close_idle_time=3,close_scan_interval=1),' + \
'statistics=(fast),operation_tracking=(enabled=false),'
types = [
@@ -111,8 +111,8 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
self.session.checkpoint()
k = k+1
c[k] = 1
- sleep += 2
- time.sleep(2)
+ sleep += 0.5
+ time.sleep(0.5)
# Give slow machines time to process files.
stat_cursor = self.session.open_cursor('statistics:', None, None)
this_nfile = stat_cursor[stat.conn.file_open][2]
diff --git a/src/third_party/wiredtiger/test/suite/test_sweep03.py b/src/third_party/wiredtiger/test/suite/test_sweep03.py
index 356d270ddea..c378b54b710 100644
--- a/src/third_party/wiredtiger/test/suite/test_sweep03.py
+++ b/src/third_party/wiredtiger/test/suite/test_sweep03.py
@@ -39,7 +39,7 @@ import wttest
class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
tablebase = 'test_sweep03'
uri = 'table:' + tablebase
- numfiles = 400 # Make this more than the default close_handle_minimum
+ numfiles = 40 # Make this more than the default close_handle_minimum
numkv = 100
conn_config = 'file_manager=(close_handle_minimum=10,' + \
'close_idle_time=0,close_scan_interval=1),' + \
@@ -56,6 +56,24 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
scenarios = make_scenarios(types)
+ # Wait for the sweep server to run - let it run twice, since the statistic
+ # is incrememented at the start of a sweep and the test relies on sweep
+ # completing it's work.
+ def wait_for_sweep(self, baseline):
+ # Check regularly for up to 5 seconds total.
+ for i in range(10):
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ sweeps = stat_cursor[stat.conn.dh_sweeps][2]
+ stat_cursor.close()
+ if (sweeps > baseline + 1):
+ return sweeps
+ time.sleep(0.5)
+
+ # If the statistic didn't increase in 5 seconds the sweep server isn't
+ # working as expected.
+ self.assertTrue(sweeps > baseline + 1)
+ return (sweeps)
+
def test_disable_idle_timeout1(self):
#
# Set up numfiles with numkv entries. We just want some data in there
@@ -72,18 +90,15 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
#
# The idle timeout is disabled - we don't expect the sweep server to
- # close any regular handles. Give the sweep server 5x the scan
- # interval to close any handles.
+ # close any regular handles. The function returns the current sweep
+ # count to allow for incremental waits - which this test doesn't need.
#
- time.sleep(5)
+ self.wait_for_sweep(0)
stat_cursor = self.session.open_cursor('statistics:', None, None)
close1 = stat_cursor[stat.conn.dh_sweep_close][2]
- sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
- # The sweep server should have run, or the test isn't working.
- self.assertGreater(sweep1, 0)
# We expect nothing to have been closed.
self.assertEqual(close1, 0)
@@ -101,23 +116,21 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# We just filled the table, now check what the stats are
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache1 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
+ close1 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep_baseline = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
# We force the drop in this case to confirm that the handle is closed
self.session.drop(drop_uri, "force=true")
- time.sleep(5)
+ sweep_baseline = self.wait_for_sweep(sweep_baseline)
# Grab the stats post table drop to see things have decremented
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache2 = stat_cursor[stat.conn.cache_bytes_inuse][2]
close2 = stat_cursor[stat.conn.dh_sweep_close][2]
- sweep2 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
- # Make sure the sweep server is still working.
- self.assertGreater(sweep2, sweep1)
# Ensure that the handle has been closed after the drop.
self.assertEqual(close2, 1)
# Ensure that any space was reclaimed from cache.
@@ -137,21 +150,19 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache1 = stat_cursor[stat.conn.cache_bytes_inuse][2]
close1 = stat_cursor[stat.conn.dh_sweep_close][2]
- sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
+ sweep_baseline = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
self.session.drop(drop_uri, None)
- time.sleep(5)
+ sweep_baseline = self.wait_for_sweep(sweep_baseline)
# Grab the stats post table drop to see things have decremented
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache2 = stat_cursor[stat.conn.cache_bytes_inuse][2]
close2 = stat_cursor[stat.conn.dh_sweep_close][2]
- sweep2 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
- self.assertGreater(sweep2, sweep1)
# The sweep server should not be involved in regular drop cleanup
self.assertEqual(close2, close1)
# Ensure that any space was reclaimed from cache.
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py
index 83ed4e904a6..77a55a447b0 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py
@@ -70,8 +70,8 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess):
if missing == False:
actual = dict((k, v) for k, v in cur if v != 0)
if actual != expected:
- print "missing: ", sorted(set(expected) - set(actual))
- print "extras: ", sorted(set(actual) - set(expected))
+ print "missing: ", sorted(set(expected.items()) - set(actual.items()))
+ print "extras: ", sorted(set(actual.items()) - set(expected.items()))
self.assertTrue(actual == expected)
# Search for the expected items as well as iterating.
@@ -167,7 +167,8 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess):
self.conn.rollback_to_stable()
stat_cursor = self.session.open_cursor('statistics:', None, None)
calls = stat_cursor[stat.conn.txn_rollback_to_stable][2]
- upd_aborted = stat_cursor[stat.conn.txn_rollback_upd_aborted][2]
+ upd_aborted = (stat_cursor[stat.conn.txn_rollback_upd_aborted][2] +
+ stat_cursor[stat.conn.txn_rollback_las_removed][2])
stat_cursor.close()
self.assertEqual(calls, 1)
self.assertTrue(upd_aborted >= key_range/2)
@@ -237,7 +238,8 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess):
self.conn.rollback_to_stable()
stat_cursor = self.session.open_cursor('statistics:', None, None)
calls = stat_cursor[stat.conn.txn_rollback_to_stable][2]
- upd_aborted = stat_cursor[stat.conn.txn_rollback_upd_aborted][2]
+ upd_aborted = (stat_cursor[stat.conn.txn_rollback_upd_aborted][2] +
+ stat_cursor[stat.conn.txn_rollback_las_removed][2])
stat_cursor.close()
self.assertEqual(calls, 2)
#
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp06.py b/src/third_party/wiredtiger/test/suite/test_timestamp06.py
index 37e3d0da2d2..15425ce4027 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp06.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp06.py
@@ -65,17 +65,15 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess):
# Check that a cursor (optionally started in a new transaction), sees the
# expected values.
- def check(self, session, txn_config, tablename, expected, prn=False):
+ def check(self, session, txn_config, tablename, expected):
if txn_config:
session.begin_transaction(txn_config)
cur = session.open_cursor(tablename, None)
actual = dict((k, v) for k, v in cur if v != 0)
- if prn == True:
- print "CHECK : Expected"
- print expected
- print "CHECK : Actual"
- print actual
+ if actual != expected:
+ print "missing: ", sorted(set(expected.items()) - set(actual.items()))
+ print "extras: ", sorted(set(actual.items()) - set(expected.items()))
self.assertTrue(actual == expected)
# Search for the expected items as well as iterating
for k, v in expected.iteritems():
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp07.py b/src/third_party/wiredtiger/test/suite/test_timestamp07.py
index e4d281f2038..625e119831d 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp07.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp07.py
@@ -50,8 +50,8 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess):
]
conncfg = [
- ('nolog', dict(conn_config='create,cache_size=1M', using_log=False)),
- ('log', dict(conn_config='create,log=(archive=false,enabled),cache_size=1M', using_log=True)),
+ ('nolog', dict(conn_config='create,cache_size=2M', using_log=False)),
+ ('log', dict(conn_config='create,log=(file_max=1M,archive=false,enabled),cache_size=2M', using_log=True)),
]
nkeys = [
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp09.py b/src/third_party/wiredtiger/test/suite/test_timestamp09.py
index 6fbb15e38b8..862fa31e681 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp09.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp09.py
@@ -184,6 +184,8 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess):
self.assertEqual(c[6], 6)
self.assertEqual(c[7], 7)
self.assertEqual(c[8], 8)
+ self.assertTimestampsEqual(
+ self.conn.query_timestamp('get=oldest_reader'), timestamp_str(8))
self.session.commit_transaction()
# We can move the oldest timestamp backwards with "force"
@@ -194,6 +196,8 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess):
timestamp_str(4)),
'/older than oldest timestamp/')
self.session.begin_transaction('read_timestamp=' + timestamp_str(6))
+ self.assertTimestampsEqual(
+ self.conn.query_timestamp('get=oldest_reader'), timestamp_str(6))
self.session.commit_transaction()
if __name__ == '__main__':
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp13.py b/src/third_party/wiredtiger/test/suite/test_timestamp13.py
new file mode 100644
index 00000000000..d3a142c0f2f
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp13.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_timestamp13.py
+# Timestamps: session query_timestamp
+#
+
+import random
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+class test_timestamp13(wttest.WiredTigerTestCase, suite_subprocess):
+ tablename = 'test_timestamp13'
+ uri = 'table:' + tablename
+
+ scenarios = make_scenarios([
+ ('col', dict(extra_config=',key_format=r')),
+ ('lsm', dict(extra_config=',type=lsm')),
+ ('row', dict(extra_config='')),
+ ])
+
+ conn_config = 'log=(enabled)'
+
+ def test_degenerate_timestamps(self):
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ self.session.create(self.uri,
+ 'key_format=i,value_format=i' + self.extra_config)
+
+ query_choices = ['commit', 'first_commit', 'prepare', 'read']
+ # Querying a session's timestamps will error when not in a transaction.
+ for query in query_choices:
+ self.assertRaises(
+ wiredtiger.WiredTigerError,
+ lambda: self.session.query_timestamp('get=' + query))
+
+ self.session.begin_transaction()
+ # Nothing has been set, all queries will return timestamp 0.
+ for query in query_choices:
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=' + query), '0')
+
+ self.assertRaisesWithMessage(
+ wiredtiger.WiredTigerError,
+ lambda: self.session.query_timestamp('get=unknown'),
+ '/not a permitted choice for key/')
+
+ self.session.commit_transaction()
+ # Querying a session's timestamps will error when not in a transaction.
+ for query in query_choices:
+ self.assertRaises(
+ wiredtiger.WiredTigerError,
+ lambda: self.session.query_timestamp('get=' + query))
+
+ def test_query_read_commit_timestamps(self):
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ self.session.create(self.uri,
+ 'key_format=i,value_format=i' + self.extra_config)
+
+ self.session.begin_transaction('isolation=snapshot')
+ self.session.timestamp_transaction('read_timestamp=10')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=read'), '10')
+
+ # The first commit_timestamp will set both the commit and first_commit
+ # values.
+ self.session.timestamp_transaction('commit_timestamp=20')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=commit'), '20')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=first_commit'), '20')
+
+ # The second commit_timestamp will update the commit value, leaving
+ # first_commit alone.
+ self.session.timestamp_transaction('commit_timestamp=30')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=commit'), '30')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=first_commit'), '20')
+ self.session.commit_transaction()
+
+ def test_query_round_read_timestamp(self):
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ self.session.create(self.uri,
+ 'key_format=i,value_format=i' + self.extra_config)
+
+ self.conn.set_timestamp('oldest_timestamp=10')
+ self.session.begin_transaction('isolation=snapshot')
+ # Rounding to the oldest timestamp will allow the stale read_timestamp
+ # to succeed. The follow-up call to get the read timestamp returns the
+ # chosen read timestamp.
+ self.session.timestamp_transaction('read_timestamp=5,round_to_oldest=true')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=read'), '10')
+
+ # Moving the oldest timestamp has no bearing on the read timestamp
+ # returned.
+ self.conn.set_timestamp('oldest_timestamp=20')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=read'), '10')
+ self.session.commit_transaction()
+
+ def test_query_prepare_timestamp(self):
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ self.session.create(self.uri,
+ 'key_format=i,value_format=i' + self.extra_config)
+
+ self.session.begin_transaction()
+ self.session.prepare_transaction('prepare_timestamp=10')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=prepare'), '10')
+
+ self.session.timestamp_transaction('commit_timestamp=20')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=prepare'), '10')
+ self.assertTimestampsEqual(
+ self.session.query_timestamp('get=commit'), '20')
+ self.session.commit_transaction()
diff --git a/src/third_party/wiredtiger/test/suite/test_txn02.py b/src/third_party/wiredtiger/test/suite/test_txn02.py
index 94b939596d1..42e96504f89 100644
--- a/src/third_party/wiredtiger/test/suite/test_txn02.py
+++ b/src/third_party/wiredtiger/test/suite/test_txn02.py
@@ -93,6 +93,12 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
checklog_calls = 100 if wttest.islongtest() else 2
checklog_mod = (len(scenarios) / checklog_calls + 1)
+ _debug = False
+ def debug(self, msg):
+ if not self._debug:
+ return
+ print(msg)
+
def conn_config(self):
# Cycle through the different transaction_sync values in a
# deterministic manner.
@@ -201,7 +207,7 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
def test_ops(self):
self.backup_dir = os.path.join(self.home, "WT_BACKUP")
self.session2 = self.conn.open_session()
- # print "Creating %s with config '%s'" % (self.uri, self.create_params)
+ self.debug("Creating %s with config '%s'" % (self.uri, self.create_params))
self.session.create(self.uri, self.create_params)
# Set up the table with entries for 1, 2, 10 and 11.
# We use the overwrite config so insert can update as needed.
@@ -214,11 +220,10 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
self.scenario_number % len(self.conn_list)]
ops = (self.op1, self.op2, self.op3, self.op4)
txns = (self.txn1, self.txn2, self.txn3, self.txn4)
- # for ok, txn in zip(ops, txns):
- # print ', '.join('%s(%d)[%s]' % (ok[0], ok[1], txn)
for i, ot in enumerate(zip(ops, txns)):
ok, txn = ot
op, k = ok
+ self.debug('{}({})[{}]'.format(ok[0], ok[1], txn))
# Close and reopen the connection and cursor.
if reopen == 'reopen':
@@ -231,7 +236,7 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
# Test multiple operations per transaction by always
# doing the same operation on key k + 1.
k1 = k + 1
- # print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
+ self.debug('Operation. Num: %d: %s(%d)[%s]' % (i, ok[0], ok[1], txn))
if op == 'insert' or op == 'update':
c[k] = c[k1] = i + 2
current[k] = current[k1] = i + 2
@@ -245,7 +250,7 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
if k1 in current:
del current[k1]
- # print current
+ self.debug(str(current))
# Check the state after each operation.
self.check_all(current, committed)
diff --git a/src/third_party/wiredtiger/test/suite/test_txn19.py b/src/third_party/wiredtiger/test/suite/test_txn19.py
new file mode 100755
index 00000000000..6805a0c9cc6
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_txn19.py
@@ -0,0 +1,352 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2018 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_txn19.py
+# Transactions: test recovery with corrupted log files
+#
+
+import fnmatch, os, shutil, time
+from wtscenario import make_scenarios
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+
+# This test uses an artificially small log file limit, and creates
+# large records so two fit into a log file. This allows us to test
+# both the case when corruption happens at the beginning of a log file
+# (an even number of records have been created), and when corruption
+# happens in the middle of a log file (with an odd number of records).
+
+def corrupt(fname, truncate, offset, writeit):
+ with open(fname, 'r+') as log:
+ if offset:
+ if offset < 0: # Negative offset means seek to the end
+ log.seek(0, 2)
+ else:
+ log.seek(offset)
+ if truncate:
+ log.truncate()
+ if writeit:
+ log.write(writeit)
+
+class test_txn19(wttest.WiredTigerTestCase, suite_subprocess):
+ base_config = 'log=(archive=false,enabled,file_max=100K),' + \
+ 'transaction_sync=(enabled,method=none)'
+ conn_config = base_config
+ corruption_type = [
+ ('removal', dict(kind='removal', f=lambda fname:
+ os.remove(fname))),
+ ('truncate', dict(kind='truncate', f=lambda fname:
+ corrupt(fname, True, 0, None))),
+ ('truncate-middle', dict(kind='truncate-middle', f=lambda fname:
+ corrupt(fname, True, 1024 * 25, None))),
+ ('zero-begin', dict(kind='zero', f=lambda fname:
+ corrupt(fname, False, 0, '\0' * 4096))),
+ ('zero-trunc', dict(kind='zero', f=lambda fname:
+ corrupt(fname, True, 0, '\0' * 4096))),
+ ('zero-end', dict(kind='zero-end', f=lambda fname:
+ corrupt(fname, False, -1, '\0' * 4096))),
+ ('garbage-begin', dict(kind='garbage-begin', f=lambda fname:
+ corrupt(fname, False, 0, 'Bad!' * 1024))),
+ ('garbage-middle', dict(kind='garbage-middle', f=lambda fname:
+ corrupt(fname, False, 1024 * 25, 'Bad!' * 1024))),
+ ('garbage-end', dict(kind='garbage-end', f=lambda fname:
+ corrupt(fname, False, -1, 'Bad!' * 1024))),
+ ]
+ # The list comprehension below expands each entry in the integer tuple
+ # list to a scenario. For example, (3, 4, 2) expands to:
+ # ('corrupt=[3,4],checkpoint=2', dict(corruptpos=3, corruptpos2=4, chkpt=2))
+ #
+ # Each number corresponds to a log file, so for this example, we have
+ # corruption in log file 3 (using the style of corruption from
+ # corruption_type), there is a second corruption in log file 4,
+ # and there is a checkpoint in log file 2. A value of 0 means no
+ # corruption or checkpoint.
+ corruption_pos = [
+ ('corrupt=[' + str(x) + ',' + str(y) + '],checkpoint=' + str(z),
+ dict(corruptpos=x,corruptpos2=y,chkpt=z)) for (x,y,z) in (
+ (0, 0, 0), (0, 0, 2), (6, 0, 0), (6, 0, 3), (3, 0, 0),
+ (3, 0, 2), (3, 4, 2), (3, 5, 2), (3, 0, 4))]
+ nrecords = [('nrecords=10', dict(nrecords=10)),
+ ('nrecords=11', dict(nrecords=11))]
+
+ # This function prunes out unnecessary or problematic test cases
+ # from the list of scenarios.
+ def includeFunc(name, dictarg):
+ kind = dictarg['kind']
+ corruptpos = dictarg['corruptpos']
+ chkpt = dictarg['chkpt']
+
+ # corruptpos == 0 indicates there is no corruption.
+ # (i.e. corrupt log file 0, which doesn't exist)
+ # We do want to test the case of no corruption, but we don't
+ # need to try it against every type of corruption, only one.
+ if corruptpos == 0:
+ return kind == 'removal'
+
+ # NOTE:
+ # The removal or truncation of a middle log file (not first or last)
+ # that would be used in recovery is not currently handled gracefully.
+ if (kind == 'removal' or kind == 'truncate') and \
+ corruptpos != 6 and corruptpos > chkpt:
+ return False
+
+ # All the other cases are valid
+ return True
+
+ scenarios = make_scenarios(
+ corruption_type, corruption_pos, nrecords,
+ include=includeFunc, prune=20, prunelong=1000)
+
+ uri = 'table:test_txn19'
+ create_params = 'key_format=i,value_format=S'
+
+ # Return the log file number that contains the given record
+ # number. In this test, two records fit into each log file, and
+ # after each even record is written, a new log file is created
+ # (having no records initially). The last log file is this
+ # (nrecords/2 + 1), given that we start with log 1.
+ def record_to_logfile(self, recordnum):
+ return recordnum / 2 + 1
+
+ # Returns the first record number in a log file.
+ def logfile_to_record(self, logfile):
+ return (logfile - 1) * 2
+
+ # Return true if the log file is corrupted.
+ # If not corrupted, the log file will produce no errors,
+ # and all the records originally written should be recovered.
+ def corrupted(self):
+ # Corruptpos == 0 means to do no corruption in any log file
+ if self.corruptpos == 0:
+ return False
+
+ # Adding zeroes to the end of a log file is indistinguishable
+ # from having a log file that is preallocated that has not been
+ # totally filled. One might argue that if this does not occur
+ # in the final log file, it could/should have been truncated.
+ # At any rate, we consider this particular corruption to be benign.
+ if self.kind == 'zero-end':
+ return False
+ return True
+
+ def show_logs(self, homedir, msg):
+ loglist = []
+ for i in range(0, 10):
+ basename = 'WiredTigerLog.000000000' + str(i)
+ fullname = homedir + os.sep + basename
+ if os.path.isfile(fullname):
+ loglist.append(i)
+ if os.stat(fullname).st_size == 0:
+ self.tty('LOGS ' + msg + ': ' + str(i) + ' is empty')
+ self.tty('LOGS ' + msg + ': ' + str(loglist))
+
+ def copy_for_crash_restart(self, olddir, newdir):
+ ''' Simulate a crash from olddir and restart in newdir. '''
+ # with the connection still open, copy files to new directory
+ shutil.rmtree(newdir, ignore_errors=True)
+ os.mkdir(newdir)
+ for fname in os.listdir(olddir):
+ fullname = os.path.join(olddir, fname)
+ # Skip lock file on Windows since it is locked
+ if os.path.isfile(fullname) and \
+ "WiredTiger.lock" not in fullname and \
+ "Tmplog" not in fullname and \
+ "Preplog" not in fullname:
+ shutil.copy(fullname, newdir)
+
+ # Generate a value that is a bit over half the size of the log file.
+ def valuegen(self, i):
+ return str(i) + 'A' * (1024 * 60) # ~60K
+
+ # Insert a list of keys
+ def inserts(self, keylist):
+ c = self.session.open_cursor(self.uri)
+ for i in keylist:
+ if self.chkpt > 0 and self.logfile_to_record(self.chkpt) == i:
+ self.session.checkpoint()
+ c[i] = self.valuegen(i)
+ c.close()
+
+ def checks(self, expectlist):
+ c = self.session.open_cursor(self.uri, None, None)
+ gotlist = []
+ for key, value in c:
+ gotlist.append(key)
+ self.assertEqual(self.valuegen(key), value)
+ self.assertEqual(expectlist, gotlist)
+ c.close()
+
+ def log_number_to_file_name(self, homedir, n):
+ self.assertLess(n, 10) # assuming 1 digit
+ return homedir + os.sep + 'WiredTigerLog.000000000' + str(n)
+
+ def corrupt_log(self, homedir):
+ if not self.corrupted():
+ return
+ self.f(self.log_number_to_file_name(homedir, self.corruptpos))
+
+ # Corrupt a second log file if needed
+ if self.corruptpos2 != 0:
+ self.f(self.log_number_to_file_name(homedir, self.corruptpos2))
+
+ def corrupt_last_file(self):
+ return self.corruptpos == self.record_to_logfile(self.nrecords)
+
+ # Corruption past the last written record in a log file can sometimes
+ # be detected. In our test case, the last log file has zero or one large
+ # 60K record written into it, but it is presized to 100K. Corruption
+ # at the end of this file creates a hole, and the corruption starts
+ # a new log record, where it can be detected as phony. Similarly,
+ # corruption in the "middle" of the last file (actually the 25K point)
+ # can be detected if there aren't any of the insert records in the file.
+ def corrupt_hole_in_last_file(self):
+ return self.corrupt_last_file() and \
+ ((self.kind == 'garbage-middle' and self.nrecords % 2 == 0) or \
+ self.kind == 'garbage-end')
+
+ # Return true iff the log has been damaged in a way that is not detected
+ # as a corruption. WiredTiger must be lenient about damage in any log
+ # file, because a partial log record written just before a crash is in
+ # most cases indistinguishable from a corruption. If the beginning of
+ # the file is mangled, that is always an unexpected corruption. Situations
+ # where we cannot reliably detect corruption include:
+ # - removal of the last log
+ # - certain corruptions at the beginning of a log record (adding garbage
+ # at the end of a log file can trigger this).
+ def log_corrupt_but_valid(self):
+ if self.corrupt_last_file() and self.kind == 'removal':
+ return True
+ if self.kind == 'truncate-middle' or \
+ self.kind == 'garbage-middle' or \
+ self.kind == 'garbage-end':
+ return True
+ return False
+
+ # In certain cases, we detect log corruption, but just issue warnings.
+ def expect_warning_corruption(self):
+ if self.kind == 'garbage-middle' and self.chkpt <= self.corruptpos:
+ return True
+ if self.corrupt_hole_in_last_file():
+ return True
+ return False
+
+ # For this test, at least, salvage identifies and fixes all
+ # recovery failures.
+ def expect_salvage_messages(self):
+ return self.expect_recovery_failure()
+
+ def expect_recovery_failure(self):
+ return self.corrupted() and \
+ self.corruptpos >= self.chkpt and \
+ not self.log_corrupt_but_valid()
+
+ def recovered_records(self):
+ if not self.corrupted() or self.chkpt > self.corruptpos:
+ return self.nrecords
+ if self.kind == 'garbage-end':
+ # All records in the corrupt file will be found.
+ found = self.logfile_to_record(self.corruptpos + 1)
+ else:
+ found = self.logfile_to_record(self.corruptpos)
+ return min(found, self.nrecords)
+
+ def test_corrupt_log(self):
+ ''' Corrupt the log and restart with different kinds of recovery '''
+
+ # This test creates some data, then simulates a crash with corruption.
+ # Then does a restart with recovery, then starts again with salvage,
+ # and finally starts again with recovery (adding new records).
+
+ self.session.create(self.uri, self.create_params)
+ self.inserts([x for x in range(0, self.nrecords)])
+ newdir = "RESTART"
+ self.copy_for_crash_restart(self.home, newdir)
+ self.close_conn()
+ #self.show_logs(newdir, 'before corruption')
+ self.corrupt_log(newdir)
+ #self.show_logs(newdir, 'after corruption')
+ salvage_config = self.base_config + ',salvage=true'
+ errfile = 'list.err'
+ outfile = 'list.out'
+ expect_fail = self.expect_recovery_failure()
+
+ # In cases of corruption, we cannot always call wiredtiger_open
+ # directly, because there may be a panic, and abort() is called
+ # in diagnostic mode which terminates the Python interpreter.
+ #
+ # Running any wt command externally to Python allows
+ # us to observe the failure or success safely.
+ # Use -R to force recover=on, which is the default for
+ # wiredtiger_open, (wt utilities normally have recover=error)
+ self.runWt(['-h', newdir, '-C', self.base_config, '-R', 'list'],
+ errfilename=errfile, outfilename=outfile, failure=expect_fail,
+ closeconn=False)
+
+ if expect_fail:
+ self.check_file_contains_one_of(errfile,
+ ['/log file.*corrupted/', 'WT_ERROR: non-specific WiredTiger error'])
+ else:
+ self.check_empty_file(errfile)
+ if self.expect_warning_corruption():
+ self.check_file_contains(outfile, '/log file .* corrupted/')
+ self.check_file_contains(outfile, self.uri)
+
+ found_records = self.recovered_records()
+ expect = [x for x in range(0, found_records)]
+
+ # If we are salvaging, expect an informational message
+ if self.expect_salvage_messages():
+ errpat = '.*'
+ # Possible messages:
+ # salvage: log files x-y truncated at beginning
+ # salvage: log file x truncated at beginning
+ # salvage: log file x truncated
+ # salvage: log file x removed
+ outpat = 'salvage: log file'
+ else:
+ errpat = '^$'
+ outpat = '^$'
+ with self.expectedStdoutPattern(outpat):
+ with self.expectedStderrPattern(errpat):
+ self.conn = self.wiredtiger_open(newdir, salvage_config)
+ self.session = self.setUpSessionOpen(self.conn)
+ self.checks(expect)
+
+ # Insert a couple more and simulate another crash.
+ newdir2 = "RESTART2"
+ self.inserts([self.nrecords, self.nrecords + 1])
+ expect.extend([self.nrecords, self.nrecords + 1])
+ self.copy_for_crash_restart(newdir, newdir2)
+ self.checks(expect)
+ self.reopen_conn(newdir)
+ self.checks(expect)
+ self.reopen_conn(newdir2, self.conn_config)
+ self.checks(expect)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/wtscenario.py b/src/third_party/wiredtiger/test/suite/wtscenario.py
index 76824e428df..d953b7f627f 100644
--- a/src/third_party/wiredtiger/test/suite/wtscenario.py
+++ b/src/third_party/wiredtiger/test/suite/wtscenario.py
@@ -68,6 +68,9 @@ def make_scenarios(*args, **kwargs):
"""
The standard way to create scenarios for WT tests.
Scenarios can be combined by listing them all as arguments.
+ If some scenario combinations should not be included,
+ a include= argument function may be listed, which given a name and
+ dictionary argument, returns True if the scenario should be included.
A final prune= and/or prunelong= argument may be given that
forces the list of entries in the scenario to be pruned.
The result is a (combined) scenario that has been checked
@@ -76,14 +79,19 @@ def make_scenarios(*args, **kwargs):
scenes = multiply_scenarios('.', *args)
pruneval = None
prunelong = None
+ includefunc = None
for key in kwargs:
if key == 'prune':
pruneval = kwargs[key]
elif key == 'prunelong':
prunelong = kwargs[key]
+ elif key == 'include':
+ includefunc = kwargs[key]
else:
raise AssertionError(
'make_scenarios: unexpected named arg: ' + key)
+ if includefunc:
+ scenes = [(name, d) for (name, d) in scenes if includefunc(name, d)]
if pruneval != None or prunelong != None:
pruneval = pruneval if pruneval != None else -1
prunelong = prunelong if prunelong != None else -1
diff --git a/src/third_party/wiredtiger/test/suite/wttest.py b/src/third_party/wiredtiger/test/suite/wttest.py
index 8c0915bc9c7..1a2fddce031 100644
--- a/src/third_party/wiredtiger/test/suite/wttest.py
+++ b/src/third_party/wiredtiger/test/suite/wttest.py
@@ -101,7 +101,7 @@ class CapturedFd(object):
' unexpected ' + self.desc +
', contains:\n"' + contents + '"')
testcase.fail('unexpected ' + self.desc + ', contains: "' +
- shortenWithEllipsis(contents,100) + '"')
+ contents + '"')
self.expectpos = filesize
def checkAdditional(self, testcase, expect):
@@ -180,7 +180,7 @@ class WiredTigerTestCase(unittest.TestCase):
@staticmethod
def globalSetup(preserveFiles = False, useTimestamp = False,
- gdbSub = False, verbose = 1, builddir = None, dirarg = None,
+ gdbSub = False, lldbSub = False, verbose = 1, builddir = None, dirarg = None,
longtest = False):
WiredTigerTestCase._preserveFiles = preserveFiles
d = 'WT_TEST' if dirarg == None else dirarg
@@ -194,6 +194,7 @@ class WiredTigerTestCase(unittest.TestCase):
WiredTigerTestCase._origcwd = os.getcwd()
WiredTigerTestCase._resultfile = open(os.path.join(d, 'results.txt'), "w", 0) # unbuffered
WiredTigerTestCase._gdbSubprocess = gdbSub
+ WiredTigerTestCase._lldbSubprocess = lldbSub
WiredTigerTestCase._longtest = longtest
WiredTigerTestCase._verbose = verbose
WiredTigerTestCase._dupout = os.dup(sys.stdout.fileno())
@@ -339,20 +340,25 @@ class WiredTigerTestCase(unittest.TestCase):
self.conn.close(config)
self.conn = None
- def open_conn(self, directory="."):
+ def open_conn(self, directory=".", config=None):
"""
Open the connection if already closed.
"""
if self.conn == None:
+ if config != None:
+ self._old_config = self.conn_config
+ self.conn_config = config
self.conn = self.setUpConnectionOpen(directory)
+ if config != None:
+ self.conn_config = self._old_config
self.session = self.setUpSessionOpen(self.conn)
- def reopen_conn(self, directory="."):
+ def reopen_conn(self, directory=".", config=None):
"""
Reopen the connection.
"""
self.close_conn()
- self.open_conn(directory)
+ self.open_conn(directory, config)
def setUp(self):
if not hasattr(self.__class__, 'wt_ntests'):
diff --git a/src/third_party/wiredtiger/test/thread/rw.c b/src/third_party/wiredtiger/test/thread/rw.c
index dc46e9b595d..9acd5d4095e 100644
--- a/src/third_party/wiredtiger/test/thread/rw.c
+++ b/src/third_party/wiredtiger/test/thread/rw.c
@@ -115,7 +115,7 @@ rw_start(u_int readers, u_int writers)
/* Wait for the threads. */
for (i = 0; i < readers + writers; ++i)
- testutil_check(__wt_thread_join(NULL, tids[i]));
+ testutil_check(__wt_thread_join(NULL, &tids[i]));
(void)gettimeofday(&stop, NULL);
seconds = (stop.tv_sec - start.tv_sec) +
diff --git a/src/third_party/wiredtiger/test/thread/t.c b/src/third_party/wiredtiger/test/thread/t.c
index f509b6d73e2..7108ea22005 100644
--- a/src/third_party/wiredtiger/test/thread/t.c
+++ b/src/third_party/wiredtiger/test/thread/t.c
@@ -136,7 +136,6 @@ main(int argc, char *argv[])
}
argc -= __wt_optind;
- argv += __wt_optind;
if (argc != 0)
return (usage());
diff --git a/src/third_party/wiredtiger/test/utility/misc.c b/src/third_party/wiredtiger/test/utility/misc.c
index d038254c7ea..2cc7ad8a94b 100644
--- a/src/third_party/wiredtiger/test/utility/misc.c
+++ b/src/third_party/wiredtiger/test/utility/misc.c
@@ -27,6 +27,10 @@
*/
#include "test_util.h"
+#ifndef _WIN32
+#include <sys/wait.h>
+#endif
+
void (*custom_die)(void) = NULL;
const char *progname = "program name not set";
@@ -211,6 +215,38 @@ testutil_is_flag_set(const char *flag)
return (enable_long_tests);
}
+#ifndef _WIN32
+/*
+ * testutil_sleep_wait --
+ * Wait for a process up to a number of seconds.
+ */
+void
+testutil_sleep_wait(uint32_t seconds, pid_t pid)
+{
+ pid_t got;
+ int status;
+
+ while (seconds > 0) {
+ if ((got = waitpid(pid, &status, WNOHANG|WUNTRACED)) == pid) {
+ if (WIFEXITED(status))
+ testutil_die(EINVAL,
+ "Child process %" PRIu64 " exited early"
+ " with status %d", (uint64_t)pid,
+ WEXITSTATUS(status));
+ if (WIFSIGNALED(status))
+ testutil_die(EINVAL,
+ "Child process %" PRIu64 " terminated "
+ " with signal %d", (uint64_t)pid,
+ WTERMSIG(status));
+ } else if (got == -1)
+ testutil_die(errno, "waitpid");
+
+ --seconds;
+ sleep(1);
+ }
+}
+#endif
+
/*
* dcalloc --
* Call calloc, dying on failure.
diff --git a/src/third_party/wiredtiger/test/utility/test_util.h b/src/third_party/wiredtiger/test/utility/test_util.h
index 7387615c84b..0f5398c4189 100644
--- a/src/third_party/wiredtiger/test/utility/test_util.h
+++ b/src/third_party/wiredtiger/test/utility/test_util.h
@@ -248,6 +248,9 @@ bool testutil_is_flag_set(const char *);
void testutil_make_work_dir(const char *);
int testutil_parse_opts(int, char * const *, TEST_OPTS *);
void testutil_progress(TEST_OPTS *, const char *);
+#ifndef _WIN32
+void testutil_sleep_wait(uint32_t, pid_t);
+#endif
void testutil_work_dir_from_path(char *, size_t, const char *);
WT_THREAD_RET thread_append(void *);
diff --git a/src/third_party/wiredtiger/test/windows/windows_shim.c b/src/third_party/wiredtiger/test/windows/windows_shim.c
index b40a9e01a42..b562fa97594 100644
--- a/src/third_party/wiredtiger/test/windows/windows_shim.c
+++ b/src/third_party/wiredtiger/test/windows/windows_shim.c
@@ -70,7 +70,7 @@ gettimeofday(struct timeval* tp, void* tzp)
int
pthread_rwlock_destroy(pthread_rwlock_t *lock)
{
- lock = lock;
+ lock = lock; /* Unused variable. */
return (0);
}
@@ -78,7 +78,7 @@ int
pthread_rwlock_init(pthread_rwlock_t *rwlock,
const pthread_rwlockattr_t *ignored)
{
- ignored = ignored;
+ ignored = ignored; /* Unused variable. */
InitializeSRWLock(&rwlock->rwlock);
rwlock->exclusive_locked = 0;
@@ -86,13 +86,6 @@ pthread_rwlock_init(pthread_rwlock_t *rwlock,
}
int
-pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
-{
- AcquireSRWLockShared(&rwlock->rwlock);
- return (0);
-}
-
-int
pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
{
if (rwlock->exclusive_locked != 0) {
@@ -105,6 +98,19 @@ pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
}
int
+pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock)
+{
+ return (TryAcquireSRWLockShared(&rwlock->rwlock) ? 0 : EBUSY);
+}
+
+int
+pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
+{
+ AcquireSRWLockShared(&rwlock->rwlock);
+ return (0);
+}
+
+int
pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock)
{
if (TryAcquireSRWLockExclusive(&rwlock->rwlock)) {
diff --git a/src/third_party/wiredtiger/test/windows/windows_shim.h b/src/third_party/wiredtiger/test/windows/windows_shim.h
index 2f46be9daee..f04b09569b0 100644
--- a/src/third_party/wiredtiger/test/windows/windows_shim.h
+++ b/src/third_party/wiredtiger/test/windows/windows_shim.h
@@ -76,7 +76,7 @@ typedef CONDITION_VARIABLE pthread_cond_t;
struct rwlock_wrapper {
SRWLOCK rwlock;
- int exclusive_locked;
+ DWORD exclusive_locked;
};
struct rwlock_wrapper;
@@ -93,6 +93,7 @@ int pthread_join(pthread_t, void **);
int pthread_rwlock_destroy(pthread_rwlock_t *);
int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *);
int pthread_rwlock_rdlock(pthread_rwlock_t *);
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *);
int pthread_rwlock_trywrlock(pthread_rwlock_t *);
int pthread_rwlock_unlock(pthread_rwlock_t *);
int pthread_rwlock_wrlock(pthread_rwlock_t *);