author    Luke Chen <luke.chen@mongodb.com>  2019-10-17 02:45:40 +0000
committer evergreen <evergreen@mongodb.com>  2019-10-17 02:45:40 +0000
commit    684e936937f04afbd9cd0d5a619c7604fc99f0f8
tree      6a3026a3153bf00547f723ff8db563f6b86be669
parent    9a9b82e95a88c5ce25c958690c2d3365bc62bacc
Import wiredtiger: f3d8dbf9cc285e1f508562ab5d28e029d66b2101 from branch mongodb-4.2
ref: 18d13b8f6b..f3d8dbf9cc for: 4.2.2

WT-4702 Switch to ubuntu1804-test Evergreen distro
WT-4715 Workloads will stall if old transaction or timestamp pinned by thread co-opted for eviction
WT-4961 Checkpoints with cache overflow must keep history for reads
WT-4972 Add new RHEL Evergreen build variant
WT-4975 Migrate Jenkins “wiredtiger-pull-request-asan” job to Evergreen
WT-5083 Add mips64el support
WT-5093 Enable million-collection-test working with Evergreen distro rhel80-build
WT-5094 Enable Windows compile task working with Evergreen distro windows-64-vs2017-test
WT-5118 Remove incorrect assert that there are no hazard references to discarded WT_REF
WT-5122 Shut down the sweep server before doing the final checkpoint
WT-5123 Fill multi-paragraph comments
WT-5128 Add script to run wtperf with XRay profiling
WT-5130 Enable Big-endian (s390x/zSeries) working with Evergreen distro ubuntu1804-zseries-build
WT-5134 Fix leaf-page only search and search-near operations
WT-5135 Change lookaside file inserts to use cursor.insert
WT-5140 Fix where a cursor returning random items can use an uninitialized buffer
WT-5143 Fix typo in error message
WT-5148 Switch little-endian test to use ubuntu1808 distro
WT-5149 Clear the debugging field value so it cannot get stale
WT-5150 LAS sweep is not removing the entries that are no longer required
WT-5156 Lookaside table cursors not operating at isolation level WT_ISO_READ_UNCOMMITTED
WT-5160 Stop requiring a checkpoint before calling rollback_to_stable
WT-5161 Remove deprecated git.apply_patch
WT-5163 Fix ignored failure return in WT_CURSOR.next when random retrieval configured
WT-5164 Fix inconsistent underscore/dash output
WT-5166 Allow configuring configure flags in evergreen builds
-rw-r--r--  src/third_party/wiredtiger/SConstruct | 1
-rw-r--r--  src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh | 122
-rw-r--r--  src/third_party/wiredtiger/bench/wtperf/wtperf.c | 9
-rw-r--r--  src/third_party/wiredtiger/build_posix/configure.ac.in | 6
-rw-r--r--  src/third_party/wiredtiger/dist/api_data.py | 25
-rw-r--r--  src/third_party/wiredtiger/dist/filelist | 1
-rw-r--r--  src/third_party/wiredtiger/dist/s_comment.py | 16
-rw-r--r--  src/third_party/wiredtiger/dist/s_define.list | 1
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_all.c | 12
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_call_center.c | 26
-rw-r--r--  src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java | 2
-rw-r--r--  src/third_party/wiredtiger/ext/collators/revint/revint_collator.c | 22
-rw-r--r--  src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c | 22
-rw-r--r--  src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c | 5
-rw-r--r--  src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c | 5
-rw-r--r--  src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c | 5
-rw-r--r--  src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c | 5
-rw-r--r--  src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c | 9
-rw-r--r--  src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c | 18
-rw-r--r--  src/third_party/wiredtiger/import.data | 2
-rw-r--r--  src/third_party/wiredtiger/src/async/async_api.c | 23
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ckpt.c | 97
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ckpt_scan.c | 16
-rw-r--r--  src/third_party/wiredtiger/src/block/block_compact.c | 12
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ext.c | 74
-rw-r--r--  src/third_party/wiredtiger/src/block/block_open.c | 17
-rw-r--r--  src/third_party/wiredtiger/src/block/block_vrfy.c | 31
-rw-r--r--  src/third_party/wiredtiger/src/block/block_write.c | 73
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_compact.c | 53
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curnext.c | 63
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curprev.c | 61
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_cursor.c | 295
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_debug.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_delete.c | 166
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_discard.c | 8
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_handle.c | 68
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_import.c | 54
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_io.c | 22
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ovfl.c | 39
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_page.c | 60
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_random.c | 65
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_read.c | 77
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_rebalance.c | 21
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ret.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_slvg.c | 154
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_split.c | 342
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_stat.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_sync.c | 81
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy.c | 18
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c | 21
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_walk.c | 68
-rw-r--r--  src/third_party/wiredtiger/src/btree/col_modify.c | 23
-rw-r--r--  src/third_party/wiredtiger/src/btree/col_srch.c | 29
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_key.c | 80
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_modify.c | 53
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_srch.c | 81
-rw-r--r--  src/third_party/wiredtiger/src/cache/cache_las.c | 192
-rw-r--r--  src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c | 13
-rw-r--r--  src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h | 6
-rw-r--r--  src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h | 7
-rw-r--r--  src/third_party/wiredtiger/src/config/config_api.c | 6
-rw-r--r--  src/third_party/wiredtiger/src/config/config_collapse.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/config/config_def.c | 97
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_api.c | 150
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_cache_pool.c | 16
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_ckpt.c | 4
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_dhandle.c | 62
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_log.c | 83
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_open.c | 8
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_reconfig.c | 29
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_stat.c | 87
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_sweep.c | 18
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_backup.c | 34
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_ds.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_index.c | 25
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_join.c | 12
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_metadata.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_std.c | 16
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_table.c | 26
-rw-r--r--  src/third_party/wiredtiger/src/evict/evict_file.c | 31
-rw-r--r--  src/third_party/wiredtiger/src/evict/evict_lru.c | 218
-rw-r--r--  src/third_party/wiredtiger/src/evict/evict_page.c | 119
-rw-r--r--  src/third_party/wiredtiger/src/include/api.h | 4
-rw-r--r--  src/third_party/wiredtiger/src/include/async.h | 15
-rw-r--r--  src/third_party/wiredtiger/src/include/btmem.h | 197
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.h | 27
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.i | 117
-rw-r--r--  src/third_party/wiredtiger/src/include/btree_cmp.i | 8
-rw-r--r--  src/third_party/wiredtiger/src/include/cache.i | 7
-rw-r--r--  src/third_party/wiredtiger/src/include/cell.h | 20
-rw-r--r--  src/third_party/wiredtiger/src/include/column.i | 26
-rw-r--r--  src/third_party/wiredtiger/src/include/connection.h | 16
-rw-r--r--  src/third_party/wiredtiger/src/include/cursor.h | 32
-rw-r--r--  src/third_party/wiredtiger/src/include/cursor.i | 28
-rw-r--r--  src/third_party/wiredtiger/src/include/extern.h | 26
-rw-r--r--  src/third_party/wiredtiger/src/include/extern_posix.h | 3
-rw-r--r--  src/third_party/wiredtiger/src/include/gcc.h | 15
-rw-r--r--  src/third_party/wiredtiger/src/include/hardware.h | 13
-rw-r--r--  src/third_party/wiredtiger/src/include/log.h | 31
-rw-r--r--  src/third_party/wiredtiger/src/include/misc.h | 16
-rw-r--r--  src/third_party/wiredtiger/src/include/misc.i | 53
-rw-r--r--  src/third_party/wiredtiger/src/include/mutex.h | 9
-rw-r--r--  src/third_party/wiredtiger/src/include/mutex.i | 5
-rw-r--r--  src/third_party/wiredtiger/src/include/packing.i | 6
-rw-r--r--  src/third_party/wiredtiger/src/include/reconcile.h | 89
-rw-r--r--  src/third_party/wiredtiger/src/include/reconcile.i | 56
-rw-r--r--  src/third_party/wiredtiger/src/include/serial.i | 41
-rw-r--r--  src/third_party/wiredtiger/src/include/session.h | 15
-rw-r--r--  src/third_party/wiredtiger/src/include/stat.h | 119
-rw-r--r--  src/third_party/wiredtiger/src/include/time.i | 190
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.h | 22
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.i | 132
-rw-r--r--  src/third_party/wiredtiger/src/include/verify_build.h | 7
-rw-r--r--  src/third_party/wiredtiger/src/include/wiredtiger.in | 49
-rw-r--r--  src/third_party/wiredtiger/src/include/wt_internal.h | 1
-rw-r--r--  src/third_party/wiredtiger/src/log/log.c | 92
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_cursor.c | 99
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_manager.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_merge.c | 27
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_meta.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_stat.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_tree.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_work_unit.c | 27
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_apply.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_ckpt.c | 41
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_table.c | 9
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_turtle.c | 21
-rw-r--r--  src/third_party/wiredtiger/src/os_common/os_alloc.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/os_common/os_fhandle.c | 6
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_dlopen.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_fallocate.c | 17
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_fs.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c | 34
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_time.c | 1
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_fs.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_mtx_cond.c | 17
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_setvbuf.c | 11
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_child.c | 166
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_col.c | 55
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_row.c | 90
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_track.c | 38
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 154
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_write.c | 344
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_create.c | 30
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_drop.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_open.c | 12
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_project.c | 8
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_rename.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_stat.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/session/session_api.c | 85
-rw-r--r--  src/third_party/wiredtiger/src/session/session_compact.c | 8
-rw-r--r--  src/third_party/wiredtiger/src/session/session_dhandle.c | 62
-rw-r--r--  src/third_party/wiredtiger/src/support/err.c | 30
-rw-r--r--  src/third_party/wiredtiger/src/support/generation.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/support/hazard.c | 39
-rw-r--r--  src/third_party/wiredtiger/src/support/huffman.c | 12
-rw-r--r--  src/third_party/wiredtiger/src/support/modify.c | 41
-rw-r--r--  src/third_party/wiredtiger/src/support/mtx_rw.c | 12
-rw-r--r--  src/third_party/wiredtiger/src/support/rand.c | 17
-rw-r--r--  src/third_party/wiredtiger/src/support/scratch.c | 9
-rw-r--r--  src/third_party/wiredtiger/src/support/time.c | 109
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn.c | 183
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_ckpt.c | 240
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_log.c | 9
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_recover.c | 26
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 82
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_timestamp.c | 32
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_loadtext.c | 3
-rw-r--r--  src/third_party/wiredtiger/test/csuite/schema_abort/main.c | 11
-rw-r--r--  src/third_party/wiredtiger/test/csuite/scope/main.c | 19
-rw-r--r--  src/third_party/wiredtiger/test/csuite/truncated_log/main.c | 10
-rw-r--r--  src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c | 16
-rw-r--r--  src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c | 55
-rw-r--r--  src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c | 12
-rw-r--r--  src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c | 15
-rw-r--r--  src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c | 30
-rwxr-xr-x  src/third_party/wiredtiger/test/evergreen.yml | 316
-rw-r--r--  src/third_party/wiredtiger/test/format/Makefile.am | 2
-rw-r--r--  src/third_party/wiredtiger/test/format/bulk.c | 14
-rw-r--r--  src/third_party/wiredtiger/test/format/compact.c | 9
-rw-r--r--  src/third_party/wiredtiger/test/format/config.c | 48
-rw-r--r--  src/third_party/wiredtiger/test/format/config.h | 3
-rw-r--r--  src/third_party/wiredtiger/test/format/format.h | 12
-rw-r--r--  src/third_party/wiredtiger/test/format/ops.c | 91
-rw-r--r--  src/third_party/wiredtiger/test/format/random.c | 104
-rw-r--r--  src/third_party/wiredtiger/test/format/salvage.c | 9
-rw-r--r--  src/third_party/wiredtiger/test/format/t.c | 15
-rw-r--r--  src/third_party/wiredtiger/test/format/util.c | 29
-rw-r--r--  src/third_party/wiredtiger/test/format/wts.c | 3
-rw-r--r--  src/third_party/wiredtiger/test/readonly/readonly.c | 9
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_debug_mode05.py | 2
-rwxr-xr-x  src/third_party/wiredtiger/test/suite/test_las01.py | 9
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_timestamp04.py | 7
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_timestamp06.py | 26
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_timestamp11.py | 1
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_timestamp16.py | 14
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_txn20.py | 5
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_txn21.py | 49
-rw-r--r--  src/third_party/wiredtiger/test/utility/test_util.h | 4
200 files changed, 4087 insertions, 4657 deletions
diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct
index f895a53c426..f4d0d31dab0 100644
--- a/src/third_party/wiredtiger/SConstruct
+++ b/src/third_party/wiredtiger/SConstruct
@@ -496,6 +496,7 @@ t = env.Program("t_format",
"test/format/lrt.c",
"test/format/ops.c",
"test/format/rebalance.c",
+ "test/format/random.c",
"test/format/salvage.c",
"test/format/snap.c",
"test/format/t.c",
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh
new file mode 100644
index 00000000000..398c6a9bcf5
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+# wtperf_xray.sh - run wtperf regression tests with xray profiling and generate
+# profiling information.
+#
+# This script assumes it is running in the directory with the wtperf executable.
+#
+# Usage
+# wtperf_xray.sh <wtperf-config-file> [-h output-directory] [wtperf other args]
+#
+# This script checks the first argument after the wtperf configuration to see
+# whether a home directory is being specified with the -h flag. If so, this
+# script will write its output files to that directory. Otherwise it will
+# default to WT_TEST (wtperf's default).
+#
+# Environment variables
+# XRAY_BINARY --
+# The binary to use to inspect the xray log. (default: llvm-xray)
+# FLAME_GRAPH_PATH --
+# The path to your copy of Brendan Gregg's FlameGraph script. (optional)
+#
+# When this is complete you can find information in the following files:
+# wtperf_account.txt --
+# The top 10 functions where the workload is spending the most time along
+# with a count, min, max and some percentiles for each one.
+# wtperf_stacks.txt --
+# The top 10 stack traces where the workload is spending the most time.
+# This calculation is done separately per thread.
+# wtperf_graph.svg --
+# A function call graph showing what functions call each other. The edges
+# are labelled and coloured proportionally to represent the ratio of time
+# spent in each function call.
+# wtperf_flame.svg --
+# A graph visualising stack traces and the time spent within each stack
+# frame. If FLAME_GRAPH_PATH is not specified, this graph won't be
+# generated.
+#
+if ! test -f ./wtperf; then
+ echo "$0: could not find wtperf in current working directory"
+ exit 1
+fi
+
+if test "$#" -lt "1"; then
+ echo "$0: must specify wtperf configuration to run"
+ exit 1
+fi
+
+# By default, wtperf uses WT_TEST as its home directory.
+xray_home="WT_TEST"
+if test "$2" = "-h"; then
+ if ! test -z "$3"; then
+ xray_home="$3"
+ fi
+fi
+echo "$0: using $xray_home as home directory"
+
+# Check symbols to ensure we've compiled with XRay.
+objdump_out=$(objdump -h -j xray_instr_map ./wtperf)
+if test -z "$objdump_out"; then
+ echo "$0: wtperf not compiled with xray, add '-fxray-instrument' to your CFLAGS"
+ exit 1
+fi
+
+if ! test -d "$xray_home"; then
+ echo "$0: creating directory $xray_home"
+ mkdir "$xray_home"
+fi
+
+xray_account_path="${xray_home}/wtperf_account.txt"
+xray_stack_path="${xray_home}/wtperf_stack.txt"
+xray_graph_path="${xray_home}/wtperf_graph.svg"
+xray_flame_path="${xray_home}/wtperf_flame.svg"
+
+rm xray-log.wtperf.* \
+ "$xray_account_path" \
+ "$xray_stack_path" \
+ "$xray_graph_path" \
+ "$xray_flame_path"
+
+export XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic verbosity=1"
+./wtperf -O "$@"
+
+xray_log=$(ls xray-log.wtperf.*)
+num_logs=$(echo "$xray_log" | wc -w)
+if test "$num_logs" -ne "1"; then
+ echo "$0: detected more than one xray log"
+ exit 1
+fi
+
+if test -z "$XRAY_BINARY"; then
+ xray_bin="llvm-xray"
+ echo "$0: XRAY_BINARY is unset, defaulting to $xray_bin"
+else
+ xray_bin="$XRAY_BINARY"
+fi
+
+$xray_bin account "$xray_log" \
+ -top=10 -sort=sum -sortorder=dsc -instr_map ./wtperf > \
+ "$xray_account_path"
+
+# Use the -per-thread-stacks option to get the top 10 stacks for each thread.
+# We could use the -aggregate-threads flag here to get the top stacks for all threads (omitting duplicates).
+$xray_bin stack -per-thread-stacks "$xray_log" \
+ -instr_map ./wtperf > \
+ "$xray_stack_path"
+
+# Generate a DOT graph.
+$xray_bin graph "$xray_log" \
+ -m ./wtperf -color-edges=sum -edge-label=sum | \
+ unflatten -f -l10 | \
+ dot -Tsvg -o "$xray_graph_path"
+
+# This file can be inspected in the Google Chrome Trace Viewer.
+# It seems to take a long time to generate this so just disable it for now.
+# $xray_bin convert -symbolize -instr_map=./wtperf -output-format=trace_event $xray_log | gzip > wtperf_trace.txt.gz
+if test -z "$FLAME_GRAPH_PATH"; then
+ echo "$0: FLAME_GRAPH_PATH is unset, skipping flame graph generation"
+else
+ $xray_bin stack "$xray_log" \
+ -instr_map ./wtperf -stack-format=flame -aggregation-type=time -all-stacks | \
+ "$FLAME_GRAPH_PATH" > "$xray_flame_path"
+fi
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
index cf12df3f2fc..697d59c8dcd 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -226,9 +226,9 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
/*
* Either we have success and we track it, or failure and panic.
*
- * Reads and updates can fail with WT_NOTFOUND: we may be searching
- * in a random range, or an insert op might have updated the
- * last record in the table but not yet finished the actual insert.
+ * Reads and updates can fail with WT_NOTFOUND: we may be searching in a random range, or an
+ * insert op might have updated the last record in the table but not yet finished the actual
+ * insert.
*/
if (type == WT_AOP_COMPACT)
return (0);
@@ -654,8 +654,7 @@ worker(void *arg)
*/
measure_latency = opts->sample_interval != 0 && trk != NULL && trk->ops != 0 &&
(trk->ops % opts->sample_rate == 0);
- if (measure_latency)
- __wt_epoch(NULL, &start);
+ __wt_epoch(NULL, &start); /* [-Werror=maybe-uninitialized] */
cursor->set_key(cursor, key_buf);
switch (*op) {
diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in
index 7c829f86f80..c50d86678e6 100644
--- a/src/third_party/wiredtiger/build_posix/configure.ac.in
+++ b/src/third_party/wiredtiger/build_posix/configure.ac.in
@@ -56,7 +56,7 @@ AS_CASE([$host_cpu],
[wt_cv_powerpc="no"])
AM_CONDITIONAL([POWERPC_HOST], [test "$wt_cv_powerpc" = "yes"])
AS_CASE([$host_cpu],
- [amd*|i[[3456]]86*|pentium|x86*], [wt_cv_x86="yes"], [wt_cv_x86="no"])
+ [amd*|i[[3456]]86*|pentium|x86*|mips64el*], [wt_cv_x86="yes"], [wt_cv_x86="no"])
AM_CONDITIONAL([X86_HOST], [test "$wt_cv_x86" = "yes"])
AS_CASE([$host_cpu],
[s390x*], [wt_cv_zseries="yes"],
@@ -66,6 +66,10 @@ AS_CASE([$host_cpu],
[aarch64*], [wt_cv_arm64="yes"],
[wt_cv_arm64="no"])
AM_CONDITIONAL([ARM64_HOST], [test "$wt_cv_arm64" = "yes"])
+AS_CASE([$host_cpu],
+ [mips64el*], [wt_cv_mips64el="yes"],
+ [wt_cv_mips64el="no"])
+AM_CONDITIONAL([MIPS64EL_HOST], [test "$wt_cv_mips64el" = "yes"])
AS_CASE([$host_os], [*solaris*], [wt_cv_solaris="yes"], [wt_cv_solaris="no"])
# This is a workaround as part of WT-2459. Currently, clang (v3.7) does not
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 3bd75b7187c..6d9d4f1db3d 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -605,6 +605,13 @@ connection_runtime_config = [
Config('lsm_merge', 'true', r'''
merge LSM chunks where possible (deprecated)''',
type='boolean', undoc=True),
+ Config('operation_timeout_ms', '0', r'''
+ when non-zero, a requested limit on the number of elapsed real time milliseconds
+ application threads will take to complete database operations. Time is measured from the
+ start of each WiredTiger API call. There is no guarantee any operation will not take
+ longer than this amount of time. If WiredTiger notices the limit has been exceeded, an
+ operation may return a WT_ROLLBACK error. Default is to have no limit''',
+ min=1),
Config('operation_tracking', '', r'''
enable tracking of performance-critical functions. See
@ref operation_tracking for more information''',
@@ -1333,6 +1340,13 @@ methods = {
choices=['read-uncommitted', 'read-committed', 'snapshot']),
Config('name', '', r'''
name of the transaction for tracing and debugging'''),
+ Config('operation_timeout_ms', '0', r'''
+ when non-zero, a requested limit on the number of elapsed real time milliseconds taken
+ to complete database operations in this transaction. Time is measured from the start
+ of each WiredTiger API call. There is no guarantee any operation will not take longer
+ than this amount of time. If WiredTiger notices the limit has been exceeded, an operation
+ may return a WT_ROLLBACK error. Default is to have no limit''',
+ min=1),
Config('priority', 0, r'''
priority of the transaction for resolving conflicts.
Transactions with higher values are less likely to abort''',
@@ -1436,8 +1450,8 @@ methods = {
dropped while a hot backup is in progress or if open in
a cursor''', type='list'),
Config('force', 'false', r'''
- by default, checkpoints may be skipped if the underlying object
- has not been modified, this option forces the checkpoint''',
+ if false (the default), checkpoints may be skipped if the underlying object has not been
+ modified, if true, this option forces the checkpoint''',
type='boolean'),
Config('name', '', r'''
if set, specify a name for the checkpoint (note that checkpoints
@@ -1445,10 +1459,9 @@ methods = {
Config('target', '', r'''
if non-empty, checkpoint the list of objects''', type='list'),
Config('use_timestamp', 'true', r'''
- by default, create the checkpoint as of the last stable timestamp
- if timestamps are in use, or all current updates if there is no
- stable timestamp set. If false, this option generates a checkpoint
- with all updates including those later than the timestamp''',
+ if true (the default), create the checkpoint as of the last stable timestamp if timestamps
+ are in use, or all current updates if there is no stable timestamp set. If false, this
+ option generates a checkpoint with all updates including those later than the timestamp''',
type='boolean'),
]),
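
The new operation_timeout_ms option is accepted both by wiredtiger_open and by WT_SESSION::begin_transaction. A minimal sketch of how an application might use it (the home directory and timeout values are illustrative, and error_check() is the helper used in the WiredTiger C examples; this is not code from the change):

#include <wiredtiger.h>

static void
timeout_example(void)
{
    WT_CONNECTION *conn;
    WT_SESSION *session;

    /* Bound API calls made by application threads to roughly 2 seconds. */
    error_check(wiredtiger_open("WT_HOME", NULL, "create,operation_timeout_ms=2000", &conn));
    error_check(conn->open_session(conn, NULL, NULL, &session));

    /* Or bound a single transaction more tightly. */
    error_check(session->begin_transaction(session, "operation_timeout_ms=500"));
    /* Operations here may now fail with WT_ROLLBACK once the limit is noticed. */
    error_check(session->commit_transaction(session, NULL));
    error_check(conn->close(conn, NULL));
}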
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist
index 1d398a4aa88..9e7eb0b23ac 100644
--- a/src/third_party/wiredtiger/dist/filelist
+++ b/src/third_party/wiredtiger/dist/filelist
@@ -205,7 +205,6 @@ src/support/rand.c
src/support/scratch.c
src/support/stat.c
src/support/thread_group.c
-src/support/time.c
src/txn/txn.c
src/txn/txn_ckpt.c
src/txn/txn_ext.c
diff --git a/src/third_party/wiredtiger/dist/s_comment.py b/src/third_party/wiredtiger/dist/s_comment.py
index 556862f0fcc..f30de0e4794 100644
--- a/src/third_party/wiredtiger/dist/s_comment.py
+++ b/src/third_party/wiredtiger/dist/s_comment.py
@@ -63,14 +63,20 @@ for line in sys.stdin:
indent_ws = ' ' * indentation
sys.stdout.write('{}/*\n'.format(indent_ws))
current_line = indent_ws + ' *'
- for word in words:
+ for i in range(len(words)):
+ word = words[i]
if word == '--' and function_desc:
sys.stdout.write(current_line + ' ' + word + '\n')
current_line = indent_ws + ' *' + ' ' * 4
continue
if word == '\n':
- sys.stdout.write(current_line + '\n')
- sys.stdout.write(indent_ws + ' *' + '\n')
+ # If we already have partially built a line, write it out.
+ if current_line != indent_ws + ' *':
+ sys.stdout.write(current_line + '\n')
+ # If there are more words in this comment after this
+ # newline, add another line break.
+ if i < (len(words) - 1):
+ sys.stdout.write(indent_ws + ' *' + '\n')
current_line = indent_ws + ' *'
continue
if len(current_line) + len(word) >= line_length:
@@ -89,6 +95,10 @@ for line in sys.stdin:
function_desc = False
elif multiline:
comment += line
+ # We want to preserve newlines for block comments that have multiple paragraphs.
+ if sline == '*':
+ words.append('\n')
+ continue
# Function names begin with either a lowercase char or an underscore.
if (len(sline) >= 3 and sline.startswith('*') and sline[1] == ' ' and
(sline[2].islower() or sline[2] == '_') and sline.endswith('--')):
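
The effect of this change on an illustrative C block comment: a lone "*" line separates paragraphs, and the formatter now preserves that separator while refilling each paragraph, instead of merging the paragraphs or emitting stray blank lines:

/*
 * First paragraph, refilled to the target width as before.
 *
 * Second paragraph: the lone "*" line above survives the rewrap, so
 * multi-paragraph comments keep their structure.
 */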
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
index f5e3584343d..85a240550ea 100644
--- a/src/third_party/wiredtiger/dist/s_define.list
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -75,7 +75,6 @@ WT_TIMEDIFF_US
WT_TRACK_OP
WT_TRACK_OP_END
WT_TRACK_OP_INIT
-WT_TRACK_TIME
WT_TRET_ERROR_OK
WT_UPDATE_SIZE
WT_USE_OPENAT
diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c
index b9792fbc82b..53c2ac9d95d 100644
--- a/src/third_party/wiredtiger/examples/c/ex_all.c
+++ b/src/third_party/wiredtiger/examples/c/ex_all.c
@@ -928,6 +928,8 @@ transaction_ops(WT_SESSION *session_arg)
error_check(conn->set_timestamp(conn, "stable_timestamp=2a"));
/*! [set stable timestamp] */
+ /* WT_CONNECTION.rollback_to_stable requires a timestamped checkpoint. */
+ error_check(session->checkpoint(session, NULL));
/*! [rollback to stable] */
error_check(conn->rollback_to_stable(conn, NULL));
/*! [rollback to stable] */
@@ -1045,12 +1047,12 @@ connection_ops(WT_CONNECTION *conn)
/*! [Configure method configuration] */
/*
- * Applications opening a cursor for the data-source object "my_data"
- * have an additional configuration option "entries", which is an
- * integer type, defaults to 5, and must be an integer between 1 and 10.
+ * Applications opening a cursor for the data-source object "my_data" have an additional
+ * configuration option "entries", which is an integer type, defaults to 5, and must be an
+ * integer between 1 and 10.
*
- * The method being configured is specified using a concatenation of the
- * handle name, a period and the method name.
+ * The method being configured is specified using a concatenation of the handle name, a period
+ * and the method name.
*/
error_check(conn->configure_method(
conn, "WT_SESSION.open_cursor", "my_data:", "entries=5", "int", "min=1,max=10"));
diff --git a/src/third_party/wiredtiger/examples/c/ex_call_center.c b/src/third_party/wiredtiger/examples/c/ex_call_center.c
index 2c404046ee8..3a7430300c4 100644
--- a/src/third_party/wiredtiger/examples/c/ex_call_center.c
+++ b/src/third_party/wiredtiger/examples/c/ex_call_center.c
@@ -141,17 +141,16 @@ main(int argc, char *argv[])
error_check(cursor->close(cursor));
/*
- * First query: a call arrives. In SQL:
+ * First query: a call arrives. In SQL:
*
* SELECT id, name FROM Customers WHERE phone=?
*
- * Use the cust_phone index, lookup by phone number to fill the
- * customer record. The cursor will have a key format of "S" for a
- * string because the cust_phone index has a single column ("phone"),
- * which is of type "S".
+ * Use the cust_phone index, lookup by phone number to fill the customer record. The cursor will
+ * have a key format of "S" for a string because the cust_phone index has a single column
+ * ("phone"), which is of type "S".
*
- * Specify the columns we want: the customer ID and the name. This
- * means the cursor's value format will be "rS".
+ * Specify the columns we want: the customer ID and the name. This means the cursor's value
+ * format will be "rS".
*/
error_check(
session->open_cursor(session, "index:customers:phone(id,name)", NULL, NULL, &cursor));
@@ -162,17 +161,16 @@ main(int argc, char *argv[])
error_check(cursor->close(cursor));
/*
- * Next query: get the recent order history. In SQL:
+ * Next query: get the recent order history. In SQL:
*
* SELECT * FROM Calls WHERE cust_id=? ORDER BY call_date DESC LIMIT 3
*
- * Use the call_cust_date index to find the matching calls. Since it is
- * is in increasing order by date for a given customer, we want to start
- * with the last record for the customer and work backwards.
+ * Use the call_cust_date index to find the matching calls. Since it is in increasing order by
+ * date for a given customer, we want to start with the last record for the customer and work
+ * backwards.
*
- * Specify a subset of columns to be returned. (Note that if these were
- * all covered by the index, the primary would not have to be accessed.)
- * Stop after getting 3 records.
+ * Specify a subset of columns to be returned. (Note that if these were all covered by the
+ * index, the primary would not have to be accessed.) Stop after getting 3 records.
*/
error_check(session->open_cursor(
session, "index:calls:cust_date(cust_id,call_type,notes)", NULL, NULL, &cursor));
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java
index 97e5c3d7d34..abcbc395170 100644
--- a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java
@@ -238,7 +238,7 @@ public class ex_call_center {
* SELECT * FROM Calls WHERE cust_id=? ORDER BY call_date DESC LIMIT 3
*
* Use the call_cust_date index to find the matching calls. Since it is
- * is in increasing order by date for a given customer, we want to start
+ * in increasing order by date for a given customer, we want to start
* with the last record for the customer and work backwards.
*
* Specify a subset of columns to be returned. (Note that if these were
diff --git a/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c b/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
index d3dbaa4f4c7..52cc8c958f5 100644
--- a/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
+++ b/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
@@ -61,22 +61,18 @@ revint_compare(
wt_api = revint_collator->wt_api;
/*
- * All indices using this collator have an integer key, and the
- * primary key is also an integer. A collator is usually passed the
- * concatenation of index key and primary key (when available),
+ * All indices using this collator have an integer key, and the primary key is also an integer.
+ * A collator is usually passed the concatenation of index key and primary key (when available),
* hence we initially unpack using "ii".
*
- * A collator may also be called with an item that includes a index
- * key and no primary key. Among items having the same index key,
- * an item with no primary key should sort before an item with a
- * primary key. The reason is that if the application calls
- * WT_CURSOR::search on a index key for which there are more than
- * one value, the search key will not yet have a primary key. We
- * want to position the cursor at the 'first' matching index key so
- * that repeated calls to WT_CURSOR::next will see them all.
+ * A collator may also be called with an item that includes an index key and no primary key.
+ * Among items having the same index key, an item with no primary key should sort before an item
+ * with a primary key. The reason is that if the application calls WT_CURSOR::search on an index
+ * key for which there are more than one value, the search key will not yet have a primary key.
+ * We want to position the cursor at the 'first' matching index key so that repeated calls to
+ * WT_CURSOR::next will see them all.
*
- * To keep this code simple, we do not reverse the ordering
- * when comparing primary keys.
+ * To keep this code simple, we do not reverse the ordering when comparing primary keys.
*/
if ((ret = wt_api->unpack_start(wt_api, session, "ii", k1->data, k1->size, &pstream)) != 0)
return (ret);
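
The ordering the comment describes, reduced to a standalone sketch (the helper and its flattened arguments are hypothetical; the extension itself unpacks packed items through wt_api):

#include <stdbool.h>

static int
revint_order(int ikey1, bool has_pk1, int pk1, int ikey2, bool has_pk2, int pk2)
{
    if (ikey1 != ikey2)                  /* reversed order on the index key */
        return (ikey1 > ikey2 ? -1 : 1);
    if (has_pk1 != has_pk2)              /* no primary key sorts first */
        return (has_pk1 ? 1 : -1);
    if (!has_pk1)
        return (0);
    return (pk1 < pk2 ? -1 : (pk1 > pk2 ? 1 : 0)); /* forward on primary keys */
}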
diff --git a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
index 2204f4942fa..700bb84216d 100644
--- a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
@@ -190,20 +190,18 @@ lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, siz
}
/*
- * Decompress, starting after the prefix bytes. Use safe decompression:
- * we rely on decompression to detect corruption.
+ * Decompress, starting after the prefix bytes. Use safe decompression: we rely on decompression
+ * to detect corruption.
*
- * Two code paths, one with and one without a bounce buffer. When doing
- * raw compression, we compress to a target size irrespective of row
- * boundaries, and return to our caller a "useful" compression length
- * based on the last complete row that was compressed. Our caller stores
- * that length, not the length of bytes actually compressed by LZ4. In
- * other words, our caller doesn't know how many bytes will result from
- * decompression, likely hasn't provided us a large enough buffer, and
- * we have to allocate a scratch buffer.
+ * Two code paths, one with and one without a bounce buffer. When doing raw compression, we
+ * compress to a target size irrespective of row boundaries, and return to our caller a "useful"
+ * compression length based on the last complete row that was compressed. Our caller stores that
+ * length, not the length of bytes actually compressed by LZ4. In other words, our caller
+ * doesn't know how many bytes will result from decompression, likely hasn't provided us a large
+ * enough buffer, and we have to allocate a scratch buffer.
*
- * Even though raw compression has been removed from WiredTiger, this
- * code remains for backward compatibility with existing objects.
+ * Even though raw compression has been removed from WiredTiger, this code remains for backward
+ * compatibility with existing objects.
*/
if (dst_len < prefix.uncompressed_len) {
if ((dst_tmp = wt_api->scr_alloc(wt_api, session, (size_t)prefix.uncompressed_len)) == NULL)
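
The bounce-buffer path the comment describes, as a sketch against the public LZ4 API (the helper is hypothetical; the real code allocates its scratch space with wt_api->scr_alloc):

#include <string.h>
#include <lz4.h>

static int
decompress_bounce(const char *src, int src_len, char *dst, int dst_len,
  int uncompressed_len, char *scratch /* >= uncompressed_len bytes */)
{
    /* If the caller's buffer is big enough, decompress directly into it. */
    if (dst_len >= uncompressed_len)
        return (LZ4_decompress_safe(src, dst, src_len, dst_len) < 0 ? -1 : 0);

    /* Otherwise decompress into the bounce buffer, then copy out. */
    if (LZ4_decompress_safe(src, scratch, src_len, uncompressed_len) < 0)
        return (-1);
    memcpy(dst, scratch, (size_t)dst_len);
    return (0);
}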
diff --git a/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c b/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
index 8bf60e5f25f..1f0a15997b9 100644
--- a/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
@@ -157,9 +157,8 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
return (errno);
/*
- * Allocate a local compressor structure, with a WT_COMPRESSOR structure
- * as the first field, allowing us to treat references to either type of
- * structure as a reference to the other type.
+ * Allocate a local compressor structure, with a WT_COMPRESSOR structure as the first field,
+ * allowing us to treat references to either type of structure as a reference to the other type.
*
* Heap memory (not static), because it can support multiple databases.
*/
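
The first-field layout that comment relies on is the standard C containment idiom; a minimal illustration (LOCAL_COMPRESSOR is an invented name, not the extension's type):

typedef struct {
    WT_COMPRESSOR iface; /* must be first: shares its address with the struct */
    int private_state;   /* extension-private fields follow */
} LOCAL_COMPRESSOR;

static WT_COMPRESSOR *
as_interface(LOCAL_COMPRESSOR *lc)
{
    return (&lc->iface); /* same address as lc */
}

static LOCAL_COMPRESSOR *
as_local(WT_COMPRESSOR *c)
{
    return ((LOCAL_COMPRESSOR *)c); /* valid because iface is the first field */
}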
diff --git a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
index ce63e89334e..d1febdd63dd 100644
--- a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
@@ -128,9 +128,8 @@ snappy_compression(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src,
*compression_failed = 0;
/*
- * On decompression, snappy requires an exact compressed byte
- * count (the current value of snaplen). WiredTiger does not
- * preserve that value, so save snaplen at the beginning of
+ * On decompression, snappy requires an exact compressed byte count (the current value of
+ * snaplen). WiredTiger does not preserve that value, so save snaplen at the beginning of
* the destination buffer.
*
* Store the value in little-endian format.
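
Writing the count byte-by-byte keeps the stored value little-endian regardless of host byte order; a sketch with invented helper names:

#include <stdint.h>

static void
put_le64(uint8_t *dst, uint64_t v)
{
    int i;

    for (i = 0; i < 8; ++i)
        dst[i] = (uint8_t)(v >> (8 * i)); /* least-significant byte first */
}

static uint64_t
get_le64(const uint8_t *src)
{
    uint64_t v;
    int i;

    for (v = 0, i = 0; i < 8; ++i)
        v |= (uint64_t)src[i] << (8 * i);
    return (v);
}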
diff --git a/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c b/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
index 23087fa87f4..a6abf95e558 100644
--- a/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
@@ -116,9 +116,8 @@ zstd_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size
*compression_failed = 0;
/*
- * On decompression, Zstd requires an exact compressed byte
- * count (the current value of zstd_ret). WiredTiger does not
- * preserve that value, so save zstd_ret at the beginning of
+ * On decompression, Zstd requires an exact compressed byte count (the current value of
+ * zstd_ret). WiredTiger does not preserve that value, so save zstd_ret at the beginning of
* the destination buffer.
*
* Store the value in little-endian format.
diff --git a/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c b/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
index 7192381befe..be12a6b19ea 100644
--- a/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
+++ b/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
@@ -168,9 +168,8 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
return (errno);
/*
- * Allocate a local encryptor structure, with a WT_ENCRYPTOR structure
- * as the first field, allowing us to treat references to either type of
- * structure as a reference to the other type.
+ * Allocate a local encryptor structure, with a WT_ENCRYPTOR structure as the first field,
+ * allowing us to treat references to either type of structure as a reference to the other type.
*
* Heap memory (not static), because it can support multiple databases.
*/
diff --git a/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c b/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
index 8fc355c9d6c..df252beefbd 100644
--- a/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
+++ b/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
@@ -145,8 +145,8 @@ do_rotate(char *buf, size_t len, int rotn)
/*
* Now rotate.
*
- * Avoid ctype functions because they behave in unexpected ways,
- * particularly when the locale is not "C".
+ * Avoid ctype functions because they behave in unexpected ways, particularly when the locale is
+ * not "C".
*/
for (i = 0; i < len; i++) {
if ('a' <= buf[i] && buf[i] <= 'z')
@@ -439,9 +439,8 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
return (errno);
/*
- * Allocate a local encryptor structure, with a WT_ENCRYPTOR structure
- * as the first field, allowing us to treat references to either type of
- * structure as a reference to the other type.
+ * Allocate a local encryptor structure, with a WT_ENCRYPTOR structure as the first field,
+ * allowing us to treat references to either type of structure as a reference to the other type.
*
* Heap memory (not static), because it can support multiple databases.
*/
diff --git a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
index a715a1056d9..9f9f510c8cf 100644
--- a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
+++ b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
@@ -47,17 +47,15 @@
#define FAIL_FS_ENV_READ_ALLOW "WT_FAIL_FS_READ_ALLOW"
/*
- * A "fail file system", that is, a file system extension that fails when we
- * want it to. This is only used in test frameworks, this fact allows us to
- * simplify some error paths. This code is not portable to Windows, as it has
- * direct knowledge of file descriptors, environment variables and stack
- * traces.
+ * A "fail file system", that is, a file system extension that fails when we want it to. This is
+ * only used in test frameworks, this fact allows us to simplify some error paths. This code is not
+ * portable to Windows, as it has direct knowledge of file descriptors, environment variables and
+ * stack traces.
*
- * When the filesystem extension is configured, parameters can set how many
- * reads or writes can be allowed before failure. If this is not fine-grained
- * enough, an 'environment' configuration parameter can be specified. If that
- * is used, then on every file system read or write, environment variables are
- * checked that control when reading or writing should fail.
+ * When the filesystem extension is configured, parameters can set how many reads or writes can be
+ * allowed before failure. If this is not fine-grained enough, an 'environment' configuration
+ * parameter can be specified. If that is used, then on every file system read or write, environment
+ * variables are checked that control when reading or writing should fail.
*/
typedef struct {
WT_FILE_SYSTEM iface;
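
A sketch of the environment-driven control the comment describes; the budget semantics here (allow N reads, then fail) are assumed for illustration, and the real rules live in fail_fs.c:

#include <errno.h>
#include <limits.h>
#include <stdlib.h>

static int
env_read_check(void)
{
    static int allow = -1; /* assumed: number of reads to allow before failing */
    const char *v;

    if (allow < 0) {
        v = getenv("WT_FAIL_FS_READ_ALLOW");
        allow = v == NULL ? INT_MAX : atoi(v);
    }
    if (allow == 0)
        return (EIO);
    --allow;
    return (0);
}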
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 8131208bf28..5cdb158e4b3 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "18d13b8f6bc8d345952f16a7f3c63608e405fd77",
+ "commit": "f3d8dbf9cc285e1f508562ab5d28e029d66b2101",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-4.2"
diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c
index 81b23b238e7..ec12d8d02ab 100644
--- a/src/third_party/wiredtiger/src/async/async_api.c
+++ b/src/third_party/wiredtiger/src/async/async_api.c
@@ -176,8 +176,8 @@ __async_config(WT_SESSION_IMPL *session, WT_CONNECTION_IMPL *conn, const char **
*runp = cval.val != 0;
/*
- * Even if async is turned off, we want to parse and store the default
- * values so that reconfigure can just enable them.
+ * Even if async is turned off, we want to parse and store the default values so that
+ * reconfigure can just enable them.
*
* Bound the minimum maximum operations at 10.
*/
@@ -308,18 +308,15 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__async_config(session, &tmp_conn, cfg, &run));
/*
- * There are some restrictions on the live reconfiguration of async.
- * Unlike other subsystems where we simply destroy anything existing
- * and restart with the new configuration, async is not so easy.
- * If the user is just changing the number of workers, we want to
- * allow the existing op handles and other information to remain in
- * existence. So we must handle various combinations of changes
- * individually.
+ * There are some restrictions on the live reconfiguration of async. Unlike other subsystems
+ * where we simply destroy anything existing and restart with the new configuration, async is
+ * not so easy. If the user is just changing the number of workers, we want to allow the
+ * existing op handles and other information to remain in existence. So we must handle various
+ * combinations of changes individually.
*
- * One restriction is that if async is currently on, the user cannot
- * change the number of async op handles available. The user can try
- * but we do nothing with it. However we must allow the ops_max config
- * string so that a user can completely start async via reconfigure.
+ * One restriction is that if async is currently on, the user cannot change the number of async
+ * op handles available. The user can try but we do nothing with it. However we must allow the
+ * ops_max config string so that a user can completely start async via reconfigure.
*/
/*
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
index 5e2f261a424..158fc919820 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -272,16 +272,13 @@ __ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
WT_BLOCK_CKPT *ci;
/*
- * Allocate a checkpoint structure, crack the cookie and read the
- * checkpoint's extent lists.
+ * Allocate a checkpoint structure, crack the cookie and read the checkpoint's extent lists.
*
- * Ignore the avail list: checkpoint avail lists are only useful if we
- * are rolling forward from the particular checkpoint and they represent
- * our best understanding of what blocks can be allocated. If we are
- * not operating on the live checkpoint, subsequent checkpoints might
- * have allocated those blocks, and the avail list is useless. We don't
- * discard it, because it is useful as part of verification, but we
- * don't re-write it either.
+ * Ignore the avail list: checkpoint avail lists are only useful if we are rolling forward from
+ * the particular checkpoint and they represent our best understanding of what blocks can be
+ * allocated. If we are not operating on the live checkpoint, subsequent checkpoints might have
+ * allocated those blocks, and the avail list is useless. We don't discard it, because it is
+ * useful as part of verification, but we don't re-write it either.
*/
WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
@@ -366,30 +363,24 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
#endif
/*
- * Checkpoints are a two-step process: first, write a new checkpoint to
- * disk (including all the new extent lists for modified checkpoints
- * and the live system). As part of this, create a list of file blocks
- * newly available for reallocation, based on checkpoints being deleted.
- * We then return the locations of the new checkpoint information to our
- * caller. Our caller has to write that information into some kind of
- * stable storage, and once that's done, we can actually allocate from
- * that list of newly available file blocks. (We can't allocate from
- * that list immediately because the allocation might happen before our
- * caller saves the new checkpoint information, and if we crashed before
- * the new checkpoint location was saved, we'd have overwritten blocks
- * still referenced by checkpoints in the system.) In summary, there is
- * a second step: after our caller saves the checkpoint information, we
- * are called to add the newly available blocks into the live system's
- * available list.
+ * Checkpoints are a two-step process: first, write a new checkpoint to disk (including all the
+ * new extent lists for modified checkpoints and the live system). As part of this, create a
+ * list of file blocks newly available for reallocation, based on checkpoints being deleted. We
+ * then return the locations of the new checkpoint information to our caller. Our caller has to
+ * write that information into some kind of stable storage, and once that's done, we can
+ * actually allocate from that list of newly available file blocks. (We can't allocate from that
+ * list immediately because the allocation might happen before our caller saves the new
+ * checkpoint information, and if we crashed before the new checkpoint location was saved, we'd
+ * have overwritten blocks still referenced by checkpoints in the system.) In summary, there is
+ * a second step: after our caller saves the checkpoint information, we are called to add the
+ * newly available blocks into the live system's available list.
*
- * This function is the first step, the second step is in the resolve
- * function.
+ * This function is the first step, the second step is in the resolve function.
*
- * If we're called to checkpoint the same file twice (without the second
- * resolution step), or re-entered for any reason, it's an error in our
- * caller, and our choices are all bad: leak blocks or potentially crash
- * with our caller not yet having saved previous checkpoint information
- * to stable storage.
+ * If we're called to checkpoint the same file twice (without the second resolution step), or
+ * re-entered for any reason, it's an error in our caller, and our choices are all bad: leak
+ * blocks or potentially crash with our caller not yet having saved previous checkpoint
+ * information to stable storage.
*/
__wt_spin_lock(session, &block->live_lock);
switch (block->ckpt_state) {
@@ -412,18 +403,16 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
WT_RET(ret);
/*
- * Extents newly available as a result of deleting previous checkpoints
- * are added to a list of extents. The list should be empty, but as
- * described above, there is no "free the checkpoint information" call
- * into the block manager; if there was an error in an upper level that
- * resulted in some previous checkpoint never being resolved, the list
- * may not be empty. We should have caught that with the "checkpoint
- * in progress" test, but it doesn't cost us anything to be cautious.
+ * Extents newly available as a result of deleting previous checkpoints are added to a list of
+ * extents. The list should be empty, but as described above, there is no "free the checkpoint
+ * information" call into the block manager; if there was an error in an upper level that
+ * resulted in some previous checkpoint never being resolved, the list may not be empty. We
+ * should have caught that with the "checkpoint in progress" test, but it doesn't cost us
+ * anything to be cautious.
*
- * We free the checkpoint's allocation and discard extent lists as part
- * of the resolution step, not because they're needed at that time, but
- * because it's potentially a lot of work, and waiting allows the btree
- * layer to continue eviction sooner. As for the checkpoint-available
+ * We free the checkpoint's allocation and discard extent lists as part of the resolution step,
+ * not because they're needed at that time, but because it's potentially a lot of work, and
+ * waiting allows the btree layer to continue eviction sooner. As for the checkpoint-available
* list, make sure they get cleaned out.
*/
__wt_block_extlist_free(session, &ci->ckpt_avail);
@@ -566,11 +555,11 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
continue;
/*
- * We have to write the "to" checkpoint's extent lists out in
- * new blocks, and update its cookie.
+ * We have to write the "to" checkpoint's extent lists out in new blocks, and update its
+ * cookie.
*
- * Free the blocks used to hold the "to" checkpoint's extent
- * lists; don't include the avail list, it's not changing.
+ * Free the blocks used to hold the "to" checkpoint's extent lists; don't include the avail
+ * list, it's not changing.
*/
WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
@@ -717,16 +706,14 @@ __ckpt_update(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, WT_C
}
/*
- * We only write an avail list for the live system, other checkpoint's
- * avail lists are static and never change.
+ * We only write an avail list for the live system, other checkpoint's avail lists are static
+ * and never change.
*
- * Write the avail list last so it reflects changes due to allocating
- * blocks for the alloc and discard lists. Second, when we write the
- * live system's avail list, it's two lists: the current avail list
- * plus the list of blocks to be made available when the new checkpoint
- * completes. We can't merge that second list into the real list yet,
- * it's not truly available until the new checkpoint locations have been
- * saved to the metadata.
+ * Write the avail list last so it reflects changes due to allocating blocks for the alloc and
+ * discard lists. Second, when we write the live system's avail list, it's two lists: the
+ * current avail list plus the list of blocks to be made available when the new checkpoint
+ * completes. We can't merge that second list into the real list yet, it's not truly available
+ * until the new checkpoint locations have been saved to the metadata.
*/
if (is_live) {
block->final_ckpt = ckpt;
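
The two-step contract in that comment, sketched from the caller's side (all three callee names are hypothetical stand-ins for the block manager and metadata calls; WT_RET is the real error-return macro):

static int
checkpoint_protocol(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_ITEM cookie;

    /* Step one: write the checkpoint; newly freed blocks are held back. */
    WT_RET(checkpoint_write(session, block, &cookie));

    /* The caller saves the new checkpoint location to stable storage. */
    WT_RET(metadata_save(session, &cookie));

    /* Step two: only now may the freed blocks be reallocated. */
    return (checkpoint_resolve(session, block));
}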
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
index b7fda0d73b2..9af0221a81f 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
@@ -239,16 +239,14 @@ __wt_block_checkpoint_last(WT_SESSION_IMPL *session, WT_BLOCK *block, char **met
WT_RET(__wt_buf_init(session, checkpoint, WT_BLOCK_CHECKPOINT_BUFFER));
/*
- * Initialize a pair of structures that track the best and current
- * checkpoints found so far. This is a little trickier than normal
- * because we don't want to start saving a checkpoint only to find
- * out it's not one we can use. I doubt that can happen and it
- * suggests corruption, but half-a-checkpoint isn't a good place to
- * be. Only swap to a new "best" checkpoint if we read the whole
- * thing successfully.
+ * Initialize a pair of structures that track the best and current checkpoints found so far.
+ * This is a little trickier than normal because we don't want to start saving a checkpoint only
+ * to find out it's not one we can use. I doubt that can happen and it suggests corruption, but
+ * half-a-checkpoint isn't a good place to be. Only swap to a new "best" checkpoint if we read
+ * the whole thing successfully.
*
- * Don't re-order these lines: it's done this way so the WT_ITEMs
- * are always initialized and error handling works.
+ * Don't re-order these lines: it's done this way so the WT_ITEMs are always initialized and
+ * error handling works.
*/
memset((best = &_best), 0, sizeof(_best));
memset((current = &_current), 0, sizeof(_current));
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
index 6fe4d879e23..ea20bb80ef9 100644
--- a/src/third_party/wiredtiger/src/block/block_compact.c
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -93,14 +93,12 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
/*
* Skip files where we can't recover at least 1MB.
*
- * If at least 20% of the total file is available and in the first 80%
- * of the file, we'll try compaction on the last 20% of the file; else,
- * if at least 10% of the total file is available and in the first 90%
- * of the file, we'll try compaction on the last 10% of the file.
+ * If at least 20% of the total file is available and in the first 80% of the file, we'll try
+ * compaction on the last 20% of the file; else, if at least 10% of the total file is available
+ * and in the first 90% of the file, we'll try compaction on the last 10% of the file.
*
- * We could push this further, but there's diminishing returns, a mostly
- * empty file can be processed quickly, so more aggressive compaction is
- * less useful.
+     * We could push this further, but there are diminishing returns: a mostly empty file can be
+     * processed quickly, so more aggressive compaction is less useful.
*/
if (avail_eighty > WT_MEGABYTE && avail_eighty >= ((block->size / 10) * 2)) {
*skipp = false;
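
Spelled out, the two thresholds look something like this (a standalone sketch with hypothetical names; avail_eighty and avail_ninety are the free bytes in the first 80% and 90% of the file):

    #include <stdbool.h>
    #include <stdint.h>

    #define MEGABYTE (1024 * 1024)

    static bool
    compact_worthwhile(int64_t file_size, int64_t avail_eighty, int64_t avail_ninety)
    {
        /* At least 20% free in the first 80% of the file: compact the last 20%. */
        if (avail_eighty > MEGABYTE && avail_eighty >= (file_size / 10) * 2)
            return (true);
        /* Else at least 10% free in the first 90% of the file: compact the last 10%. */
        return (avail_ninety > MEGABYTE && avail_ninety >= file_size / 10);
    }
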
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
index ac8ef950868..71aeea0714f 100644
--- a/src/third_party/wiredtiger/src/block/block_ext.c
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -63,14 +63,14 @@ __block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, bool skip_off)
int i;
/*
- * Start at the highest skip level, then go as far as possible at each
- * level before stepping down to the next.
+ * Start at the highest skip level, then go as far as possible at each level before stepping
+ * down to the next.
*
* Return a stack for an exact match or the next-largest item.
*
- * The WT_EXT structure contains two skiplists, the primary one and the
- * per-size bucket one: if the skip_off flag is set, offset the skiplist
- * array by the depth specified in this particular structure.
+ * The WT_EXT structure contains two skiplists, the primary one and the per-size bucket one: if
+ * the skip_off flag is set, offset the skiplist array by the depth specified in this particular
+ * structure.
*/
for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
if (*extp != NULL && (*extp)->off < off)
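
The descent the comment describes is the classic skiplist search; a generic sketch (simplified node type, not WT_EXT, and without the two-skiplist offset trick):

    #include <stddef.h>

    #define SKIP_MAXDEPTH 10

    struct node {
        int key;
        struct node *next[SKIP_MAXDEPTH];
    };

    static void
    skip_srch(struct node **head, int key, struct node ***stack)
    {
        struct node **np;
        int i;

        /*
         * Start at the highest level; at each level walk right while the
         * next node sorts before the key, recording the insert position on
         * the way down. np points either into the head array or into some
         * node's next[] array; decrementing it steps down one level within
         * that same array.
         */
        for (i = SKIP_MAXDEPTH - 1, np = &head[i]; i >= 0;)
            if (*np != NULL && (*np)->key < key)
                np = &(*np)->next[i];
            else
                stack[i--] = np--;
    }
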
@@ -113,8 +113,8 @@ __block_size_srch(WT_SIZE **head, wt_off_t size, WT_SIZE ***stack)
int i;
/*
- * Start at the highest skip level, then go as far as possible at each
- * level before stepping down to the next.
+ * Start at the highest skip level, then go as far as possible at each level before stepping
+ * down to the next.
*
* Return a stack for an exact match or the next-largest item.
*/
@@ -451,8 +451,8 @@ static inline int
__block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
{
/*
- * Callers of this function are expected to have already acquired any
- * locks required to extend the file.
+ * Callers of this function are expected to have already acquired any locks required to extend
+ * the file.
*
* We should never be allocating from an empty file.
*/
@@ -502,14 +502,12 @@ __wt_block_alloc(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_o
(intmax_t)size, block->allocsize);
/*
- * Allocation is either first-fit (lowest offset), or best-fit (best
- * size). If it's first-fit, walk the offset list linearly until we
- * find an entry that will work.
+ * Allocation is either first-fit (lowest offset), or best-fit (best size). If it's first-fit,
+ * walk the offset list linearly until we find an entry that will work.
*
- * If it's best-fit by size, search the by-size skiplist for the size
- * and take the first entry on the by-size offset list. This means we
- * prefer best-fit over lower offset, but within a size we'll prefer an
- * offset appearing earlier in the file.
+ * If it's best-fit by size, search the by-size skiplist for the size and take the first entry
+ * on the by-size offset list. This means we prefer best-fit over lower offset, but within a
+ * size we'll prefer an offset appearing earlier in the file.
*
* If we don't have anything big enough, extend the file.
*/
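
For the first-fit case the walk is a plain linear scan of the offset-ordered list; a sketch with a hypothetical extent type (best-fit instead probes a by-size skiplist and takes the lowest offset recorded for that size):

    #include <stddef.h>

    struct ext { long off, size; struct ext *next; };

    static struct ext *
    alloc_first_fit(struct ext *head, long size)
    {
        struct ext *ext;

        /* Take the first (lowest-offset) extent large enough. */
        for (ext = head; ext != NULL; ext = ext->next)
            if (ext->size >= size)
                return (ext);
        return (NULL); /* Nothing big enough: the caller extends the file. */
    }
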
@@ -603,13 +601,12 @@ __wt_block_off_free(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset,
WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
/*
- * Callers of this function are expected to have already acquired any
- * locks required to manipulate the extent lists.
+ * Callers of this function are expected to have already acquired any locks required to
+ * manipulate the extent lists.
*
- * We can reuse this extent immediately if it was allocated during this
- * checkpoint, merge it into the avail list (which slows file growth in
- * workloads including repeated overflow record modification). If this
- * extent is referenced in a previous checkpoint, merge into the discard
+ * We can reuse this extent immediately if it was allocated during this checkpoint, merge it
+ * into the avail list (which slows file growth in workloads including repeated overflow record
+ * modification). If this extent is referenced in a previous checkpoint, merge into the discard
* list.
*/
if ((ret = __wt_block_off_remove_overlap(session, block, &block->live.alloc, offset, size)) ==
@@ -914,13 +911,12 @@ __block_append(
WT_ASSERT(session, el->track_size == 0);
/*
- * Identical to __block_merge, when we know the file is being extended,
- * that is, the information is either going to be used to extend the
- * last object on the list, or become a new object ending the list.
+ * Identical to __block_merge, when we know the file is being extended, that is, the information
+ * is either going to be used to extend the last object on the list, or become a new object
+ * ending the list.
*
- * The terminating element of the list is cached, check it; otherwise,
- * get a stack for the last object in the skiplist, check for a simple
- * extension, and otherwise append a new structure.
+ * The terminating element of the list is cached, check it; otherwise, get a stack for the last
+ * object in the skiplist, check for a simple extension, and otherwise append a new structure.
*/
if ((ext = el->last) != NULL && ext->off + ext->size == off)
ext->size += size;
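
The cached-last-element fast path is visible in the code above; as a standalone sketch (hypothetical type), the merge condition is simply that the cached extent ends where the new range begins:

    #include <stdbool.h>

    struct extent { long off, size; };

    static bool
    append_merge(struct extent *last, long off, long size)
    {
        /* Grow the terminating extent in place when the ranges are adjacent. */
        if (last != NULL && last->off + last->size == off) {
            last->size += size;
            return (true);
        }
        return (false); /* Caller appends a new list element instead. */
    }
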
@@ -955,15 +951,13 @@ __wt_block_insert_ext(
WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
/*
- * There are currently two copies of this function (this code is a one-
- * liner that calls the internal version of the function, which means
- * the compiler should compress out the function call). It's that way
- * because the interface is still fluid, I'm not convinced there won't
- * be a need for a functional split between the internal and external
- * versions in the future.
+     * There are currently two copies of this function (this code is a one-liner that calls the
+     * internal version of the function, which means the compiler should compress out the function
+     * call). It's that way because the interface is still fluid; I'm not convinced there won't be a
+     * need for a functional split between the internal and external versions in the future.
*
- * Callers of this function are expected to have already acquired any
- * locks required to manipulate the extent list.
+ * Callers of this function are expected to have already acquired any locks required to
+ * manipulate the extent list.
*/
return (__block_merge(session, block, el, off, size));
}
@@ -1180,12 +1174,10 @@ __wt_block_extlist_write(
}
/*
- * Get a scratch buffer, clear the page's header and data, initialize
- * the header.
+ * Get a scratch buffer, clear the page's header and data, initialize the header.
*
- * Allocate memory for the extent list entries plus two additional
- * entries: the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-
- * terminating WT_BLOCK_INVALID_OFFSET/0 pair.
+ * Allocate memory for the extent list entries plus two additional entries: the initial
+     * WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-terminating WT_BLOCK_INVALID_OFFSET/0 pair.
*/
size = ((size_t)entries + 2) * 2 * WT_INTPACK64_MAXSIZE;
WT_RET(__wt_block_write_size(session, block, &size));
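
The sizing arithmetic allows two packed 64-bit integers (an offset/size pair) per entry, plus the magic pair and the terminator pair; a sketch, assuming a worst case of 10 bytes per packed integer (the usual bound for a variable-length 64-bit encoding):

    #include <stddef.h>

    #define INTPACK64_MAXSIZE 10 /* Assumed worst case for one packed 64-bit integer. */

    static size_t
    extlist_buffer_size(size_t entries)
    {
        /* Each entry is an offset/size pair; +2 for the magic and terminator pairs. */
        return ((entries + 2) * 2 * INTPACK64_MAXSIZE);
    }
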
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
index b29fb939663..ba32bce74bc 100644
--- a/src/third_party/wiredtiger/src/block/block_open.c
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -36,10 +36,9 @@ __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename, uint32
/*
* Create the underlying file and open a handle.
*
- * Since WiredTiger schema operations are (currently) non-transactional,
- * it's possible to see a partially-created file left from a previous
- * create. Further, there's nothing to prevent users from creating files
- * in our space. Move any existing files out of the way and complain.
+ * Since WiredTiger schema operations are (currently) non-transactional, it's possible to see a
+ * partially-created file left from a previous create. Further, there's nothing to prevent users
+ * from creating files in our space. Move any existing files out of the way and complain.
*/
for (;;) {
if ((ret = __wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA,
@@ -162,9 +161,9 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[
/*
* Basic structure allocation, initialization.
*
- * Note: set the block's name-hash value before any work that can fail
- * because cleanup calls the block destroy code which uses that hash
- * value to remove the block from the underlying linked lists.
+ * Note: set the block's name-hash value before any work that can fail because cleanup calls the
+ * block destroy code which uses that hash value to remove the block from the underlying linked
+ * lists.
*/
WT_ERR(__wt_calloc_one(session, &block));
block->ref = 1;
@@ -215,8 +214,8 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[
/*
* Read the description information from the first block.
*
- * Salvage is a special case: if we're forcing the salvage, we don't
- * look at anything, including the description information.
+ * Salvage is a special case: if we're forcing the salvage, we don't look at anything, including
+ * the description information.
*/
if (!forced_salvage)
WT_ERR(__desc_read(session, allocsize, block));
diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c
index bc3109fe570..e0930aadec0 100644
--- a/src/third_party/wiredtiger/src/block/block_vrfy.c
+++ b/src/third_party/wiredtiger/src/block/block_vrfy.c
@@ -327,17 +327,14 @@ __wt_block_verify_addr(
WT_RET(__verify_filefrag_add(session, block, NULL, offset, size, false));
/*
- * It's tempting to try and flag a page as "verified" when we read it.
- * That doesn't work because we may visit a page multiple times when
- * verifying a single checkpoint (for example, when verifying the
- * physical image of a row-store leaf page with overflow keys, the
- * overflow keys are read when checking for key sort issues, and read
- * again when more general overflow item checking is done). This
- * function is called by the btree verification code, once per logical
- * visit in a checkpoint, so we can detect if a page is referenced
- * multiple times within a single checkpoint. This doesn't apply to
- * the per-file list, because it is expected for the same btree blocks
- * to appear in multiple checkpoints.
+     * It's tempting to try to flag a page as "verified" when we read it. That doesn't work because
+ * we may visit a page multiple times when verifying a single checkpoint (for example, when
+ * verifying the physical image of a row-store leaf page with overflow keys, the overflow keys
+ * are read when checking for key sort issues, and read again when more general overflow item
+ * checking is done). This function is called by the btree verification code, once per logical
+ * visit in a checkpoint, so we can detect if a page is referenced multiple times within a
+ * single checkpoint. This doesn't apply to the per-file list, because it is expected for the
+ * same btree blocks to appear in multiple checkpoints.
*
* Add the block to the per-checkpoint list.
*/
@@ -401,14 +398,12 @@ __verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
return (0);
/*
- * It's OK if we have not verified blocks at the end of the file: that
- * happens if the file is truncated during a checkpoint or load or was
- * extended after writing a checkpoint. We should never see unverified
- * blocks anywhere else, though.
+ * It's OK if we have not verified blocks at the end of the file: that happens if the file is
+ * truncated during a checkpoint or load or was extended after writing a checkpoint. We should
+ * never see unverified blocks anywhere else, though.
*
- * I'm deliberately testing for a last fragment of 0, it makes no sense
- * there would be no fragments verified, complain if the first fragment
- * in the file wasn't verified.
+     * I'm deliberately testing for a last fragment of 0: it makes no sense that there would be no
+     * fragments verified; complain if the first fragment in the file wasn't verified.
*/
for (last = block->frags - 1; last != 0; --last) {
if (__bit_test(block->fragfile, last))
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
index 31e000032d6..476d94af582 100644
--- a/src/third_party/wiredtiger/src/block/block_write.c
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -25,21 +25,19 @@ __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len)
/*
* Truncate requires serialization, we depend on our caller for that.
*
- * Truncation isn't a requirement of the block manager, it's only used
- * to conserve disk space. Regardless of the underlying file system
- * call's result, the in-memory understanding of the file size changes.
+ * Truncation isn't a requirement of the block manager, it's only used to conserve disk space.
+ * Regardless of the underlying file system call's result, the in-memory understanding of the
+ * file size changes.
*/
block->size = block->extend_size = len;
/*
- * Backups are done by copying files outside of WiredTiger, potentially
- * by system utilities. We cannot truncate the file during the backup
- * window, we might surprise an application.
+ * Backups are done by copying files outside of WiredTiger, potentially by system utilities. We
+ * cannot truncate the file during the backup window, we might surprise an application.
*
- * This affects files that aren't involved in the backup (for example,
- * doing incremental backups, which only copies log files, or targeted
- * backups, stops all block truncation unnecessarily). We may want a
- * more targeted solution at some point.
+     * This affects files that aren't involved in the backup (for example, an incremental backup,
+     * which only copies log files, or a targeted backup, unnecessarily stops all block
+     * truncation). We may want a more targeted solution at some point.
*/
if (!conn->hot_backup) {
WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, block->fh, len), NULL);
@@ -97,13 +95,11 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_FH *fh, wt_off_t
WT_FILE_HANDLE *handle;
/*
- * The locking in this function is messy: by definition, the live system
- * is locked when we're called, but that lock may have been acquired by
- * our caller or our caller's caller. If our caller's lock, release_lock
- * comes in set and this function can unlock it before returning (so it
- * isn't held while extending the file). If it is our caller's caller,
- * then release_lock comes in not set, indicating it cannot be released
- * here.
+     * The locking in this function is messy: by definition, the live system is locked when we're
+     * called, but that lock may have been acquired by our caller or our caller's caller. If it is
+     * our caller's lock, release_lock comes in set and this function can unlock it before
+     * returning (so it isn't held while extending the file). If it is our caller's caller's lock,
+     * release_lock comes in not set, indicating it cannot be released here.
*
* If we unlock here, we clear release_lock.
*/
@@ -135,13 +131,12 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_FH *fh, wt_off_t
return (0);
/*
- * Set the extend_size before releasing the lock, I don't want to read
- * and manipulate multiple values without holding a lock.
+ * Set the extend_size before releasing the lock, I don't want to read and manipulate multiple
+ * values without holding a lock.
*
- * There's a race between the calculation and doing the extension, but
- * it should err on the side of extend_size being smaller than the
- * actual file size, and that's OK, we simply may do another extension
- * sooner than otherwise.
+ * There's a race between the calculation and doing the extension, but it should err on the side
+ * of extend_size being smaller than the actual file size, and that's OK, we simply may do
+ * another extension sooner than otherwise.
*/
block->extend_size = block->size + block->extend_len * 2;
@@ -245,9 +240,9 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of
/*
* Align the size to an allocation unit.
*
- * The buffer must be big enough for us to zero to the next allocsize
- * boundary, this is one of the reasons the btree layer must find out
- * from the block-manager layer the maximum size of the eventual write.
+ * The buffer must be big enough for us to zero to the next allocsize boundary, this is one of
+ * the reasons the btree layer must find out from the block-manager layer the maximum size of
+ * the eventual write.
*/
align_size = WT_ALIGN(buf->size, block->allocsize);
if (align_size > buf->memsize) {
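
The rounding itself is the standard power-of-two alignment trick; a small self-contained example (assuming, as the block manager does, that the allocation size is a power of two):

    #include <assert.h>
    #include <stdint.h>

    /* Round n up to the next multiple of a power-of-two unit. */
    #define ALIGN_UP(n, unit) (((uintmax_t)(n) + ((unit)-1)) & ~((uintmax_t)(unit)-1))

    int
    main(void)
    {
        assert(ALIGN_UP(7000, 4096) == 8192); /* A 7,000-byte page in 4KB units. */
        assert(ALIGN_UP(8192, 4096) == 8192); /* Aligned sizes are unchanged. */
        return (0);
    }
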
@@ -301,21 +296,19 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of
blk->disk_size = WT_STORE_SIZE(align_size);
/*
- * Update the block's checksum: if our caller specifies, checksum the
- * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
- * bytes. The assumption is applications with good compression support
- * turn off checksums and assume corrupted blocks won't decompress
- * correctly. However, if compression failed to shrink the block, the
- * block wasn't compressed, in which case our caller will tell us to
- * checksum the data to detect corruption. If compression succeeded,
- * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
- * because they're not compressed, both to give salvage a quick test
- * of whether a block is useful and to give us a test so we don't lose
- * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
+ * Update the block's checksum: if our caller specifies, checksum the complete data, otherwise
+ * checksum the leading WT_BLOCK_COMPRESS_SKIP bytes. The assumption is applications with good
+ * compression support turn off checksums and assume corrupted blocks won't decompress
+ * correctly. However, if compression failed to shrink the block, the block wasn't compressed,
+ * in which case our caller will tell us to checksum the data to detect corruption. If
+ * compression succeeded, we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
+ * because they're not compressed, both to give salvage a quick test of whether a block is
+ * useful and to give us a test so we don't lose the first WT_BLOCK_COMPRESS_SKIP bytes without
+ * noticing.
*
- * Checksum a little-endian version of the header, and write everything
- * in little-endian format. The checksum is (potentially) returned in a
- * big-endian format, swap it into place in a separate step.
+ * Checksum a little-endian version of the header, and write everything in little-endian format.
+ * The checksum is (potentially) returned in a big-endian format, swap it into place in a
+ * separate step.
*/
blk->flags = 0;
if (data_checksum)
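
A condensed sketch of that checksum policy (hypothetical names; FNV-1a stands in for the real block checksum, and the size of the never-compressed leading bytes is an assumption): the full buffer is summed when the caller asks for data checksums, otherwise only the header region.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    #define COMPRESS_SKIP 64 /* Assumed size of the uncompressed leading bytes. */

    static uint32_t
    sum32(const void *data, size_t len) /* FNV-1a, a stand-in checksum. */
    {
        const uint8_t *p = data;
        uint32_t h = 2166136261u;

        while (len-- > 0)
            h = (h ^ *p++) * 16777619u;
        return (h);
    }

    static uint32_t
    block_checksum(const void *buf, size_t size, bool data_checksum)
    {
        return (sum32(buf, data_checksum ? size : COMPRESS_SKIP));
    }
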
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index e4d8a6abb10..d396f87ab49 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -37,13 +37,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
}
/*
- * If the page is a replacement, test the replacement addresses.
- * Ignore empty pages, they get merged into the parent.
+ * If the page is a replacement, test the replacement addresses. Ignore empty pages, they get
+ * merged into the parent.
*
- * Page-modify variable initialization done here because the page could
- * be modified while we're looking at it, so the page modified structure
- * may appear at any time (but cannot disappear). We've confirmed there
- * is a page modify structure, it's OK to look at it.
+ * Page-modify variable initialization done here because the page could be modified while we're
+ * looking at it, so the page modified structure may appear at any time (but cannot disappear).
+ * We've confirmed there is a page modify structure, it's OK to look at it.
*/
mod = page->modify;
if (mod->rec_result == WT_PM_REC_REPLACE)
@@ -77,18 +76,15 @@ __compact_rewrite_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
btree = S2BT(session);
/*
- * Reviewing in-memory pages requires looking at page reconciliation
- * results, because we care about where the page is stored now, not
- * where the page was stored when we first read it into the cache.
- * We need to ensure we don't race with page reconciliation as it's
- * writing the page modify information.
+ * Reviewing in-memory pages requires looking at page reconciliation results, because we care
+ * about where the page is stored now, not where the page was stored when we first read it into
+ * the cache. We need to ensure we don't race with page reconciliation as it's writing the page
+ * modify information.
*
- * There are two ways we call reconciliation: checkpoints and eviction.
- * Get the tree's flush lock which blocks threads writing pages for
- * checkpoints. If checkpoint is holding the lock, quit working this
- * file, we'll visit it again in our next pass. We don't have to worry
- * about eviction, we're holding a hazard pointer on the WT_REF, it's
- * not going anywhere.
+ * There are two ways we call reconciliation: checkpoints and eviction. Get the tree's flush
+ * lock which blocks threads writing pages for checkpoints. If checkpoint is holding the lock,
+ * quit working this file, we'll visit it again in our next pass. We don't have to worry about
+ * eviction, we're holding a hazard pointer on the WT_REF, it's not going anywhere.
*/
WT_RET(__wt_spin_trylock(session, &btree->flush_lock));
@@ -192,8 +188,8 @@ __wt_compact(WT_SESSION_IMPL *session)
/*
* Cheap checks that don't require locking.
*
- * Ignore the root: it may not have a replacement address, and
- * besides, if anything else gets written, so will it.
+ * Ignore the root: it may not have a replacement address, and besides, if anything else
+ * gets written, so will it.
*
* Ignore dirty pages, checkpoint writes them regardless.
*/
@@ -247,12 +243,12 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, boo
}
/*
- * If the page is in-memory, we want to look at it (it may have been
- * modified and written, and the current location is the interesting
- * one in terms of compaction, not the original location).
+ * If the page is in-memory, we want to look at it (it may have been modified and written, and
+ * the current location is the interesting one in terms of compaction, not the original
+ * location).
*
- * This test could be combined with the next one, but this is a cheap
- * test and the next one is expensive.
+ * This test could be combined with the next one, but this is a cheap test and the next one is
+ * expensive.
*/
if (ref->state != WT_REF_DISK)
return (0);
@@ -266,12 +262,11 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, boo
return (0);
/*
- * The page is on disk, so there had better be an address; assert that
- * fact, test at run-time to avoid the core dump.
+ * The page is on disk, so there had better be an address; assert that fact, test at run-time to
+ * avoid the core dump.
*
- * Internal pages must be read to walk the tree; ask the block-manager
- * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
- * won't help.
+ * Internal pages must be read to walk the tree; ask the block-manager if it's useful to rewrite
+ * leaf pages, don't do the I/O if a rewrite won't help.
*/
__wt_ref_info(session, ref, &addr, &addr_size, &type);
WT_ASSERT(session, addr != NULL);
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index d4ebd5322f4..b7f2e0db4b3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -41,20 +41,17 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
__cursor_set_recno(cbt, cbt->recno + 1);
/*
- * Fixed-width column store appends are inherently non-transactional.
- * Even a non-visible update by a concurrent or aborted transaction
- * changes the effective end of the data. The effect is subtle because
- * of the blurring between deleted and empty values, but ideally we
- * would skip all uncommitted changes at the end of the data. This
- * doesn't apply to variable-width column stores because the implicitly
- * created records written by reconciliation are deleted and so can be
- * never seen by a read.
+ * Fixed-width column store appends are inherently non-transactional. Even a non-visible update
+ * by a concurrent or aborted transaction changes the effective end of the data. The effect is
+ * subtle because of the blurring between deleted and empty values, but ideally we would skip
+ * all uncommitted changes at the end of the data. This doesn't apply to variable-width column
+ * stores because the implicitly created records written by reconciliation are deleted and so
+     * can never be seen by a read.
*
- * The problem is that we don't know at this point whether there may be
- * multiple uncommitted changes at the end of the data, and it would be
- * expensive to check every time we hit an aborted update. If an
- * insert is aborted, we simply return zero (empty), regardless of
- * whether we are at the end of the data.
+ * The problem is that we don't know at this point whether there may be multiple uncommitted
+ * changes at the end of the data, and it would be expensive to check every time we hit an
+ * aborted update. If an insert is aborted, we simply return zero (empty), regardless of whether
+ * we are at the end of the data.
*/
if (cbt->recno < WT_INSERT_RECNO(cbt->ins)) {
cbt->v = 0;
@@ -249,14 +246,12 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
continue;
/*
- * There can be huge gaps in the variable-length
- * column-store name space appearing as deleted
- * records. If more than one deleted record, do
- * the work of finding the next record to return
- * instead of looping through the records.
+ * There can be huge gaps in the variable-length column-store name space appearing
+ * as deleted records. If more than one deleted record, do the work of finding the
+ * next record to return instead of looping through the records.
*
- * First, find the smallest record in the update
- * list that's larger than the current record.
+ * First, find the smallest record in the update list that's larger than the current
+ * record.
*/
ins = __col_insert_search_gt(cbt->ins_head, cbt->recno);
@@ -313,13 +308,11 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
cbt->iter_retry = WT_CBT_RETRY_NOTSET;
/*
- * For row-store pages, we need a single item that tells us the part
- * of the page we're walking (otherwise switching from next to prev
- * and vice-versa is just too complicated), so we map the WT_ROW and
- * WT_INSERT_HEAD insert array slots into a single name space: slot 1
- * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
- * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
- * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+ * For row-store pages, we need a single item that tells us the part of the page we're walking
+ * (otherwise switching from next to prev and vice-versa is just too complicated), so we map the
+ * WT_ROW and WT_INSERT_HEAD insert array slots into a single name space: slot 1 is the
+ * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. This
+ * means WT_INSERT lists are odd-numbered slots, and WT_ROW array slots are even-numbered slots.
*
* Initialize for each new page.
*/
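
The mapping is easy to get wrong, so as a concrete sketch (hypothetical helpers): WT_ROW entries land on even slots and insert lists on odd slots, with slot 1 reserved for the smallest-key insert list.

    static int
    row_slot(int i)    /* WT_ROW[i]         -> slot 2, 4, 6, ... */
    {
        return (2 * (i + 1));
    }

    static int
    insert_slot(int i) /* WT_INSERT_HEAD[i] -> slot 3, 5, 7, ... */
    {
        return (2 * (i + 1) + 1);
    }
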
@@ -696,17 +689,13 @@ err:
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
#ifdef HAVE_DIAGNOSTIC
/*
- * Skip key order check, if prev is called after a next returned
- * a prepare conflict error, i.e cursor has changed direction
- * at a prepared update, hence current key returned could be
- * same as earlier returned key.
+         * Skip the key order check if prev is called after a next that returned a prepare conflict
+         * error, i.e., the cursor has changed direction at a prepared update, hence the current
+         * key returned could be the same as the earlier returned key.
*
- * eg: Initial data set : (1,2,3,...10)
- * insert key 11 in a prepare transaction.
- * loop on next will return 1,2,3...10 and subsequent call to
- * next will return a prepare conflict. Now if we call prev
- * key 10 will be returned which will be same as earlier
- * returned key.
+         * E.g.: initial data set (1,2,3,...10); insert key 11 in a prepared transaction. A loop on
+         * next will return 1,2,3...10 and a subsequent call to next will return a prepare conflict.
+         * Now if we call prev, key 10 will be returned, the same as the earlier returned key.
*/
if (!F_ISSET(cbt, WT_CBT_ITERATE_RETRY_PREV))
ret = __wt_cursor_key_order_check(session, cbt, true);
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 315f0f5b654..2a91695ebd2 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -11,13 +11,12 @@
/*
* Walking backwards through skip lists.
*
- * The skip list stack is an array of pointers set up by a search. It points
- * to the position a node should go in the skip list. In other words, the skip
- * list search stack always points *after* the search item (that is, into the
- * search item's next array).
+ * The skip list stack is an array of pointers set up by a search. It points to the position a node
+ * should go in the skip list. In other words, the skip list search stack always points *after* the
+ * search item (that is, into the search item's next array).
*
- * Helper macros to go from a stack pointer at level i, pointing into a next
- * array, back to the insert node containing that next array.
+ * Helper macros to go from a stack pointer at level i, pointing into a next array, back to the
+ * insert node containing that next array.
*/
#undef PREV_ITEM
#define PREV_ITEM(ins_head, insp, i) \
@@ -73,13 +72,11 @@ restart:
break;
/*
- * Find a starting point for the new search. That is either at the
- * non-moving node if we found a valid node, or the beginning of the
- * next list down that is not the current node.
+ * Find a starting point for the new search. That is either at the non-moving node if we found a
+ * valid node, or the beginning of the next list down that is not the current node.
*
- * Since it is the beginning of a list, and we know the current node is
- * has a skip depth at least this high, any node we find must sort
- * before the current node.
+     * Since it is the beginning of a list, and we know the current node has a skip depth at least
+     * this high, any node we find must sort before the current node.
*/
if (ins == NULL || ins == current)
for (; i >= 0; i--) {
@@ -390,14 +387,12 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
if (__wt_cell_rle(&unpack) == 1)
continue;
/*
- * There can be huge gaps in the variable-length
- * column-store name space appearing as deleted
- * records. If more than one deleted record, do
- * the work of finding the next record to return
- * instead of looping through the records.
+ * There can be huge gaps in the variable-length column-store name space appearing
+ * as deleted records. If more than one deleted record, do the work of finding the
+ * next record to return instead of looping through the records.
*
- * First, find the largest record in the update
- * list that's smaller than the current record.
+ * First, find the largest record in the update list that's smaller than the current
+ * record.
*/
ins = __col_insert_search_lt(cbt->ins_head, cbt->recno);
@@ -454,13 +449,11 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
cbt->iter_retry = WT_CBT_RETRY_NOTSET;
/*
- * For row-store pages, we need a single item that tells us the part
- * of the page we're walking (otherwise switching from next to prev
- * and vice-versa is just too complicated), so we map the WT_ROW and
- * WT_INSERT_HEAD insert array slots into a single name space: slot 1
- * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
- * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
- * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+ * For row-store pages, we need a single item that tells us the part of the page we're walking
+ * (otherwise switching from next to prev and vice-versa is just too complicated), so we map the
+ * WT_ROW and WT_INSERT_HEAD insert array slots into a single name space: slot 1 is the
+ * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. This
+ * means WT_INSERT lists are odd-numbered slots, and WT_ROW array slots are even-numbered slots.
*
* Initialize for each new page.
*/
@@ -659,17 +652,13 @@ err:
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
#ifdef HAVE_DIAGNOSTIC
/*
- * Skip key order check, if next is called after a prev returned
- * a prepare conflict error, i.e cursor has changed direction
- * at a prepared update, hence current key returned could be
- * same as earlier returned key.
+         * Skip the key order check if next is called after a prev that returned a prepare conflict
+         * error, i.e., the cursor has changed direction at a prepared update, hence the current
+         * key returned could be the same as the earlier returned key.
*
- * eg: Initial data set : (2,3,...10)
- * insert key 1 in a prepare transaction.
- * loop on prev will return 10,...3,2 and subsequent call to
- * prev will return a prepare conflict. Now if we call next
- * key 2 will be returned which will be same as earlier
- * returned key.
+         * E.g.: initial data set (2,3,...10); insert key 1 in a prepared transaction. A loop on
+         * prev will return 10,...3,2 and a subsequent call to prev will return a prepare conflict.
+         * Now if we call next, key 2 will be returned, the same as the earlier returned key.
*/
if (!F_ISSET(cbt, WT_CBT_ITERATE_RETRY_NEXT))
ret = __wt_cursor_key_order_check(session, cbt, false);
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index c45d9ed8b6b..8f64dd5562e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -54,7 +54,7 @@ __cursor_state_restore(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
* Return if we have a page pinned.
*/
static inline bool
-__cursor_page_pinned(WT_CURSOR_BTREE *cbt)
+__cursor_page_pinned(WT_CURSOR_BTREE *cbt, bool search_operation)
{
WT_CURSOR *cursor;
WT_SESSION_IMPL *session;
@@ -72,11 +72,25 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt)
}
/*
- * Check if the key references the page. When returning from search, the page is active and the
- * key is internal. After the application sets a key, the key is external, and the page is
- * useless.
+ * Check if the key references an item on a page. When returning from search, the page is pinned
+ * and the key is internal. After the application sets a key, the key becomes external. For the
+ * search and search-near operations, we assume locality and check any pinned page first on each
+ * new search operation. For operations other than search and search-near, check if we have an
+ * internal key. If the page is pinned and we're pointing into the page, we don't need to search
+ * at all, we can proceed with the operation. However, if the key has been set, that is, it's an
+ * external key, we're going to have to do a full search.
*/
- if (!F_ISSET(cursor, WT_CURSTD_KEY_INT))
+ if (!search_operation && !F_ISSET(cursor, WT_CURSTD_KEY_INT))
+ return (false);
+
+ /*
+ * XXX No fast-path searches at read-committed isolation. Underlying transactional functions
+ * called by the fast and slow path search code handle transaction IDs differently, resulting in
+ * different search results at read-committed isolation. This makes no difference for the update
+ * functions, but in the case of a search, we will see different results based on the cursor's
+ * initial location. See WT-5134 for the details.
+ */
+ if (search_operation && session->txn.isolation == WT_ISO_READ_COMMITTED)
return (false);
/*
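
Pulling the rewritten checks together, the fast-path decision now looks roughly like this (a simplified sketch with hypothetical types, not the actual function):

    #include <stdbool.h>

    enum iso { ISO_READ_UNCOMMITTED, ISO_READ_COMMITTED, ISO_SNAPSHOT };

    struct cur {
        bool page_pinned;  /* Hazard pointer held on a leaf page. */
        bool key_internal; /* Key still references the pinned page. */
    };

    static bool
    page_pinned_usable(const struct cur *c, enum iso isolation, bool search_op)
    {
        if (!c->page_pinned)
            return (false);
        /* Non-search operations need a key pointing into the page. */
        if (!search_op && !c->key_internal)
            return (false);
        /* No fast-path searches at read-committed isolation (WT-5134). */
        if (search_op && isolation == ISO_READ_COMMITTED)
            return (false);
        return (true);
    }
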
@@ -181,15 +195,13 @@ static inline bool
__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
{
/*
- * When there's no exact match, column-store search returns the key
- * nearest the searched-for key (continuing past keys smaller than the
- * searched-for key to return the next-largest key). Therefore, if the
- * returned comparison is -1, the searched-for key was larger than any
- * row on the page's standard information or column-store insert list.
+ * When there's no exact match, column-store search returns the key nearest the searched-for key
+ * (continuing past keys smaller than the searched-for key to return the next-largest key).
+ * Therefore, if the returned comparison is -1, the searched-for key was larger than any row on
+ * the page's standard information or column-store insert list.
*
- * If the returned comparison is NOT -1, there was a row equal to or
- * larger than the searched-for key, and we implicitly create missing
- * rows.
+ * If the returned comparison is NOT -1, there was a row equal to or larger than the
+ * searched-for key, and we implicitly create missing rows.
*/
return (btree->type == BTREE_COL_FIX && cbt->compare != -1);
}
@@ -541,7 +553,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
* pinned page doesn't find an exact match, search from the root.
*/
valid = false;
- if (__cursor_page_pinned(cbt)) {
+ if (__cursor_page_pinned(cbt, true)) {
__wt_txn_cursor_op(session);
WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) :
@@ -630,19 +642,17 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
__cursor_state_save(cursor, &state);
/*
- * If we have a row-store page pinned, search it; if we don't have a
- * page pinned, or the search of the pinned page doesn't find an exact
- * match, search from the root. Unlike WT_CURSOR.search, ignore pinned
- * pages in the case of column-store, search-near isn't an interesting
- * enough case for column-store to add the complexity needed to avoid
- * the tree search.
+ * If we have a row-store page pinned, search it; if we don't have a page pinned, or the search
+ * of the pinned page doesn't find an exact match, search from the root. Unlike
+ * WT_CURSOR.search, ignore pinned pages in the case of column-store, search-near isn't an
+ * interesting enough case for column-store to add the complexity needed to avoid the tree
+ * search.
*
- * Set the "insert" flag for the btree row-store search; we may intend
- * to position the cursor at the end of the tree, rather than match an
- * existing record.
+ * Set the "insert" flag for the btree row-store search; we may intend to position the cursor at
+ * the end of the tree, rather than match an existing record.
*/
valid = false;
- if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) {
+ if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt, true)) {
__wt_txn_cursor_op(session);
WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));
@@ -667,17 +677,15 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
/*
* If we find a valid key, return it.
*
- * Else, creating a record past the end of the tree in a fixed-length
- * column-store implicitly fills the gap with empty records. In this
- * case, we instantiate the empty record, it's an exact match.
+ * Else, creating a record past the end of the tree in a fixed-length column-store implicitly
+ * fills the gap with empty records. In this case, we instantiate the empty record, it's an
+ * exact match.
*
- * Else, move to the next key in the tree (bias for prefix searches).
- * Cursor next skips invalid rows, so we don't have to test for them
- * again.
+ * Else, move to the next key in the tree (bias for prefix searches). Cursor next skips invalid
+ * rows, so we don't have to test for them again.
*
- * Else, redo the search and move to the previous key in the tree.
- * Cursor previous skips invalid rows, so we don't have to test for
- * them again.
+ * Else, redo the search and move to the previous key in the tree. Cursor previous skips invalid
+ * rows, so we don't have to test for them again.
*
* If that fails, quit, there's no record to return.
*/
@@ -784,16 +792,14 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
__cursor_state_save(cursor, &state);
/*
- * If inserting with overwrite configured, and positioned to an on-page
- * key, the update doesn't require another search. Cursors configured
- * for append aren't included, regardless of whether or not they meet
- * all other criteria.
+ * If inserting with overwrite configured, and positioned to an on-page key, the update doesn't
+ * require another search. Cursors configured for append aren't included, regardless of whether
+ * or not they meet all other criteria.
*
- * Fixed-length column store can never use a positioned cursor to update
- * because the cursor may not be positioned to the correct record in the
- * case of implicit records in the append list.
+ * Fixed-length column store can never use a positioned cursor to update because the cursor may
+ * not be positioned to the correct record in the case of implicit records in the append list.
*/
- if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt) &&
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false) &&
F_ISSET(cursor, WT_CURSTD_OVERWRITE) && !append_key) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -997,29 +1003,24 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned)
__cursor_state_save(cursor, &state);
/*
- * If remove positioned to an on-page key, the remove doesn't require
- * another search. We don't care about the "overwrite" configuration
- * because regardless of the overwrite setting, any existing record is
- * removed, and the record must exist with a positioned cursor.
+ * If remove positioned to an on-page key, the remove doesn't require another search. We don't
+ * care about the "overwrite" configuration because regardless of the overwrite setting, any
+ * existing record is removed, and the record must exist with a positioned cursor.
*
- * There's trickiness in the page-pinned check. By definition a remove
- * operation leaves a cursor positioned if it's initially positioned.
- * However, if every item on the page is deleted and we unpin the page,
- * eviction might delete the page and our search will re-instantiate an
- * empty page for us. Cursor remove returns not-found whether or not
- * that eviction/deletion happens and it's OK unless cursor-overwrite
- * is configured (which means we return success even if there's no item
- * to delete). In that case, we'll fail when we try to point the cursor
- * at the key on the page to satisfy the positioned requirement. It's
- * arguably safe to simply leave the key initialized in the cursor (as
- * that's all a positioned cursor implies), but it's probably safer to
- * avoid page eviction entirely in the positioned case.
+ * There's trickiness in the page-pinned check. By definition a remove operation leaves a cursor
+ * positioned if it's initially positioned. However, if every item on the page is deleted and we
+ * unpin the page, eviction might delete the page and our search will re-instantiate an empty
+ * page for us. Cursor remove returns not-found whether or not that eviction/deletion happens
+ * and it's OK unless cursor-overwrite is configured (which means we return success even if
+ * there's no item to delete). In that case, we'll fail when we try to point the cursor at the
+ * key on the page to satisfy the positioned requirement. It's arguably safe to simply leave the
+ * key initialized in the cursor (as that's all a positioned cursor implies), but it's probably
+ * safer to avoid page eviction entirely in the positioned case.
*
- * Fixed-length column store can never use a positioned cursor to update
- * because the cursor may not be positioned to the correct record in the
- * case of implicit records in the append list.
+ * Fixed-length column store can never use a positioned cursor to update because the cursor may
+ * not be positioned to the correct record in the case of implicit records in the append list.
*/
- if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false)) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -1036,12 +1037,11 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned)
retry:
/*
- * Note these steps must be repeatable, we'll continue to take this path
- * as long as we encounter WT_RESTART.
+ * Note these steps must be repeatable, we'll continue to take this path as long as we encounter
+ * WT_RESTART.
*
- * Any pinned page goes away if we do a search, including as a result of
- * a restart. Get a local copy of any pinned key and re-save the cursor
- * state: we may retry but eventually fail.
+ * Any pinned page goes away if we do a search, including as a result of a restart. Get a local
+ * copy of any pinned key and re-save the cursor state: we may retry but eventually fail.
*/
WT_ERR(__cursor_localkey(cursor));
__cursor_state_save(cursor, &state);
@@ -1085,14 +1085,12 @@ retry:
if (!__cursor_fix_implicit(btree, cbt))
goto search_notfound;
/*
- * Creating a record past the end of the tree in a
- * fixed-length column-store implicitly fills the
- * gap with empty records. Return success in that
- * case, the record was deleted successfully.
+ * Creating a record past the end of the tree in a fixed-length column-store implicitly
+ * fills the gap with empty records. Return success in that case, the record was deleted
+ * successfully.
*
- * Correct the btree cursor's location: the search
- * will have pointed us at the previous/next item,
- * and that's not correct.
+ * Correct the btree cursor's location: the search will have pointed us at the
+ * previous/next item, and that's not correct.
*/
cbt->recno = cursor->recno;
} else
@@ -1107,11 +1105,10 @@ err:
if (ret == 0) {
/*
- * If positioned originally, but we had to do a search, acquire
- * a position so we can return success.
+ * If positioned originally, but we had to do a search, acquire a position so we can return
+ * success.
*
- * If not positioned originally, leave it that way, clear any
- * key and reset the cursor.
+ * If not positioned originally, leave it that way, clear any key and reset the cursor.
*/
if (positioned) {
if (searched)
@@ -1192,16 +1189,14 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
__cursor_state_save(cursor, &state);
/*
- * If update positioned to an on-page key, the update doesn't require
- * another search. We don't care about the "overwrite" configuration
- * because regardless of the overwrite setting, any existing record is
- * updated, and the record must exist with a positioned cursor.
+ * If update positioned to an on-page key, the update doesn't require another search. We don't
+ * care about the "overwrite" configuration because regardless of the overwrite setting, any
+ * existing record is updated, and the record must exist with a positioned cursor.
*
- * Fixed-length column store can never use a positioned cursor to update
- * because the cursor may not be positioned to the correct record in the
- * case of implicit records in the append list.
+ * Fixed-length column store can never use a positioned cursor to update because the cursor may
+ * not be positioned to the correct record in the case of implicit records in the append list.
*/
- if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false)) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -1349,23 +1344,20 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
/*
* Step through the modify operations at the beginning of the chain.
*
- * Deleted or standard updates are anticipated to be sufficient to base
- * the modify (although that's not guaranteed: they may not be visible
- * or might abort before we read them). Also, this is not a hard
- * limit, threads can race modifying updates.
+ * Deleted or standard updates are anticipated to be sufficient to base the modify (although
+ * that's not guaranteed: they may not be visible or might abort before we read them). Also,
+ * this is not a hard limit, threads can race modifying updates.
*
- * If the total size in bytes of the updates exceeds some factor of the
- * underlying value size (which we know because the cursor is
- * positioned), create a new full copy of the value. This limits the
- * cache pressure from creating full copies to that factor: with the
- * default factor of 1, the total size in memory of a set of modify
- * updates is limited to double the size of the modifies.
+ * If the total size in bytes of the updates exceeds some factor of the underlying value size
+ * (which we know because the cursor is positioned), create a new full copy of the value. This
+ * limits the cache pressure from creating full copies to that factor: with the default factor
+ * of 1, the total size in memory of a set of modify updates is limited to double the size of
+ * the modifies.
*
- * Otherwise, limit the length of the update chain to a fixed size to
- * bound the cost of rebuilding the value during reads. When history
- * has to be maintained, creating extra copies of large documents
- * multiplies cache pressure because the old ones cannot be freed, so
- * allow the modify chain to grow.
+ * Otherwise, limit the length of the update chain to a fixed size to bound the cost of
+ * rebuilding the value during reads. When history has to be maintained, creating extra copies
+ * of large documents multiplies cache pressure because the old ones cannot be freed, so allow
+ * the modify chain to grow.
*/
for (i = 0, upd_size = 0; upd != NULL && upd->type == WT_UPDATE_MODIFY; ++i, upd = upd->next) {
upd_size += WT_UPDATE_MEMSIZE(upd);
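
A simplified sketch of that policy (hypothetical names and thresholds, not the WiredTiger function): force a full value copy when the summed modify bytes exceed a factor of the base value's size, or when the chain is long and no history needs to be kept.

    #include <stdbool.h>
    #include <stddef.h>

    #define MEM_FACTOR 1     /* Assumed default size factor. */
    #define MAX_CHAIN_LEN 10 /* Assumed fixed chain-length cap. */

    struct upd { size_t memsize; bool is_modify; struct upd *next; };

    static bool
    chain_exceeded(const struct upd *upd, size_t value_size, bool keep_history)
    {
        size_t upd_size;
        int i;

        for (i = 0, upd_size = 0; upd != NULL && upd->is_modify; ++i, upd = upd->next) {
            upd_size += upd->memsize;
            if (upd_size * MEM_FACTOR >= value_size)
                return (true); /* Cache-pressure bound: make a full copy. */
            if (!keep_history && i >= MAX_CHAIN_LEN)
                return (true); /* Read-cost bound: cap the chain length. */
        }
        return (false);
    }
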
@@ -1400,26 +1392,22 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
__cursor_state_save(cursor, &state);
/*
- * Get the current value and apply the modification to it, for a few
- * reasons: first, we set the updated value so the application can
- * retrieve the cursor's value; second, we use the updated value as
- * the update if the update chain is too long; third, there's a check
- * if the updated value is too large to store; fourth, to simplify the
- * count of bytes being added/removed; fifth, we can get into serious
- * trouble if we attempt to modify a value that doesn't exist or read
- * a value that might not exist in the future. For the fifth reason,
- * fail if in anything other than a snapshot transaction, read-committed
- * and read-uncommitted imply values that might disappear out from under
- * us or an inability to repeat point-in-time reads.
+ * Get the current value and apply the modification to it, for a few reasons: first, we set the
+ * updated value so the application can retrieve the cursor's value; second, we use the updated
+ * value as the update if the update chain is too long; third, there's a check if the updated
+ * value is too large to store; fourth, to simplify the count of bytes being added/removed;
+ * fifth, we can get into serious trouble if we attempt to modify a value that doesn't exist or
+ * read a value that might not exist in the future. For the fifth reason, fail if in anything
+ * other than a snapshot transaction, read-committed and read-uncommitted imply values that
+ * might disappear out from under us or an inability to repeat point-in-time reads.
*
- * Also, an application might read a value outside of a transaction and
- * then call modify. For that to work, the read must be part of the
- * transaction that performs the update for correctness, otherwise we
- * could race with another thread and end up modifying the wrong value.
- * A clever application could get this right (imagine threads that only
- * updated non-overlapping, fixed-length byte strings), but it's unsafe
- * because it will work most of the time and the failure is unlikely to
- * be detected. Require explicit transactions for modify operations.
+ * Also, an application might read a value outside of a transaction and then call modify. For
+ * that to work, the read must be part of the transaction that performs the update for
+ * correctness, otherwise we could race with another thread and end up modifying the wrong
+ * value. A clever application could get this right (imagine threads that only updated
+ * non-overlapping, fixed-length byte strings), but it's unsafe because it will work most of the
+ * time and the failure is unlikely to be detected. Require explicit transactions for modify
+ * operations.
*/
if (session->txn.isolation != WT_ISO_SNAPSHOT)
WT_ERR_MSG(session, ENOTSUP,
@@ -1444,9 +1432,8 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
/*
* WT_CURSOR.modify is update-without-overwrite.
*
- * Use the modify buffer as the update if the data package saves us some
- * memory and the update chain is under the limit, else use the complete
- * value.
+ * Use the modify buffer as the update if the data package saves us some memory and the update
+ * chain is under the limit, else use the complete value.
*/
overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
F_CLR(cursor, WT_CURSTD_OVERWRITE);
@@ -1645,23 +1632,19 @@ __cursor_truncate(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BT
yield_count = sleep_usecs = 0;
/*
- * First, call the cursor search method to re-position the cursor: we
- * may not have a cursor position (if the higher-level truncate code
- * switched the cursors to have an "external" cursor key, and because
- * we don't save a copy of the page's write generation information,
- * which we need to remove records).
+ * First, call the cursor search method to re-position the cursor: we may not have a cursor position
+ * (if the higher-level truncate code switched the cursors to have an "external" cursor key, and
+ * because we don't save a copy of the page's write generation information, which we need to remove
+ * records).
*
- * Once that's done, we can delete records without a full search, unless
- * we encounter a restart error because the page was modified by some
- * other thread of control; in that case, repeat the full search to
- * refresh the page's modification information.
+ * Once that's done, we can delete records without a full search, unless we encounter a restart
+ * error because the page was modified by some other thread of control; in that case, repeat the
+ * full search to refresh the page's modification information.
*
- * If this is a row-store, we delete leaf pages having no overflow items
- * without reading them; for that to work, we have to ensure we read the
- * page referenced by the ending cursor, since we may be deleting only a
- * partial page at the end of the truncation. Our caller already fully
- * instantiated the end cursor, so we know that page is pinned in memory
- * and we can proceed without concern.
+ * If this is a row-store, we delete leaf pages having no overflow items without reading them; for
+ * that to work, we have to ensure we read the page referenced by the ending cursor, since we may be
+ * deleting only a partial page at the end of the truncation. Our caller already fully instantiated
+ * the end cursor, so we know that page is pinned in memory and we can proceed without concern.
*/
retry:
WT_ERR(__wt_btcur_search(start));
@@ -1703,23 +1686,20 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSO
yield_count = sleep_usecs = 0;
/*
- * Handle fixed-length column-store objects separately: for row-store
- * and variable-length column-store objects we have "deleted" values
- * and so returned objects actually exist: fixed-length column-store
- * objects are filled-in if they don't exist, that is, if you create
- * record 37, records 1-36 magically appear. Those records can't be
- * deleted, which means we have to ignore already "deleted" records.
+ * Handle fixed-length column-store objects separately: for row-store and variable-length
+ * column-store objects we have "deleted" values and so returned objects actually exist:
+ * fixed-length column-store objects are filled-in if they don't exist, that is, if you create
+ * record 37, records 1-36 magically appear. Those records can't be deleted, which means we have to
+ * ignore already "deleted" records.
*
- * First, call the cursor search method to re-position the cursor: we
- * may not have a cursor position (if the higher-level truncate code
- * switched the cursors to have an "external" cursor key, and because
- * we don't save a copy of the page's write generation information,
- * which we need to remove records).
+ * First, call the cursor search method to re-position the cursor: we may not have a cursor position
+ * (if the higher-level truncate code switched the cursors to have an "external" cursor key, and
+ * because we don't save a copy of the page's write generation information, which we need to remove
+ * records).
*
- * Once that's done, we can delete records without a full search, unless
- * we encounter a restart error because the page was modified by some
- * other thread of control; in that case, repeat the full search to
- * refresh the page's modification information.
+ * Once that's done, we can delete records without a full search, unless we encounter a restart
+ * error because the page was modified by some other thread of control; in that case, repeat the
+ * full search to refresh the page's modification information.
*/
retry:
WT_ERR(__wt_btcur_search(start));
@@ -1764,13 +1744,12 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
WT_STAT_DATA_INCR(session, cursor_truncate);
/*
- * For recovery, log the start and stop keys for a truncate operation,
- * not the individual records removed. On the other hand, for rollback
- * we need to keep track of all the in-memory operations.
+ * For recovery, log the start and stop keys for a truncate operation, not the individual
+ * records removed. On the other hand, for rollback we need to keep track of all the in-memory
+ * operations.
*
- * We deal with this here by logging the truncate range first, then (in
- * the logging code) disabling writing of the in-memory remove records
- * to disk.
+ * We deal with this here by logging the truncate range first, then (in the logging code)
+ * disabling writing of the in-memory remove records to disk.
*/
if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
WT_RET(__wt_txn_truncate_log(session, start, stop));
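
The comment above describes the ordering: log the truncate range once for recovery, then suppress logging of the individual in-memory removes kept for rollback. A rough sketch under assumed helper names (log_truncate_range and remove_records_in_memory are invented for illustration):

    #include <stdbool.h>

    /* Assumed helpers, invented for this sketch. */
    extern int log_truncate_range(const char *start_key, const char *stop_key);
    extern int remove_records_in_memory(const char *start_key, const char *stop_key);

    static bool log_individual_removes = true;

    static int
    truncate_and_log(const char *start_key, const char *stop_key)
    {
        int ret;

        /* For recovery: log the start/stop keys once. */
        if ((ret = log_truncate_range(start_key, stop_key)) != 0)
            return (ret);

        /*
         * For rollback: apply the removes in memory, but suppress
         * writing each individual remove record to the log.
         */
        log_individual_removes = false;
        ret = remove_records_in_memory(start_key, stop_key);
        log_individual_removes = true;
        return (ret);
    }
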
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 7ed85112b42..f971de0e4f0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -382,11 +382,9 @@ __wt_debug_offset(
WT_ASSERT(session, S2BT_SAFE(session) != NULL);
/*
- * This routine depends on the default block manager's view of files,
- * where an address consists of a file offset, length, and checksum.
- * This is for debugging only: other block managers might not see a
- * file or address the same way, that's why there's no block manager
- * method.
+ * This routine depends on the default block manager's view of files, where an address consists
+ * of a file offset, length, and checksum. This is for debugging only: other block managers
+ * might not see a file or address the same way, that's why there's no block manager method.
*
* Convert the triplet into an address structure.
*/
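
For orientation, the "triplet" the comment refers to can be pictured as a simple struct; the layout below is illustrative, not WiredTiger's on-disk format:

    #include <stdint.h>

    /* An address in the default block manager's view: a triplet. */
    struct block_addr {
        uint64_t offset;   /* file offset of the block */
        uint32_t size;     /* block length in bytes */
        uint32_t checksum; /* checksum over the block contents */
    };
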
@@ -1181,7 +1179,7 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
WT_RET(ds->f(ds, ", start_ts %s", __wt_timestamp_to_string(upd->start_ts, ts_string)));
if (upd->durable_ts != WT_TS_NONE)
WT_RET(
- ds->f(ds, ", durable-ts %s", __wt_timestamp_to_string(upd->durable_ts, ts_string)));
+ ds->f(ds, ", durable_ts %s", __wt_timestamp_to_string(upd->durable_ts, ts_string)));
prepare_state = NULL;
switch (upd->prepare_state) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 9749cef3706..f7b63524d42 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -11,51 +11,42 @@
/*
* Fast-delete support.
*
- * This file contains most of the code that allows WiredTiger to delete pages
- * of data without reading them into the cache. (This feature is currently
- * only available for row-store objects.)
+ * This file contains most of the code that allows WiredTiger to delete pages of data without
+ * reading them into the cache. (This feature is currently only available for row-store objects.)
*
- * The way cursor truncate works in a row-store object is it explicitly reads
- * the first and last pages of the truncate range, then walks the tree with a
- * flag so the tree walk code skips reading eligible pages within the range
- * and instead just marks them as deleted, by changing their WT_REF state to
- * WT_REF_DELETED. Pages ineligible for this fast path include pages already
- * in the cache, having overflow items, or requiring lookaside records.
- * Ineligible pages are read and have their rows updated/deleted individually.
- * The transaction for the delete operation is stored in memory referenced by
- * the WT_REF.page_del field.
+ * The way cursor truncate works in a row-store object is that it explicitly reads the first and last
+ * pages of the truncate range, then walks the tree with a flag so the tree walk code skips reading
+ * eligible pages within the range and instead just marks them as deleted, by changing their WT_REF
+ * state to WT_REF_DELETED. Pages ineligible for this fast path include pages already in the
+ * cache, pages with overflow items and pages requiring lookaside records. Ineligible pages are
+ * read and have their rows updated/deleted individually. The transaction for the delete operation
+ * is stored in memory referenced by the WT_REF.page_del field.
*
- * Future cursor walks of the tree will skip the deleted page based on the
- * transaction stored for the delete, but it gets more complicated if a read is
- * done using a random key, or a cursor walk is done with a transaction where
- * the delete is not visible. In those cases, we read the original contents of
- * the page. The page-read code notices a deleted page is being read, and as
- * part of the read instantiates the contents of the page, creating a WT_UPDATE
- * with a deleted operation, in the same transaction as deleted the page. In
- * other words, the read process makes it appear as if the page was read and
- * each individual row deleted, exactly as would have happened if the page had
+ * Future cursor walks of the tree will skip the deleted page based on the transaction stored for
+ * the delete, but it gets more complicated if a read is done using a random key, or a cursor walk
+ * is done with a transaction where the delete is not visible. In those cases, we read the original
+ * contents of the page. The page-read code notices a deleted page is being read, and as part of the
+ * read instantiates the contents of the page, creating a WT_UPDATE with a deleted operation, in the
+ * same transaction that deleted the page. In other words, the read process makes it appear as if the
+ * page was read and each individual row deleted, exactly as would have happened if the page had
* been in the cache all along.
*
- * There's an additional complication to support rollback of the page delete.
- * When the page was marked deleted, a pointer to the WT_REF was saved in the
- * deleting session's transaction list and the delete is unrolled by resetting
- * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been
- * instantiated by some reading thread, that's not enough, each individual row
- * on the page must have the delete operation reset. If the page split, the
- * WT_UPDATE lists might have been saved/restored during reconciliation and
- * appear on multiple pages, and the WT_REF stored in the deleting session's
- * transaction list is no longer useful. For this reason, when the page is
- * instantiated by a read, a list of the WT_UPDATE structures on the page is
- * stored in the WT_REF.page_del field, with the transaction ID, that way the
- * session committing/unrolling the delete can find all WT_UPDATE structures
- * that require update.
+ * There's an additional complication to support rollback of the page delete. When the page was
+ * marked deleted, a pointer to the WT_REF was saved in the deleting session's transaction list and
+ * the delete is unrolled by resetting the WT_REF_DELETED state back to WT_REF_DISK. However, if the
+ * page has been instantiated by some reading thread, that's not enough: each individual row on the
+ * page must have the delete operation reset. If the page split, the WT_UPDATE lists might have been
+ * saved/restored during reconciliation and appear on multiple pages, and the WT_REF stored in the
+ * deleting session's transaction list is no longer useful. For this reason, when the page is
+ * instantiated by a read, a list of the WT_UPDATE structures on the page is stored in the
+ * WT_REF.page_del field, with the transaction ID, that way the session committing/unrolling the
+ * delete can find all WT_UPDATE structures that require update.
*
- * One final note: pages can also be marked deleted if emptied and evicted. In
- * that case, the WT_REF state will be set to WT_REF_DELETED but there will not
- * be any associated WT_REF.page_del field. These pages are always skipped
- * during cursor traversal (the page could not have been evicted if there were
- * updates that weren't globally visible), and if read is forced to instantiate
- * such a page, it simply creates an empty page from scratch.
+ * One final note: pages can also be marked deleted if emptied and evicted. In that case, the WT_REF
+ * state will be set to WT_REF_DELETED but there will not be any associated WT_REF.page_del field.
+ * These pages are always skipped during cursor traversal (the page could not have been evicted if
+ * there were updates that weren't globally visible), and if read is forced to instantiate such a
+ * page, it simply creates an empty page from scratch.
*/
/*
@@ -102,12 +93,10 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
return (0);
/*
- * If this WT_REF was previously part of a truncate operation, there
- * may be existing page-delete information. The structure is only read
- * while the state is locked, free the previous version.
+ * If this WT_REF was previously part of a truncate operation, there may be existing page-delete
+ * information. The structure is only read while the state is locked, so free the previous version.
*
- * Note: changes have been made, we must publish any state change from
- * this point on.
+ * Note: changes have been made, we must publish any state change from this point on.
*/
if (ref->page_del != NULL) {
WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED);
@@ -116,18 +105,15 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
}
/*
- * We cannot truncate pages that have overflow key/value items as the
- * overflow blocks have to be discarded. The way we figure that out is
- * to check the page's cell type, cells for leaf pages without overflow
- * items are special.
+ * We cannot truncate pages that have overflow key/value items as the overflow blocks have to be
+ * discarded. The way we figure that out is to check the page's cell type: cells for leaf pages
+ * without overflow items are special.
*
- * To look at an on-page cell, we need to look at the parent page, and
- * that's dangerous, our parent page could change without warning if
- * the parent page were to split, deepening the tree. We can look at
- * the parent page itself because the page can't change underneath us.
- * However, if the parent page splits, our reference address can change;
- * we don't care what version of it we read, as long as we don't read
- * it twice.
+ * To look at an on-page cell, we need to look at the parent page, and that's dangerous, our
+ * parent page could change without warning if the parent page were to split, deepening the
+ * tree. We can look at the parent page itself because the page can't change underneath us.
+ * However, if the parent page splits, our reference address can change; we don't care what
+ * version of it we read, as long as we don't read it twice.
*/
WT_ORDERED_READ(ref_addr, ref->addr);
if (ref_addr != NULL && (__wt_off_page(ref->home, ref_addr) ?
@@ -219,15 +205,12 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
- * We can't use the normal read path to get a copy of the page
- * because the session may have closed the cursor, we no longer
- * have the reference to the tree required for a hazard
- * pointer. We're safe because with unresolved transactions,
- * the page isn't going anywhere.
+ * We can't use the normal read path to get a copy of the page because the session may have
+ * closed the cursor; we no longer have the reference to the tree required for a hazard pointer.
+ * We're safe because with unresolved transactions, the page isn't going anywhere.
*
- * The page is in an in-memory state, which means it
- * was instantiated at some point. Walk any list of
- * update structures and abort them.
+ * The page is in an in-memory state, which means it was instantiated at some point. Walk any
+ * list of update structures and abort them.
*/
WT_ASSERT(session, locked);
if ((updp = ref->page_del->update_list) != NULL)
@@ -255,22 +238,19 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
bool skip;
/*
- * Deleted pages come from two sources: either it's a truncate as
- * described above, or the page has been emptied by other operations
- * and eviction deleted it.
+ * Deleted pages come from two sources: either it's a truncate as described above, or the page
+ * has been emptied by other operations and eviction deleted it.
*
- * In both cases, the WT_REF state will be WT_REF_DELETED. In the case
- * of a truncated page, there will be a WT_PAGE_DELETED structure with
- * the transaction ID of the transaction that deleted the page, and the
- * page is visible if that transaction ID is visible. In the case of an
- * empty page, there will be no WT_PAGE_DELETED structure and the delete
- * is by definition visible, eviction could not have deleted the page if
- * there were changes on it that were not globally visible.
+ * In both cases, the WT_REF state will be WT_REF_DELETED. In the case of a truncated page,
+ * there will be a WT_PAGE_DELETED structure with the transaction ID of the transaction that
+ * deleted the page, and the page is visible if that transaction ID is visible. In the case of
+ * an empty page, there will be no WT_PAGE_DELETED structure and the delete is by definition
+ * visible, eviction could not have deleted the page if there were changes on it that were not
+ * globally visible.
*
- * We're here because we found a WT_REF state set to WT_REF_DELETED. It
- * is possible the page is being read into memory right now, though, and
- * the page could switch to an in-memory state at any time. Lock down
- * the structure, just to be safe.
+ * We're here because we found a WT_REF state set to WT_REF_DELETED. It is possible the page is
+ * being read into memory right now, though, and the page could switch to an in-memory state at
+ * any time. Lock down the structure, just to be safe.
*/
if (ref->page_del == NULL && ref->page_las == NULL)
return (true);
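
The visibility rule in the comment above reduces to a short predicate; txn_visible below is an assumed stand-in for the caller's snapshot check:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct page_deleted { uint64_t txnid; /* deleting transaction */ };

    /* Assumed stand-in for the caller's snapshot-visibility check. */
    extern bool txn_visible(uint64_t txnid);

    static bool
    deleted_page_visible(const struct page_deleted *pd)
    {
        /*
         * No page-deleted structure: the page was emptied and evicted,
         * so the delete is globally visible by definition.
         */
        if (pd == NULL)
            return (true);
        /* Truncated page: visible iff the deleting transaction is. */
        return (txn_visible(pd->txnid));
    }
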
@@ -362,26 +342,22 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
- * An operation is accessing a "deleted" page, and we're building an
- * in-memory version of the page (making it look like all entries in
- * the page were individually updated by a remove operation). There
- * are two cases where we end up here:
+ * An operation is accessing a "deleted" page, and we're building an in-memory version of the
+ * page (making it look like all entries in the page were individually updated by a remove
+ * operation). There are two cases where we end up here:
*
- * First, a running transaction used a truncate call to delete the page
- * without reading it, in which case the page reference includes a
- * structure with a transaction ID; the page we're building might split
- * in the future, so we update that structure to include references to
- * all of the update structures we create, so the transaction can abort.
+ * First, a running transaction used a truncate call to delete the page without reading it, in
+ * which case the page reference includes a structure with a transaction ID; the page we're
+ * building might split in the future, so we update that structure to include references to all
+ * of the update structures we create, so the transaction can abort.
*
- * Second, a truncate call deleted a page and the truncate committed,
- * but an older transaction in the system forced us to keep the old
- * version of the page around, then we crashed and recovered or we're
- * running inside a checkpoint, and now we're being forced to read that
- * page.
+ * Second, a truncate call deleted a page and the truncate committed, but an older transaction
+ * in the system forced us to keep the old version of the page around, then we crashed and
+ * recovered or we're running inside a checkpoint, and now we're being forced to read that page.
*
- * Expect a page-deleted structure if there's a running transaction that
- * needs to be resolved, otherwise, there may not be one (and, if the
- * transaction has resolved, we can ignore the page-deleted structure).
+ * Expect a page-deleted structure if there's a running transaction that needs to be resolved,
+ * otherwise, there may not be one (and, if the transaction has resolved, we can ignore the
+ * page-deleted structure).
*/
page_del = __wt_page_del_active(session, ref, true) ? ref->page_del : NULL;
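
Instantiation as described above amounts to giving every row a tombstone update carrying the deleting transaction's ID, and keeping the list so commit/abort can resolve it later. A hedged sketch with simplified types:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct update { uint64_t txnid; bool tombstone; };

    /*
     * One tombstone per row, all carrying the deleting transaction's
     * ID; the caller keeps the array so commit/abort can find them.
     */
    static void
    instantiate_deleted_rows(struct update *upd, size_t n, uint64_t txnid)
    {
        size_t i;

        for (i = 0; i < n; ++i) {
            upd[i].txnid = txnid;
            upd[i].tombstone = true;
        }
    }
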
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index c3b8a52d150..9dd84879ddf 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -24,8 +24,7 @@ void
__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
{
/*
- * A version of the page-out function that allows us to make additional
- * diagnostic checks.
+ * A version of the page-out function that allows us to make additional diagnostic checks.
*
* The WT_REF cannot be the eviction thread's location.
*/
@@ -336,9 +335,8 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* Free the in-memory index array.
*
- * For each entry, see if the key was an allocation (that is, if it
- * points somewhere other than the original page), and if so, free
- * the memory.
+ * For each entry, see if the key was an allocation (that is, if it points somewhere other than
+ * the original page), and if so, free the memory.
*/
WT_ROW_FOREACH (page, rip, i) {
copy = WT_ROW_KEY_COPY(rip);
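
The "points somewhere other than the original page" test is plain pointer arithmetic; a sketch in the spirit of the check (not WiredTiger's macro):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* True if p points outside the page image, i.e. it was allocated. */
    static bool
    off_page(const void *image, size_t image_size, const void *p)
    {
        const uint8_t *base = image, *ptr = p;

        return (ptr < base || ptr >= base + image_size);
    }
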
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index d8994e7bfab..595eb55fc5c 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -26,12 +26,10 @@ __wt_btree_page_version_config(WT_SESSION_IMPL *session)
conn = S2C(session);
/*
- * Write timestamp format pages if at the right version or if configured
- * at build-time.
+ * Write timestamp format pages if at the right version or if configured at build-time.
*
- * WiredTiger version where timestamp page format is written. This is a
- * future release, and the values may require update when the release is
- * named.
+ * WiredTiger version where timestamp page format is written. This is a future release, and the
+ * values may require update when the release is named.
*/
#define WT_VERSION_TS_MAJOR 3
#define WT_VERSION_TS_MINOR 3
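
Assuming the gate means "at or above the named version, or forced at build time" (the semantics here are an assumption), the check might look like this sketch; the macros are local mirrors of the defines above:

    #include <stdbool.h>
    #include <stdint.h>

    #define VERSION_TS_MAJOR 3 /* local mirror of WT_VERSION_TS_MAJOR */
    #define VERSION_TS_MINOR 3 /* local mirror of WT_VERSION_TS_MINOR */

    static bool
    write_timestamp_format(uint16_t major, uint16_t minor, bool build_time_config)
    {
        if (build_time_config)
            return (true);
        return (major > VERSION_TS_MAJOR ||
            (major == VERSION_TS_MAJOR && minor >= VERSION_TS_MINOR));
    }
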
@@ -201,17 +199,15 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
}
/*
- * Eviction ignores trees until the handle's open flag is set, configure
- * eviction before that happens.
+ * Eviction ignores trees until the handle's open flag is set, configure eviction before that
+ * happens.
*
- * Files that can still be bulk-loaded cannot be evicted.
- * Permanently cache-resident files can never be evicted.
- * Special operations don't enable eviction. The underlying commands may
- * turn on eviction (for example, verify turns on eviction while working
- * a file to keep from consuming the cache), but it's their decision. If
- * an underlying command reconfigures eviction, it must either clear the
- * evict-disabled-open flag or restore the eviction configuration when
- * finished so that handle close behaves correctly.
+ * Files that can still be bulk-loaded cannot be evicted. Permanently cache-resident files can
+ * never be evicted. Special operations don't enable eviction. The underlying commands may turn
+ * on eviction (for example, verify turns on eviction while working a file to keep from
+ * consuming the cache), but it's their decision. If an underlying command reconfigures
+ * eviction, it must either clear the evict-disabled-open flag or restore the eviction
+ * configuration when finished so that handle close behaves correctly.
*/
if (btree->original || F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE |
WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
@@ -243,12 +239,10 @@ __wt_btree_close(WT_SESSION_IMPL *session)
btree = S2BT(session);
/*
- * The close process isn't the same as discarding the handle: we might
- * re-open the handle, which isn't a big deal, but the backing blocks
- * for the handle may not yet have been discarded from the cache, and
- * eviction uses WT_BTREE structure elements. Free backing resources
- * but leave the rest alone, and we'll discard the structure when we
- * discard the data handle.
+ * The close process isn't the same as discarding the handle: we might re-open the handle, which
+ * isn't a big deal, but the backing blocks for the handle may not yet have been discarded from
+ * the cache, and eviction uses WT_BTREE structure elements. Free backing resources but leave
+ * the rest alone, and we'll discard the structure when we discard the data handle.
*
* Handles can be closed multiple times, ignore all but the first.
*/
@@ -532,14 +526,12 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
if (btree->compressor != NULL && btree->compressor->compress != NULL &&
btree->type != BTREE_COL_FIX) {
/*
- * Don't do compression adjustment when on-disk page sizes are
- * less than 16KB. There's not enough compression going on to
- * fine-tune the size, all we end up doing is hammering shared
- * memory.
+ * Don't do compression adjustment when on-disk page sizes are less than 16KB. There's not
+ * enough compression going on to fine-tune the size, all we end up doing is hammering
+ * shared memory.
*
- * Don't do compression adjustment when on-disk page sizes are
- * equal to the maximum in-memory page image, the bytes taken
- * for compression can't grow past the base value.
+ * Don't do compression adjustment when on-disk page sizes are equal to the maximum
+ * in-memory page image, the bytes taken for compression can't grow past the base value.
*/
if (btree->maxintlpage >= 16 * 1024 && btree->maxmempage_image > btree->maxintlpage) {
btree->intlpage_compadjust = true;
@@ -611,9 +603,8 @@ __wt_btree_tree_open(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_
WT_CLEAR(dsk);
/*
- * Read and verify the page (verify to catch encrypted objects we can't
- * decrypt, where we read the object successfully but we can't decrypt
- * it, and we want to fail gracefully).
+ * Read and verify the page (verify to catch encrypted objects we can't decrypt, where we read
+ * the object successfully but we can't decrypt it, and we want to fail gracefully).
*
* Create a printable version of the address to pass to verify.
*/
@@ -939,8 +930,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
/*
* Get the maximum internal/leaf page key/value sizes.
*
- * In-memory configuration overrides any key/value sizes, there's no
- * such thing as an overflow item in an in-memory configuration.
+ * In-memory configuration overrides any key/value sizes, there's no such thing as an overflow
+ * item in an in-memory configuration.
*/
if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE;
@@ -971,13 +962,12 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
}
/*
- * Default/maximum for internal and leaf page keys: split-page / 10.
- * Default for leaf page values: split-page / 2.
+ * Default/maximum for internal and leaf page keys: split-page / 10. Default for leaf page
+ * values: split-page / 2.
*
- * It's difficult for applications to configure this in any exact way as
- * they have to duplicate our calculation of how many keys must fit on a
- * page, and given a split-percentage and page header, that isn't easy
- * to do. If the maximum internal key value is too large for the page,
+ * It's difficult for applications to configure this in any exact way as they have to duplicate
+ * our calculation of how many keys must fit on a page, and given a split-percentage and page
+ * header, that isn't easy to do. If the maximum internal key value is too large for the page,
* reset it to the default.
*/
if (btree->maxintlkey == 0 || btree->maxintlkey > intl_split_size / 10)
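
The defaults reduce to two divisions; a sketch for orientation (the function name is illustrative):

    #include <stdint.h>

    /* Keys default to split-page / 10, leaf values to split-page / 2. */
    static void
    default_item_limits(uint32_t split_size, uint32_t *max_keyp, uint32_t *max_leaf_valuep)
    {
        *max_keyp = split_size / 10;
        *max_leaf_valuep = split_size / 2;
    }
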
diff --git a/src/third_party/wiredtiger/src/btree/bt_import.c b/src/third_party/wiredtiger/src/btree/bt_import.c
index 7a1e1cd936c..02f023567f5 100644
--- a/src/third_party/wiredtiger/src/btree/bt_import.c
+++ b/src/third_party/wiredtiger/src/btree/bt_import.c
@@ -83,17 +83,16 @@ __wt_import(WT_SESSION_IMPL *session, const char *uri)
}
/*
- * OK, we've now got three chunks of data: the file's metadata from when
- * the last checkpoint started, the array of checkpoints as of when the
- * last checkpoint was almost complete (everything written but the avail
- * list), and fixed-up checkpoint information from the last checkpoint.
+ * OK, we've now got three chunks of data: the file's metadata from when the last checkpoint
+ * started, the array of checkpoints as of when the last checkpoint was almost complete
+ * (everything written but the avail list), and fixed-up checkpoint information from the last
+ * checkpoint.
*
- * Build and flatten the metadata and the checkpoint list, then insert
- * it into the metadata for this file.
+ * Build and flatten the metadata and the checkpoint list, then insert it into the metadata for
+ * this file.
*
- * Strip out the checkpoint-LSN, an imported file isn't associated
- * with any log files.
- * Assign a unique file ID.
+ * Strip out the checkpoint-LSN, an imported file isn't associated with any log files. Assign a
+ * unique file ID.
*/
filecfg[1] = a->data;
filecfg[2] = checkpoint_list;
@@ -107,30 +106,25 @@ __wt_import(WT_SESSION_IMPL *session, const char *uri)
__wt_verbose(session, WT_VERB_CHECKPOINT, "import configuration: %s/%s", uri, fileconf);
/*
- * The just inserted metadata was correct as of immediately before the
- * before the final checkpoint, but it's not quite right. The block
- * manager returned the corrected final checkpoint, put it all together.
+ * The just inserted metadata was correct as of immediately before the final
+ * checkpoint, but it's not quite right. The block manager returned the corrected final
+ * checkpoint, put it all together.
*
- * Get the checkpoint information from the file's metadata as an array
- * of WT_CKPT structures.
+ * Get the checkpoint information from the file's metadata as an array of WT_CKPT structures.
*
- * XXX
- * There's a problem here. If a file is imported from our future (leaf
- * pages with unstable entries that have write-generations ahead of the
- * current database's base write generation), we'll read the values and
- * treat them as stable. A restart will fix this: when we added the
- * imported file to our metadata, the write generation in the imported
- * file's checkpoints updated our database's maximum write generation,
- * and so a restart will have a maximum generation newer than the
- * imported file's write generation. An alternative solution is to add
- * a "base write generation" value to the imported file's metadata, and
- * use that value instead of the connection's base write generation when
- * deciding what page items should be read. Since all future writes to
- * the imported file would be ahead of that write generation, it would
- * have the effect we want.
+ * XXX There's a problem here. If a file is imported from our future (leaf pages with unstable
+ * entries that have write-generations ahead of the current database's base write generation),
+ * we'll read the values and treat them as stable. A restart will fix this: when we added the
+ * imported file to our metadata, the write generation in the imported file's checkpoints
+ * updated our database's maximum write generation, and so a restart will have a maximum
+ * generation newer than the imported file's write generation. An alternative solution is to add
+ * a "base write generation" value to the imported file's metadata, and use that value instead
+ * of the connection's base write generation when deciding what page items should be read. Since
+ * all future writes to the imported file would be ahead of that write generation, it would have
+ * the effect we want.
*
- * Update the last checkpoint with the corrected information.
- * Update the file's metadata with the new checkpoint information.
+ * Update the last checkpoint with the corrected information. Update the file's metadata with
+ * the new checkpoint information.
*/
WT_ERR(__wt_meta_ckptlist_get(session, uri, false, &ckptbase));
WT_CKPT_FOREACH (ckptbase, ckpt)
diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c
index 25373fa592a..44b672251cb 100644
--- a/src/third_party/wiredtiger/src/btree/bt_io.c
+++ b/src/third_party/wiredtiger/src/btree/bt_io.c
@@ -309,19 +309,17 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *add
F_SET(dsk, WT_PAGE_ENCRYPTED);
/*
- * We increment the block's write generation so it's easy to identify
- * newer versions of blocks during salvage. (It's common in WiredTiger,
- * at least for the default block manager, for multiple blocks to be
- * internally consistent with identical first and last keys, so we need
- * a way to know the most recent state of the block. We could check
- * which leaf is referenced by a valid internal page, but that implies
- * salvaging internal pages, which I don't want to do, and it's not
- * as good anyway, because the internal page may not have been written
- * after the leaf page was updated. So, write generations it is.
+ * We increment the block's write generation so it's easy to identify newer versions of blocks
+ * during salvage. (It's common in WiredTiger, at least for the default block manager, for
+ * multiple blocks to be internally consistent with identical first and last keys, so we need a
+ * way to know the most recent state of the block. We could check which leaf is referenced by a
+ * valid internal page, but that implies salvaging internal pages, which I don't want to do, and
+ * it's not as good anyway, because the internal page may not have been written after the leaf
+ * page was updated. So, write generations it is.
*
- * Nothing is locked at this point but two versions of a page with the
- * same generation is pretty unlikely, and if we did, they're going to
- * be roughly identical for the purposes of salvage, anyway.
+ * Nothing is locked at this point, but two versions of a page with the same generation are pretty
+ * unlikely, and if that happens, they're going to be roughly identical for the purposes of salvage,
+ * anyway.
*/
dsk->write_gen = ++btree->write_gen;
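
Salvage's use of write generations, reduced to its essence (types illustrative):

    #include <stdint.h>

    struct blk { uint64_t write_gen; /* bumped on every write */ };

    /* Salvage keeps whichever copy of a block has the newer generation. */
    static const struct blk *
    salvage_pick(const struct blk *a, const struct blk *b)
    {
        return (a->write_gen >= b->write_gen ? a : b);
    }
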
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index 4ad373c2ba5..8ea91b31fd2 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -21,11 +21,11 @@ __ovfl_read(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_
btree = S2BT(session);
/*
- * Read the overflow item from the block manager, then reference the
- * start of the data and set the data's length.
+ * Read the overflow item from the block manager, then reference the start of the data and set
+ * the data's length.
*
- * Overflow reads are synchronous. That may bite me at some point, but
- * WiredTiger supports large page sizes, overflow items should be rare.
+ * Overflow reads are synchronous. That may bite me at some point, but WiredTiger supports large
+ * page sizes, overflow items should be rare.
*/
WT_RET(__wt_bt_read(session, store, addr, addr_size));
dsk = store->data;
@@ -60,13 +60,11 @@ __wt_ovfl_read(
return (__ovfl_read(session, unpack->data, unpack->size, store));
/*
- * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow
- * value, but there was still a reader in the system that might need it,
- * the on-page cell type will have been reset to WT_CELL_VALUE_OVFL_RM
- * and we will be passed a page so we can check the on-page cell.
+ * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow value, but there was still
+ * a reader in the system that might need it, the on-page cell type will have been reset to
+ * WT_CELL_VALUE_OVFL_RM and we will be passed a page so we can check the on-page cell.
*
- * Acquire the overflow lock, and retest the on-page cell's value inside
- * the lock.
+ * Acquire the overflow lock, and retest the on-page cell's value inside the lock.
*/
__wt_readlock(session, &S2BT(session)->ovfl_lock);
if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) {
@@ -188,12 +186,11 @@ __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack
WT_RET(__ovfl_cache(session, page, unpack));
/*
- * The second problem is to only remove the underlying blocks once,
- * solved by the WT_CELL_VALUE_OVFL_RM flag.
+ * The second problem is to only remove the underlying blocks once, solved by the
+ * WT_CELL_VALUE_OVFL_RM flag.
*
- * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
- * underlying overflow value's blocks to be freed when reconciliation
- * completes.
+ * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the underlying overflow value's
+ * blocks to be freed when reconciliation completes.
*/
return (__wt_ovfl_discard_add(session, page, unpack->cell));
}
@@ -216,15 +213,13 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
__wt_cell_unpack(session, page, cell, unpack);
/*
- * Finally remove overflow key/value objects, called when reconciliation
- * finishes after successfully writing a page.
+ * Finally remove overflow key/value objects, called when reconciliation finishes after
+ * successfully writing a page.
*
- * Keys must have already been instantiated and value objects must have
- * already been cached (if they might potentially still be read by any
- * running transaction).
+ * Keys must have already been instantiated and value objects must have already been cached (if
+ * they might potentially still be read by any running transaction).
*
- * Acquire the overflow lock to avoid racing with a thread reading the
- * backing overflow blocks.
+ * Acquire the overflow lock to avoid racing with a thread reading the backing overflow blocks.
*/
__wt_writelock(session, &btree->ovfl_lock);
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 407fbca7839..0db3e5216d2 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -148,14 +148,12 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
/*
- * Column-store leaf page entries map one-to-one to the number
- * of physical entries on the page (each physical entry is a
- * value item). Note this value isn't necessarily correct, we
+ * Column-store leaf page entries map one-to-one to the number of physical entries on the
+ * page (each physical entry is a value item). Note this value isn't necessarily correct: we
* may skip values when reading the disk image.
*
- * Column-store internal page entries map one-to-one to the
- * number of physical entries on the page (each entry is a
- * location cookie).
+ * Column-store internal page entries map one-to-one to the number of physical entries on
+ * the page (each entry is a location cookie).
*/
alloc_entries = dsk->u.entries;
break;
@@ -191,14 +189,12 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32
F_SET_ATOMIC(page, flags);
/*
- * Track the memory allocated to build this page so we can update the
- * cache statistics in a single call. If the disk image is in allocated
- * memory, start with that.
+ * Track the memory allocated to build this page so we can update the cache statistics in a
+ * single call. If the disk image is in allocated memory, start with that.
*
- * Accounting is based on the page-header's in-memory disk size instead
- * of the buffer memory used to instantiate the page image even though
- * the values might not match exactly, because that's the only value we
- * have when discarding the page image and accounting needs to match.
+ * Accounting is based on the page-header's in-memory disk size instead of the buffer memory
+ * used to instantiate the page image even though the values might not match exactly, because
+ * that's the only value we have when discarding the page image and accounting needs to match.
*/
size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;
@@ -454,21 +450,16 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
break;
case WT_CELL_ADDR_DEL:
/*
- * A cell may reference a deleted leaf page: if a leaf
- * page was deleted without being read (fast truncate),
- * and the deletion committed, but older transactions
- * in the system required the previous version of the
- * page to remain available, a special deleted-address
- * type cell is written. We'll see that cell on a page
- * if we read from a checkpoint including a deleted
- * cell or if we crash/recover and start off from such
- * a checkpoint (absent running recovery, a version of
- * the page without the deleted cell would eventually
- * have been written). If we crash and recover to a
- * page with a deleted-address cell, we want to discard
- * the page from the backing store (it was never
- * discarded), and, of course, by definition no earlier
- * transaction will ever need it.
+ * A cell may reference a deleted leaf page: if a leaf page was deleted without being
+ * read (fast truncate), and the deletion committed, but older transactions in the
+ * system required the previous version of the page to remain available, a special
+ * deleted-address type cell is written. We'll see that cell on a page if we read from a
+ * checkpoint including a deleted cell or if we crash/recover and start off from such a
+ * checkpoint (absent running recovery, a version of the page without the deleted cell
+ * would eventually have been written). If we crash and recover to a page with a
+ * deleted-address cell, we want to discard the page from the backing store (it was
+ * never discarded), and, of course, by definition no earlier transaction will ever need
+ * it.
*
* Re-create the state of a deleted page.
*/
@@ -524,15 +515,14 @@ __inmem_row_leaf_entries(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, ui
btree = S2BT(session);
/*
- * Leaf row-store page entries map to a maximum of one-to-one to the
- * number of physical entries on the page (each physical entry might be
- * a key without a subsequent data item). To avoid over-allocation in
- * workloads without empty data items, first walk the page counting the
+ * Leaf row-store page entries map at most one-to-one to the number of physical entries
+ * on the page (each physical entry might be a key without a subsequent data item). To avoid
+ * over-allocation in workloads without empty data items, first walk the page counting the
* number of keys, then allocate the indices.
*
- * The page contains key/data pairs. Keys are on-page (WT_CELL_KEY) or
- * overflow (WT_CELL_KEY_OVFL) items, data are either non-existent or a
- * single on-page (WT_CELL_VALUE) or overflow (WT_CELL_VALUE_OVFL) item.
+ * The page contains key/data pairs. Keys are on-page (WT_CELL_KEY) or overflow
+ * (WT_CELL_KEY_OVFL) items, data are either non-existent or a single on-page (WT_CELL_VALUE) or
+ * overflow (WT_CELL_VALUE_OVFL) item.
*/
nindx = 0;
WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index ae2c64a126d..6acccf699a4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -314,18 +314,25 @@ __random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
for (i = __wt_random(&session->rnd) % WT_RANDOM_CURSOR_MOVE;;) {
ret = next ? __wt_btcur_next(cbt, false) : __wt_btcur_prev(cbt, false);
if (ret == WT_NOTFOUND) {
- next = false; /* Reverse direction from the end of the tree. */
- ret = __wt_btcur_prev(cbt, false);
- WT_RET(ret); /* An empty tree. */
+ next = !next; /* Reverse direction. */
+ ret = next ? __wt_btcur_next(cbt, false) : __wt_btcur_prev(cbt, false);
}
+ WT_RET(ret);
+
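
The fixed logic above bounces off either end of the tree by reversing the current direction once, failing only if both directions come up empty. In miniature, with assumed step helpers:

    #include <stdbool.h>

    #define NOTFOUND (-1) /* stand-in for WT_NOTFOUND */

    extern int step_next(void); /* assumed: 0 on success, NOTFOUND at the end */
    extern int step_prev(void);

    static int
    random_step(bool *nextp)
    {
        int ret;

        ret = *nextp ? step_next() : step_prev();
        if (ret == NOTFOUND) { /* hit the end of the tree: bounce */
            *nextp = !*nextp;
            ret = *nextp ? step_next() : step_prev();
        }
        return (ret); /* NOTFOUND here means the tree is empty */
    }
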
if (i > 0)
--i;
else {
/*
* Skip the record we returned last time, once. Clear the tracking value so we don't
* skip that record twice, it just means the tree is too small for anything reasonable.
+ *
+ * Testing WT_DATA_IN_ITEM requires explanation: the cursor temporary buffer is used to
+ * build keys for row-store searches and can point into the row-store page (which might
+ * have been freed subsequently). If a previous random call set the temporary buffer,
+ * then it will be local data. If it's local data for some other reason than a previous
+ * random call, we don't care: it won't match, and if it does we just retry.
*/
- if (cursor->key.size == cbt->tmp->size &&
+ if (WT_DATA_IN_ITEM(cbt->tmp) && cursor->key.size == cbt->tmp->size &&
memcmp(cursor->key.data, cbt->tmp->data, cbt->tmp->size) == 0) {
cbt->tmp->size = 0;
i = __wt_random(&session->rnd) % WT_RANDOM_CURSOR_MOVE;
@@ -391,15 +398,13 @@ restart:
}
/*
- * There may be empty pages in the tree, and they're useless to
- * us. If we don't find a non-empty page in "entries" random
- * guesses, take the first non-empty page in the tree. If the
- * search page contains nothing other than empty pages, restart
- * from the root some number of times before giving up.
+ * There may be empty pages in the tree, and they're useless to us. If we don't find a
+ * non-empty page in "entries" random guesses, take the first non-empty page in the tree. If
+ * the search page contains nothing other than empty pages, restart from the root some
+ * number of times before giving up.
*
- * Random sampling is looking for a key/value pair on a random
- * leaf page, and so will accept any page that contains a valid
- * key/value pair, so on-disk is fine, but deleted is not.
+ * Random sampling is looking for a key/value pair on a random leaf page, and so will accept
+ * any page that contains a valid key/value pair: on-disk is fine, but deleted is not.
*/
descent = NULL;
for (i = 0; i < entries; ++i) {
@@ -424,11 +429,10 @@ restart:
}
/*
- * Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search at the root.
+ * Swap the current page for the child page. If the page splits while we're retrieving it,
+ * restart the search at the root.
*
- * On other error, simply return, the swap call ensures we're
- * holding nothing on failure.
+ * On other error, simply return, the swap call ensures we're holding nothing on failure.
*/
descend:
if ((ret = __wt_page_swap(session, current, descent, flags)) == 0) {
@@ -517,12 +521,11 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
}
/*
- * Cursor through the tree, skipping past the sample size of the leaf
- * pages in the tree between each random key return to compensate for
- * unbalanced trees.
+ * Cursor through the tree, skipping past the sample size of the leaf pages in the tree between
+ * each random key return to compensate for unbalanced trees.
*
- * If the random descent attempt failed, we don't have a configured
- * sample size, use 100 for no particular reason.
+ * If the random descent attempt failed, we don't have a configured sample size, use 100 for no
+ * particular reason.
*/
if (cbt->next_random_sample_size == 0)
cbt->next_random_sample_size = 100;
@@ -549,19 +552,17 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
}
/*
- * Be paranoid about loop termination: first, if the last leaf page
- * skipped was also the last leaf page in the tree, skip may be set to
- * zero on return along with the NULL WT_REF end-of-walk condition.
- * Second, if a tree has no valid pages at all (the condition after
- * initial creation), we might make no progress at all, or finally, if
- * a tree has only deleted pages, we'll make progress, but never get a
- * useful WT_REF. And, of course, the tree can switch from one of these
- * states to another without warning. Decrement skip regardless of what
+ * Be paranoid about loop termination: first, if the last leaf page skipped was also the last
+ * leaf page in the tree, skip may be set to zero on return along with the NULL WT_REF
+ * end-of-walk condition. Second, if a tree has no valid pages at all (the condition after
+ * initial creation), we might make no progress at all, or finally, if a tree has only deleted
+ * pages, we'll make progress, but never get a useful WT_REF. And, of course, the tree can
+ * switch from one of these states to another without warning. Decrement skip regardless of what
* is happening in the search, guarantee we eventually quit.
*
- * Pages read for data sampling aren't "useful"; don't update the read
- * generation of pages already in memory, and if a page is read, set
- * its generation to a low value so it is evicted quickly.
+ * Pages read for data sampling aren't "useful"; don't update the read generation of pages
+ * already in memory, and if a page is read, set its generation to a low value so it is evicted
+ * quickly.
*/
for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
n = skip;
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index b21221439f6..e75680fc946 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -116,6 +116,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
WT_DECL_RET;
WT_ITEM las_key, las_value;
WT_PAGE *page;
+ WT_PAGE_LOOKASIDE *page_las;
WT_UPDATE *first_upd, *last_upd, *upd;
wt_timestamp_t durable_timestamp, las_timestamp;
size_t incr, total_incr;
@@ -131,7 +132,8 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
locked = false;
total_incr = 0;
current_recno = recno = WT_RECNO_OOB;
- las_pageid = ref->page_las->las_pageid;
+ page_las = ref->page_las;
+ las_pageid = page_las->las_pageid;
session_flags = 0; /* [-Werror=maybe-uninitialized] */
WT_CLEAR(las_key);
@@ -167,7 +169,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
* Confirm the search using the unique prefix; if not a match, we're done searching for
* records for this page.
*/
- if (las_pageid != ref->page_las->las_pageid)
+ if (las_pageid != page_las->las_pageid)
break;
/* Allocate the WT_UPDATE structure. */
@@ -265,12 +267,11 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
FLD_SET(page->modify->restore_state, WT_PAGE_RS_LOOKASIDE);
- if (ref->page_las->skew_newest && !ref->page_las->has_prepares &&
+ if (page_las->min_skipped_ts == WT_TS_MAX && !page_las->has_prepares &&
!S2C(session)->txn_global.has_stable_timestamp &&
- __wt_txn_visible_all(
- session, ref->page_las->unstable_txn, ref->page_las->unstable_durable_timestamp)) {
- page->modify->rec_max_txn = ref->page_las->max_txn;
- page->modify->rec_max_timestamp = ref->page_las->max_timestamp;
+ __wt_txn_visible_all(session, page_las->max_txn, page_las->max_ondisk_ts)) {
+ page->modify->rec_max_txn = page_las->max_txn;
+ page->modify->rec_max_timestamp = page_las->max_ondisk_ts;
__wt_page_modify_clear(session, page);
}
}
@@ -279,8 +280,8 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
* Now the lookaside history has been read into cache there is no further need to maintain a
* reference to it.
*/
- ref->page_las->eviction_to_lookaside = false;
- ref->page_las->resolved = true;
+ page_las->eviction_to_lookaside = false;
+ page_las->resolved = true;
err:
if (locked)
@@ -429,12 +430,10 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
WT_CLEAR(tmp);
/*
- * Attempt to set the state to WT_REF_READING for normal reads, or
- * WT_REF_LOCKED, for deleted pages or pages with lookaside entries.
- * The difference is that checkpoints can skip over clean pages that
- * are being read into cache, but need to wait for deletes or lookaside
- * updates to be resolved (in order for checkpoint to write the correct
- * version of the page).
+ * Attempt to set the state to WT_REF_READING for normal reads, or WT_REF_LOCKED, for deleted
+ * pages or pages with lookaside entries. The difference is that checkpoints can skip over clean
+ * pages that are being read into cache, but need to wait for deletes or lookaside updates to be
+ * resolved (in order for checkpoint to write the correct version of the page).
*
* If successful, we've won the race, read the page.
*/
@@ -488,15 +487,13 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
}
/*
- * Build the in-memory version of the page. Clear our local reference to
- * the allocated copy of the disk image on return, the in-memory object
- * steals it.
+ * Build the in-memory version of the page. Clear our local reference to the allocated copy of
+ * the disk image on return, the in-memory object steals it.
*
- * If a page is read with eviction disabled, we don't count evicting it
- * as progress. Since disabling eviction allows pages to be read even
- * when the cache is full, we want to avoid workloads repeatedly reading
- * a page with eviction disabled (e.g., a metadata page), then evicting
- * that page and deciding that is a sign that eviction is unstuck.
+ * If a page is read with eviction disabled, we don't count evicting it as progress. Since
+ * disabling eviction allows pages to be read even when the cache is full, we want to avoid
+ * workloads repeatedly reading a page with eviction disabled (e.g., a metadata page), then
+ * evicting that page and deciding that is a sign that eviction is unstuck.
*/
page_flags = WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED;
if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
@@ -543,7 +540,7 @@ skip_read:
* Don't free WT_REF.page_las, there may be concurrent readers.
*/
if (final_state == WT_REF_MEM && ref->page_las != NULL &&
- (!ref->page_las->skew_newest || ref->page_las->has_prepares))
+ (ref->page_las->min_skipped_ts != WT_TS_MAX || ref->page_las->has_prepares))
WT_ERR(__wt_las_remove_block(session, ref->page_las->las_pageid));
WT_REF_SET_STATE(ref, final_state);
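
The new condition, isolated as a predicate (names are local stand-ins for the WT_PAGE_LOOKASIDE fields):

    #include <stdbool.h>
    #include <stdint.h>

    #define TS_MAX UINT64_MAX /* stand-in for WT_TS_MAX */

    struct las_info { uint64_t min_skipped_ts; bool has_prepares; };

    /*
     * Remove the lookaside block once the page is in memory if history
     * was skipped or prepared updates are present.
     */
    static bool
    must_remove_las_block(const struct las_info *las)
    {
        return (las->min_skipped_ts != TS_MAX || las->has_prepares);
    }
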
@@ -682,9 +679,8 @@ read:
/*
* The page is in memory.
*
- * Get a hazard pointer if one is required. We cannot
- * be evicting if no hazard pointer is required, we're
- * done.
+ * Get a hazard pointer if one is required. We cannot be evicting if no hazard pointer
+ * is required, we're done.
*/
if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
goto skip_evict;
@@ -760,14 +756,11 @@ read:
skip_evict:
/*
- * If we read the page and are configured to not trash
- * the cache, and no other thread has already used the
- * page, set the read generation so the page is evicted
- * soon.
+ * If we read the page and are configured to not trash the cache, and no other thread
+ * has already used the page, set the read generation so the page is evicted soon.
*
- * Otherwise, if we read the page, or, if configured to
- * update the page's read generation and the page isn't
- * already flagged for forced eviction, update the page
+ * Otherwise, if we read the page, or, if configured to update the page's read
+ * generation and the page isn't already flagged for forced eviction, update the page
* read generation.
*/
page = ref->page;
@@ -780,17 +773,13 @@ read:
__wt_cache_read_gen_bump(session, page);
/*
- * Check if we need an autocommit transaction.
- * Starting a transaction can trigger eviction, so skip
- * it if eviction isn't permitted.
+ * Check if we need an autocommit transaction. Starting a transaction can trigger
+ * eviction, so skip it if eviction isn't permitted.
*
- * The logic here is a little weird: some code paths do
- * a blanket ban on checking the cache size in
- * sessions, but still require a transaction (e.g.,
- * when updating metadata or lookaside). If
- * WT_READ_IGNORE_CACHE_SIZE was passed in explicitly,
- * we're done. If we set WT_READ_IGNORE_CACHE_SIZE
- * because it was set in the session then make sure we
+ * The logic here is a little weird: some code paths do a blanket ban on checking the
+ * cache size in sessions, but still require a transaction (e.g., when updating metadata
+ * or lookaside). If WT_READ_IGNORE_CACHE_SIZE was passed in explicitly, we're done. If
+ * we set WT_READ_IGNORE_CACHE_SIZE because it was set in the session then make sure we
* start a transaction.
*/
return (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE) &&
diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c
index 304750bd1b5..ead542b77a1 100644
--- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c
+++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c
@@ -243,12 +243,12 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, const uint8_t *addr, size_t a
WT_PAGE *page;
/*
- * We need the first key from a leaf page. Leaf pages are relatively
- * complex (Huffman encoding, prefix compression, and so on), do the
- * work to instantiate the page and copy the first key to the buffer.
+ * We need the first key from a leaf page. Leaf pages are relatively complex (Huffman encoding,
+ * prefix compression, and so on), do the work to instantiate the page and copy the first key to
+ * the buffer.
*
- * Page flags are 0 because we aren't releasing the memory used to read
- * the page into memory and we don't want page discard to free it.
+ * Page flags are 0 because we aren't releasing the memory used to read the page into memory and
+ * we don't want page discard to free it.
*/
WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len));
WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, false, &page));
@@ -296,13 +296,12 @@ __rebalance_row_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, const
break;
case WT_CELL_KEY_OVFL:
/*
- * Any overflow key that references an internal page is
- * of no further use, schedule its blocks to be freed.
+ * Any overflow key that references an internal page is of no further use, schedule its
+ * blocks to be freed.
*
- * We could potentially use the same overflow key being
- * freed here for the internal page we're creating, but
- * that's more work to get reconciliation to understand
- * and overflow keys are (well, should be), uncommon.
+ * We could potentially use the same overflow key being freed here for the internal page
+ * we're creating, but that's more work to get reconciliation to understand and overflow
+ * keys are (well, should be) uncommon.
*/
__wt_verbose(session, WT_VERB_REBALANCE, "free-list append overflow key: %s",
__wt_addr_string(session, unpack.data, unpack.size, rs->tmp1));
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
index 829a4c3a9f3..d9d1d8263a8 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ret.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -137,9 +137,8 @@ __wt_value_return_upd(
allocated_bytes = 0;
/*
- * We're passed a "standard" or "modified" update that's visible to us.
- * Our caller should have already checked for deleted items (we're too
- * far down the call stack to return not-found).
+ * We're passed a "standard" or "modified" update that's visible to us. Our caller should have
+ * already checked for deleted items (we're too far down the call stack to return not-found).
*
* Fast path if it's a standard item, assert our caller's behavior.
*/
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index 5ca21d61001..ea54d449576 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -170,19 +170,15 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root)
config = NULL;
/*
- * XXX
- * The salvage process reads and discards previous checkpoints, so the
- * underlying block manager has to ignore any previous checkpoint
- * entries when creating a new checkpoint. In other words, we can't use
- * the metadata checkpoint list, it lists the previous checkpoints and
- * we don't care about them. Build a clean checkpoint list and use it
- * instead.
+ * XXX The salvage process reads and discards previous checkpoints, so the underlying block
+ * manager has to ignore any previous checkpoint entries when creating a new checkpoint. In
+ * other words, we can't use the metadata checkpoint list, it lists the previous checkpoints and
+ * we don't care about them. Build a clean checkpoint list and use it instead.
*
- * Don't first clear the metadata checkpoint list and call the function
- * to get a list of checkpoints: a crash between clearing the metadata
- * checkpoint list and creating a new checkpoint list would look like a
- * create or open of a file without a checkpoint to roll-forward from,
- * and the contents of the file would be discarded.
+ * Don't first clear the metadata checkpoint list and call the function to get a list of
+ * checkpoints: a crash between clearing the metadata checkpoint list and creating a new
+ * checkpoint list would look like a create or open of a file without a checkpoint to
+ * roll-forward from, and the contents of the file would be discarded.
*/
WT_RET(__wt_calloc_def(session, 2, &ckptbase));
WT_ERR(__wt_strdup(session, WT_CHECKPOINT, &ckptbase->name));
@@ -209,11 +205,11 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root)
}
/*
- * If no checkpoint was created, clear all recorded checkpoints for the
- * file. This is expected if we didn't find any leaf pages to salvage.
+ * If no checkpoint was created, clear all recorded checkpoints for the file. This is expected
+ * if we didn't find any leaf pages to salvage.
*
- * If a checkpoint was created, life is good, replace any existing list
- * of checkpoints with the single new one.
+ * If a checkpoint was created, life is good, replace any existing list of checkpoints with the
+ * single new one.
*/
if (ckptbase->raw.data == NULL)
WT_TRET(__wt_meta_checkpoint_clear(session, dhandle->name));
@@ -259,13 +255,11 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(bm->salvage_start(bm, session));
/*
- * Step 2:
- * Read the file and build in-memory structures that reference any leaf
- * or overflow page. Any pages other than leaf or overflow pages are
- * added to the free list.
+ * Step 2: Read the file and build in-memory structures that reference any leaf or overflow
+ * page. Any pages other than leaf or overflow pages are added to the free list.
*
- * Turn off read checksum and verification error messages while we're
- * reading the file, we expect to see corrupted blocks.
+ * Turn off read checksum and verification error messages while we're reading the file, we
+ * expect to see corrupted blocks.
*/
F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
ret = __slvg_read(session, ss);
@@ -348,12 +342,11 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
}
/*
- * Step 7:
- * Build an internal page that references all of the leaf pages,
- * and write it, as well as any merged pages, to the file.
+ * Step 7: Build an internal page that references all of the leaf pages, and write it, as well
+ * as any merged pages, to the file.
*
- * Count how many leaf pages we have (we could track this during the
- * array shuffling/splitting, but that's a lot harder).
+ * Count how many leaf pages we have (we could track this during the array shuffling/splitting,
+ * but that's a lot harder).
*/
for (leaf_cnt = i = 0; i < ss->pages_next; ++i)
if (ss->pages[i] != NULL)
@@ -439,10 +432,9 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
WT_ERR(__wt_progress(session, NULL, ss->fcnt));
/*
- * Read (and potentially decompress) the block; the underlying
- * block manager might return only good blocks if checksums are
- * configured, or both good and bad blocks if we're relying on
- * compression.
+ * Read (and potentially decompress) the block; the underlying block manager might return
+ * only good blocks if checksums are configured, or both good and bad blocks if we're
+ * relying on compression.
*
* Report the block's status to the block manager.
*/
@@ -464,11 +456,10 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
/*
* Make sure it's an expected page type for the file.
*
- * We only care about leaf and overflow pages from here on out;
- * discard all of the others. We put them on the free list now,
- * because we might as well overwrite them, we want the file to
- * grow as little as possible, or shrink, and future salvage
- * calls don't need them either.
+ * We only care about leaf and overflow pages from here on out; discard all of the others.
+ * We put them on the free list now, because we might as well overwrite them, we want the
+ * file to grow as little as possible, or shrink, and future salvage calls don't need them
+ * either.
*/
dsk = buf->data;
switch (dsk->type) {
@@ -617,16 +608,13 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint8_t *ad
break;
case WT_PAGE_ROW_LEAF:
/*
- * Row-store format: copy the first and last keys on the page.
- * Keys are prefix-compressed, the simplest and slowest thing
- * to do is instantiate the in-memory page, then instantiate
- * and copy the full keys, then free the page. We do this on
- * every leaf page, and if you need to speed up the salvage,
- * it's probably a great place to start.
+ * Row-store format: copy the first and last keys on the page. Keys are prefix-compressed,
+ * the simplest and slowest thing to do is instantiate the in-memory page, then instantiate
+ * and copy the full keys, then free the page. We do this on every leaf page, and if you
+ * need to speed up the salvage, it's probably a great place to start.
*
- * Page flags are 0 because we aren't releasing the memory used
- * to read the page into memory and we don't want page discard
- * to free it.
+ * Page flags are 0 because we aren't releasing the memory used to read the page into memory
+ * and we don't want page discard to free it.
*/
WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, false, &page));
WT_ERR(__wt_row_leaf_key_copy(session, page, &page->pg_row[0], &trk->row_start));
@@ -768,16 +756,14 @@ __slvg_col_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
uint32_t i, j;
/*
- * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
- * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
- * BEING HANDLED.
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR COLUMN-STORE CODE: THEY
+ * ARE IDENTICAL OTHER THAN THE PAGES THAT ARE BEING HANDLED.
*
- * Walk the page array looking for overlapping key ranges, adjusting
- * the ranges based on the LSN until there are no overlaps.
+ * Walk the page array looking for overlapping key ranges, adjusting the ranges based on the LSN
+ * until there are no overlaps.
*
- * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
- * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
- * PLUS OFFSET.
+ * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE AS ENTRIES ARE SPLIT, SO
+ * ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE PLUS OFFSET.
*/
for (i = 0; i < ss->pages_next; ++i) {
if (ss->pages[i] == NULL)
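
The capitalized warning above is the whole contract: a plain-C sketch (illustrative only, not WiredTiger code) of why a saved element pointer goes stale when the array is grown or re-sorted, while base-plus-offset indexing stays valid:

    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        size_t slot = 2;
        int *pages = malloc(4 * sizeof(int)), *tmp;

        pages[0] = 10; pages[1] = 20; pages[2] = 30; pages[3] = 40;

        int *stale = &pages[slot]; /* BAD: pointer into the array. */
        (void)stale;               /* Dereferencing it after realloc is UB. */

        if ((tmp = realloc(pages, 8 * sizeof(int))) != NULL)
            pages = tmp;           /* The array may have moved. */

        printf("%d\n", pages[slot]); /* GOOD: array base plus offset. */
        free(pages);
        return (0);
    }
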
@@ -951,12 +937,10 @@ __slvg_col_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_s
}
/*
- * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
- * Split a_trk into two parts, the key range before b_trk and the
- * key range after b_trk.
+ * Case #5: b_trk is more desirable and is a middle chunk of a_trk. Split a_trk into two parts,
+ * the key range before b_trk and the key range after b_trk.
*
- * Allocate a new WT_TRACK object, and extend the array of pages as
- * necessary.
+ * Allocate a new WT_TRACK object, and extend the array of pages as necessary.
*/
WT_RET(__wt_calloc_one(session, &new));
if ((ret = __wt_realloc_def(session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)) !=
@@ -1356,16 +1340,14 @@ __slvg_row_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
btree = S2BT(session);
/*
- * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
- * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
- * BEING HANDLED.
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR COLUMN-STORE CODE: THEY
+ * ARE IDENTICAL OTHER THAN THE PAGES THAT ARE BEING HANDLED.
*
- * Walk the page array looking for overlapping key ranges, adjusting
- * the ranges based on the LSN until there are no overlaps.
+ * Walk the page array looking for overlapping key ranges, adjusting the ranges based on the LSN
+ * until there are no overlaps.
*
- * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
- * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
- * PLUS OFFSET.
+ * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE AS ENTRIES ARE SPLIT, SO
+ * ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE PLUS OFFSET.
*/
for (i = 0; i < ss->pages_next; ++i) {
if (ss->pages[i] == NULL)
@@ -1550,12 +1532,10 @@ __slvg_row_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_s
}
/*
- * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
- * Split a_trk into two parts, the key range before b_trk and the
- * key range after b_trk.
+ * Case #5: b_trk is more desirable and is a middle chunk of a_trk. Split a_trk into two parts,
+ * the key range before b_trk and the key range after b_trk.
*
- * Allocate a new WT_TRACK object, and extend the array of pages as
- * necessary.
+ * Allocate a new WT_TRACK object, and extend the array of pages as necessary.
*/
WT_RET(__wt_calloc_one(session, &new));
if ((ret = __wt_realloc_def(session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)) !=
@@ -1819,19 +1799,16 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref, WT_S
page = ref->page;
/*
- * Figure out how many page keys we want to take and how many we want
- * to skip.
+ * Figure out how many page keys we want to take and how many we want to skip.
*
- * If checking the starting range key, the key we're searching for will
- * be equal to the starting range key. This is because we figured out
- * the true merged-page start key as part of discarding initial keys
- * from the page (see the __slvg_row_range_overlap function, and its
+ * If checking the starting range key, the key we're searching for will be equal to the starting
+ * range key. This is because we figured out the true merged-page start key as part of
+ * discarding initial keys from the page (see the __slvg_row_range_overlap function, and its
* calls to __slvg_row_trk_update_start for more information).
*
- * If checking the stopping range key, we want the keys on the page that
- * are less-than the stopping range key. This is because we copied a
- * key from another page to define this page's stop range: that page is
- * the page that owns the "equal to" range space.
+ * If checking the stopping range key, we want the keys on the page that are less-than the
+ * stopping range key. This is because we copied a key from another page to define this page's
+ * stop range: that page is the page that owns the "equal to" range space.
*/
skip_start = skip_stop = 0;
if (F_ISSET(trk, WT_TRACK_CHECK_START))
@@ -2043,11 +2020,10 @@ __slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss)
slot = NULL;
/*
- * If an overflow page is referenced more than once, discard leaf pages
- * with the lowest LSNs until overflow pages are only referenced once.
+ * If an overflow page is referenced more than once, discard leaf pages with the lowest LSNs
+ * until overflow pages are only referenced once.
*
- * This requires sorting the page list by LSN, and the overflow array by
- * address cookie.
+ * This requires sorting the page list by LSN, and the overflow array by address cookie.
*/
__wt_qsort(ss->pages, (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen);
__wt_qsort(ss->ovfl, (size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr);
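
The two sorts above follow the standard qsort(3) comparator pattern over arrays of pointers; a minimal generic sketch (struct and field names are hypothetical, not the WT_TRACK internals):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct track {
        uint64_t gen;           /* LSN-like generation */
        unsigned char addr[8];  /* address cookie */
    };

    static int
    compare_gen(const void *a, const void *b)
    {
        const struct track *ta = *(struct track *const *)a;
        const struct track *tb = *(struct track *const *)b;

        /* Descending: newer generations sort first. */
        return ((ta->gen < tb->gen) - (ta->gen > tb->gen));
    }

    static int
    compare_addr(const void *a, const void *b)
    {
        const struct track *ta = *(struct track *const *)a;
        const struct track *tb = *(struct track *const *)b;

        return (memcmp(ta->addr, tb->addr, sizeof(ta->addr)));
    }

    /* Usage: qsort(pages, npages, sizeof(struct track *), compare_gen); */
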
@@ -2261,11 +2237,11 @@ __slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss)
uint32_t i;
/*
- * Walk the overflow page array: if an overflow page isn't referenced,
- * add its file blocks to the free list.
+ * Walk the overflow page array: if an overflow page isn't referenced, add its file blocks to
+ * the free list.
*
- * Clear the reference flag (it's reused to figure out if the overflow
- * record is referenced, but never used, by merged pages).
+ * Clear the reference flag (it's reused to figure out if the overflow record is referenced, but
+ * never used, by merged pages).
*/
for (i = 0; i < ss->ovfl_next; ++i) {
if ((trk = ss->ovfl[i]) == NULL)
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 141eb78d8b4..f22036e1ebb 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -210,21 +210,17 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, WT_REF **from_ref
addr = NULL;
/*
- * The from-home argument is the page into which the "from" WT_REF may
- * point, for example, if there's an on-page key the "from" WT_REF
- * references, it will be on the page "from-home".
+ * The from-home argument is the page into which the "from" WT_REF may point, for example, if
+ * there's an on-page key the "from" WT_REF references, it will be on the page "from-home".
*
- * Instantiate row-store keys, and column- and row-store addresses in
- * the WT_REF structures referenced by a page that's being split. The
- * WT_REF structures aren't moving, but the index references are moving
- * from the page we're splitting to a set of new pages, and so we can
- * no longer reference the block image that remains with the page being
- * split.
+ * Instantiate row-store keys, and column- and row-store addresses in the WT_REF structures
+ * referenced by a page that's being split. The WT_REF structures aren't moving, but the index
+ * references are moving from the page we're splitting to a set of new pages, and so we can no
+ * longer reference the block image that remains with the page being split.
*
- * No locking is required to update the WT_REF structure because we're
- * the only thread splitting the page, and there's no way for readers
- * to race with our updates of single pointers. The changes have to be
- * written before the page goes away, of course, our caller owns that
+ * No locking is required to update the WT_REF structure because we're the only thread splitting
+ * the page, and there's no way for readers to race with our updates of single pointers. The
+ * changes have to be written before the page goes away, of course, our caller owns that
* problem.
*/
if (from_home->type == WT_PAGE_ROW_INT) {
@@ -336,22 +332,19 @@ __split_ref_prepare(
locked = NULL;
/*
- * Update the moved WT_REFs so threads moving through them start looking
- * at the created children's page index information. Because we've not
- * yet updated the page index of the parent page into which we are going
- * to split this subtree, a cursor moving through these WT_REFs will
- * ascend into the created children, but eventually fail as that parent
- * page won't yet know about the created children pages. That's OK, we
- * spin there until the parent's page index is updated.
+ * Update the moved WT_REFs so threads moving through them start looking at the created
+ * children's page index information. Because we've not yet updated the page index of the parent
+ * page into which we are going to split this subtree, a cursor moving through these WT_REFs
+ * will ascend into the created children, but eventually fail as that parent page won't yet know
+ * about the created children pages. That's OK, we spin there until the parent's page index is
+ * updated.
*
- * Lock the newly created page to ensure none of its children can split.
- * First, to ensure all of the child pages are updated before any pages
- * can split. Second, to ensure the original split completes before any
- * of the children can split. The latter involves split generations:
- * the original split page has references to these children. If they
- * split immediately, they could free WT_REF structures based on split
- * generations earlier than the split generation we'll eventually choose
- * to protect the original split page's previous page index.
+ * Lock the newly created page to ensure none of its children can split. First, to ensure all of
+ * the child pages are updated before any pages can split. Second, to ensure the original split
+ * completes before any of the children can split. The latter involves split generations: the
+ * original split page has references to these children. If they split immediately, they could
+ * free WT_REF structures based on split generations earlier than the split generation we'll
+ * eventually choose to protect the original split page's previous page index.
*/
alloc = cnt = 0;
for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
@@ -535,16 +528,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_2);
/*
- * Get a generation for this split, mark the root page. This must be
- * after the new index is swapped into place in order to know that no
- * readers are looking at the old index.
+ * Get a generation for this split, mark the root page. This must be after the new index is
+ * swapped into place in order to know that no readers are looking at the old index.
*
- * Note: as the root page cannot currently be evicted, the root split
- * generation isn't ever used. That said, it future proofs eviction
- * and isn't expensive enough to special-case.
+ * Note: as the root page cannot currently be evicted, the root split generation isn't ever
+ * used. That said, it future-proofs eviction and isn't expensive enough to special-case.
*
- * Getting a new split generation implies a full barrier, no additional
- * barrier is needed.
+ * Getting a new split generation implies a full barrier, no additional barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
root->pg_intl_split_gen = split_gen;
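
As a hedged sketch of the generation idea (not the WiredTiger implementation): a sequentially-consistent fetch-and-add both hands out the new generation and supplies the full barrier the comment relies on:

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint64_t split_generation = 1;

    static uint64_t
    gen_next(void)
    {
        /* Returns the new generation; the seq_cst RMW is a full barrier. */
        return (atomic_fetch_add(&split_generation, 1) + 1);
    }
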
@@ -561,14 +551,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
complete = WT_ERR_IGNORE;
/*
- * We can't free the previous root's index, there may be threads using
- * it. Add to the session's discard list, to be freed once we know no
- * threads can still be using it.
+ * We can't free the previous root's index, there may be threads using it. Add to the session's
+ * discard list, to be freed once we know no threads can still be using it.
*
- * This change requires care with error handling: we have already
- * updated the page with a new index. Even if stashing the old value
- * fails, we don't roll back that change, because threads may already
- * be using the new index.
+ * This change requires care with error handling: we have already updated the page with a new
+ * index. Even if stashing the old value fails, we don't roll back that change, because threads
+ * may already be using the new index.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
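
A minimal sketch of the stash-then-free pattern this comment describes, assuming a hypothetical per-session list and an oldest-reader generation supplied by the caller:

    #include <stdint.h>
    #include <stdlib.h>

    struct stash {
        uint64_t gen;        /* generation when retired */
        void *p;             /* memory awaiting free */
        struct stash *next;
    };

    static void
    stash_discard(struct stash **listp, uint64_t oldest_reader_gen)
    {
        struct stash *s, **sp;

        for (sp = listp; (s = *sp) != NULL;)
            if (s->gen < oldest_reader_gen) {
                *sp = s->next; /* unlink, then free both pieces */
                free(s->p);
                free(s);
            } else
                sp = &s->next;
    }
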
@@ -644,14 +632,13 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
parent_entries = pindex->entries;
/*
- * Remove any refs to deleted pages while we are splitting, we have the
- * internal page locked down, and are copying the refs into a new array
- * anyway. Switch them to the special split state, so that any reading
- * thread will restart.
+ * Remove any refs to deleted pages while we are splitting, we have the internal page locked
+ * down, and are copying the refs into a new array anyway. Switch them to the special split
+ * state, so that any reading thread will restart.
*
- * We can't do this if there is a sync running in the tree in another
- * session: removing the refs frees the blocks for the deleted pages,
- * which can corrupt the free list calculated by the sync.
+ * We can't do this if there is a sync running in the tree in another session: removing the refs
+ * frees the blocks for the deleted pages, which can corrupt the free list calculated by the
+ * sync.
*/
WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
@@ -687,14 +674,12 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
}
/*
- * Allocate and initialize a new page index array for the parent, then
- * copy references from the original index array, plus references from
- * the newly created split array, into place.
+ * Allocate and initialize a new page index array for the parent, then copy references from the
+ * original index array, plus references from the newly created split array, into place.
*
- * Update the WT_REF's page-index hint as we go. This can race with a
- * thread setting the hint based on an older page-index, and the change
- * isn't backed out in the case of an error, so there ways for the hint
- * to be wrong; OK because it's just a hint.
+ * Update the WT_REF's page-index hint as we go. This can race with a thread setting the hint
+ * based on an older page-index, and the change isn't backed out in the case of an error, so
+ * there are ways for the hint to be wrong; OK because it's just a hint.
*/
size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
@@ -737,12 +722,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
__wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_4);
/*
- * Get a generation for this split, mark the page. This must be after
- * the new index is swapped into place in order to know that no readers
- * are looking at the old index.
+ * Get a generation for this split, mark the page. This must be after the new index is swapped
+ * into place in order to know that no readers are looking at the old index.
*
- * Getting a new split generation implies a full barrier, no additional
- * barrier is needed.
+ * Getting a new split generation implies a full barrier, no additional barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
parent->pg_intl_split_gen = split_gen;
@@ -798,18 +781,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
/*
- * We set the WT_REF to split, discard it, freeing any resources
- * it holds.
+ * We set the WT_REF to split, discard it, freeing any resources it holds.
*
- * Row-store trees where the old version of the page is being
- * discarded: the previous parent page's key for this child page
- * may have been an on-page overflow key. In that case, if the
- * key hasn't been deleted, delete it now, including its backing
- * blocks. We are exchanging the WT_REF that referenced it for
- * the split page WT_REFs and their keys, and there's no longer
- * any reference to it. Done after completing the split (if we
- * failed, we'd leak the underlying blocks, but the parent page
- * would be unaffected).
+ * Row-store trees where the old version of the page is being discarded: the previous parent
+ * page's key for this child page may have been an on-page overflow key. In that case, if
+ * the key hasn't been deleted, delete it now, including its backing blocks. We are
+ * exchanging the WT_REF that referenced it for the split page WT_REFs and their keys, and
+ * there's no longer any reference to it. Done after completing the split (if we failed,
+ * we'd leak the underlying blocks, but the parent page would be unaffected).
*/
if (parent->type == WT_PAGE_ROW_INT) {
WT_TRET(__split_ovfl_key_cleanup(session, parent, next_ref));
@@ -837,7 +816,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
/* Free the backing block and address. */
WT_TRET(__wt_ref_block_free(session, next_ref));
- WT_ASSERT(session, __wt_hazard_check_assert(session, next_ref, false));
WT_TRET(__split_safe_free(session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
parent_decr += sizeof(WT_REF);
}
@@ -956,14 +934,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
(void *)page, pindex->entries, children, (void *)parent);
/*
- * Ideally, we'd discard the original page, but that's hard since other
- * threads of control are using it (for example, if eviction is walking
- * the tree and looking at the page.) Instead, perform a right-split,
- * moving all except the first chunk of the page's WT_REF objects to new
+ * Ideally, we'd discard the original page, but that's hard since other threads of control are
+ * using it (for example, if eviction is walking the tree and looking at the page). Instead,
+ * perform a right-split, moving all except the first chunk of the page's WT_REF objects to new
* pages.
*
- * Create and initialize a replacement WT_PAGE_INDEX for the original
- * page.
+ * Create and initialize a replacement WT_PAGE_INDEX for the original page.
*/
size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &replace_index));
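
The header-plus-N-pointers size computation above is the classic C99 flexible-array-member allocation; an illustrative sketch with hypothetical types:

    #include <stdint.h>
    #include <stdlib.h>

    struct page_index {
        uint32_t entries;
        void *index[];       /* flexible array member */
    };

    static struct page_index *
    page_index_alloc(uint32_t entries)
    {
        struct page_index *pindex;

        pindex = calloc(1, sizeof(struct page_index) + entries * sizeof(void *));
        if (pindex != NULL)
            pindex->entries = entries;
        return (pindex);
    }
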
@@ -974,11 +950,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
replace_index->index[i] = *page_refp++;
/*
- * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted
- * into the page's parent, replacing the page's page-index.
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted into the page's parent,
+ * replacing the page's page-index.
*
- * The first slot of the new WT_PAGE_INDEX is the original page WT_REF.
- * The remainder of the slots are allocated WT_REFs.
+ * The first slot of the new WT_PAGE_INDEX is the original page WT_REF. The remainder of the
+ * slots are allocated WT_REFs.
*/
size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
@@ -1074,12 +1050,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_6);
/*
- * Get a generation for this split, mark the parent page. This must be
- * after the new index is swapped into place in order to know that no
- * readers are looking at the old index.
+ * Get a generation for this split, mark the parent page. This must be after the new index is
+ * swapped into place in order to know that no readers are looking at the old index.
*
- * Getting a new split generation implies a full barrier, no additional
- * barrier is needed.
+ * Getting a new split generation implies a full barrier, no additional barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
page->pg_intl_split_gen = split_gen;
@@ -1102,14 +1076,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_free(session, alloc_index);
/*
- * We can't free the previous page's index, there may be threads using
- * it. Add to the session's discard list, to be freed once we know no
- * threads can still be using it.
+ * We can't free the previous page's index, there may be threads using it. Add to the session's
+ * discard list, to be freed once we know no threads can still be using it.
*
- * This change requires care with error handling, we've already updated
- * the parent page. Even if stashing the old value fails, we don't roll
- * back that change, because threads may already be using the new parent
- * page.
+ * This change requires care with error handling, we've already updated the parent page. Even if
+ * stashing the old value fails, we don't roll back that change, because threads may already be
+ * using the new parent page.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
@@ -1133,22 +1105,18 @@ err:
__wt_free(session, replace_index);
/*
- * The alloc-index variable is the array of new WT_REF entries
- * intended to be inserted into the page being split's parent.
+ * The alloc-index variable is the array of new WT_REF entries intended to be inserted into
+ * the page being split's parent.
*
- * Except for the first slot (the original page's WT_REF), it's
- * an array of newly allocated combined WT_PAGE_INDEX and WT_REF
- * structures, each of which references a newly allocated (and
- * modified) child page, each of which references an index of
- * WT_REFs from the page being split. Free everything except for
- * slot 1 and the WT_REFs in the child page indexes.
+ * Except for the first slot (the original page's WT_REF), it's an array of newly allocated
+ * combined WT_PAGE_INDEX and WT_REF structures, each of which references a newly allocated
+ * (and modified) child page, each of which references an index of WT_REFs from the page
+ * being split. Free everything except for slot 1 and the WT_REFs in the child page indexes.
*
- * First, skip slot 1. Second, we want to free all of the child
- * pages referenced from the alloc-index array, but we can't
- * just call the usual discard function because the WT_REFs
- * referenced by the child pages remain referenced by the
- * original page, after error. For each entry, free the child
- * page's page index (so the underlying page-free function will
+ * First, skip slot 1. Second, we want to free all of the child pages referenced from the
+ * alloc-index array, but we can't just call the usual discard function because the WT_REFs
+ * referenced by the child pages remain referenced by the original page, after error. For
+ * each entry, free the child page's page index (so the underlying page-free function will
* ignore it), then call the general-purpose discard function.
*/
if (alloc_index == NULL)
@@ -1205,19 +1173,15 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, WT_PA
return (__wt_set_return(session, EBUSY));
/*
- * Get a page-level lock on the parent to single-thread splits into the
- * page because we need to single-thread sizing/growing the page index.
- * It's OK to queue up multiple splits as the child pages split, but the
- * actual split into the parent has to be serialized. Note we allocate
- * memory inside of the lock and may want to invest effort in making the
- * locked period shorter.
+ * Get a page-level lock on the parent to single-thread splits into the page because we need to
+ * single-thread sizing/growing the page index. It's OK to queue up multiple splits as the child
+ * pages split, but the actual split into the parent has to be serialized. Note we allocate
+ * memory inside of the lock and may want to invest effort in making the locked period shorter.
*
- * We use the reconciliation lock here because not only do we have to
- * single-thread the split, we have to lock out reconciliation of the
- * parent because reconciliation of the parent can't deal with finding
- * a split child during internal page traversal. Basically, there's no
- * reason to use a different lock if we have to block reconciliation
- * anyway.
+ * We use the reconciliation lock here because not only do we have to single-thread the split,
+ * we have to lock out reconciliation of the parent because reconciliation of the parent can't
+ * deal with finding a split child during internal page traversal. Basically, there's no reason
+ * to use a different lock if we have to block reconciliation anyway.
*/
for (;;) {
parent = ref->home;
@@ -1327,20 +1291,18 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * Page splits trickle up the tree, that is, as leaf pages grow large
- * enough and are evicted, they'll split into their parent. And, as
- * that parent page grows large enough and is evicted, it splits into
- * its parent and so on. When the page split wave reaches the root,
- * the tree will permanently deepen as multiple root pages are written.
+ * Page splits trickle up the tree, that is, as leaf pages grow large enough and are evicted,
+ * they'll split into their parent. And, as that parent page grows large enough and is evicted,
+ * it splits into its parent and so on. When the page split wave reaches the root, the tree will
+ * permanently deepen as multiple root pages are written.
*
- * However, this only helps if internal pages are evicted (and we resist
- * evicting internal pages for obvious reasons), or if the tree were to
- * be closed and re-opened from a disk image, which may be a rare event.
+ * However, this only helps if internal pages are evicted (and we resist evicting internal pages
+ * for obvious reasons), or if the tree were to be closed and re-opened from a disk image, which
+ * may be a rare event.
*
- * To avoid internal pages becoming too large absent eviction, check
- * parent pages each time pages are split into them. If the page is big
- * enough, either split the page into its parent or, in the case of the
- * root, deepen the tree.
+ * To avoid internal pages becoming too large absent eviction, check parent pages each time
+ * pages are split into them. If the page is big enough, either split the page into its parent
+ * or, in the case of the root, deepen the tree.
*
* Split up the tree.
*/
@@ -1440,19 +1402,17 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT
WT_ASSERT(session, orig->type != WT_PAGE_COL_VAR || ref->ref_recno != 0);
/*
- * This code re-creates an in-memory page from a disk image, and adds
- * references to any unresolved update chains to the new page. We get
- * here either because an update could not be written when evicting a
- * page, or eviction chose to keep a page in memory.
+ * This code re-creates an in-memory page from a disk image, and adds references to any
+ * unresolved update chains to the new page. We get here either because an update could not be
+ * written when evicting a page, or eviction chose to keep a page in memory.
*
- * Reconciliation won't create a disk image with entries the running
- * database no longer cares about (at least, not based on the current
- * tests we're performing), ignore the validity window.
+ * Reconciliation won't create a disk image with entries the running database no longer cares
+ * about (at least, not based on the current tests we're performing), ignore the validity
+ * window.
*
- * Steal the disk image and link the page into the passed-in WT_REF to
- * simplify error handling: our caller will not discard the disk image
- * when discarding the original page, and our caller will discard the
- * allocated page on error, when discarding the allocated WT_REF.
+ * Steal the disk image and link the page into the passed-in WT_REF to simplify error handling:
+ * our caller will not discard the disk image when discarding the original page, and our caller
+ * will discard the allocated page on error, when discarding the allocated WT_REF.
*/
WT_RET(__wt_page_inmem(session, ref, multi->disk_image, WT_PAGE_DISK_ALLOC, false, &page));
multi->disk_image = NULL;
@@ -1592,14 +1552,13 @@ static void
__split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref)
{
/*
- * We failed creating new in-memory pages. For error-handling reasons,
- * we've left the update chains referenced by both the original and
- * new pages. Discard the newly allocated WT_REF structures and their
- * pages (setting a flag so the discard code doesn't discard the updates
- * on the page).
+ * We failed creating new in-memory pages. For error-handling reasons, we've left the update
+ * chains referenced by both the original and new pages. Discard the newly allocated WT_REF
+ * structures and their pages (setting a flag so the discard code doesn't discard the updates on
+ * the page).
*
- * Our callers allocate WT_REF arrays, then individual WT_REFs, check
- * for uninitialized information.
+ * Our callers allocate WT_REF arrays, then individual WT_REFs, check for uninitialized
+ * information.
*/
if (ref != NULL) {
if (ref->page != NULL)
@@ -1667,9 +1626,9 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_R
/*
* If there's an address, the page was written, set it.
*
- * Copy the address: we could simply take the buffer, but that would
- * complicate error handling, freeing the reference array would have
- * to avoid freeing the memory, and it's not worth the confusion.
+ * Copy the address: we could simply take the buffer, but that would complicate error handling,
+ * freeing the reference array would have to avoid freeing the memory, and it's not worth the
+ * confusion.
*/
if (multi->addr.addr != NULL) {
WT_RET(__wt_calloc_one(session, &addr));
@@ -1703,8 +1662,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_R
}
/*
- * If we have a disk image and we're not closing the file,
- * re-instantiate the page.
+ * If we have a disk image and we're not closing the file, re-instantiate the page.
*
* Discard any page image we don't use.
*/
@@ -1743,9 +1701,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
type = page->type;
/*
- * Assert splitting makes sense; specifically assert the page is dirty,
- * we depend on that, otherwise the page might be evicted based on its
- * last reconciliation which no longer matches reality after the split.
+ * Assert splitting makes sense; specifically assert the page is dirty, we depend on that,
+ * otherwise the page might be evicted based on its last reconciliation which no longer matches
+ * reality after the split.
*
* Note this page has already been through an in-memory split.
*/
@@ -1762,16 +1720,14 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
moved_ins = WT_SKIP_LAST(ins_head);
/*
- * The first page in the split is the current page, but we still have
- * to create a replacement WT_REF, the original WT_REF will be set to
- * split status and eventually freed.
+ * The first page in the split is the current page, but we still have to create a replacement
+ * WT_REF, the original WT_REF will be set to split status and eventually freed.
*
- * The new WT_REF is not quite identical: we have to instantiate a key,
- * and the new reference is visible to readers once the split completes.
+ * The new WT_REF is not quite identical: we have to instantiate a key, and the new reference is
+ * visible to readers once the split completes.
*
- * Don't copy any deleted page state: we may be splitting a page that
- * was instantiated after a truncate and that history should not be
- * carried onto these new child pages.
+ * Don't copy any deleted page state: we may be splitting a page that was instantiated after a
+ * truncate and that history should not be carried onto these new child pages.
*/
WT_ERR(__wt_calloc_one(session, &split_ref[0]));
parent_incr += sizeof(WT_REF);
@@ -1870,12 +1826,10 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_MEM_TRANSFER(page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd));
/*
- * Move the last insert list item from the original page to the new
- * page.
+ * Move the last insert list item from the original page to the new page.
*
- * First, update the item to the new child page. (Just append the entry
- * for simplicity, the previous skip list pointers originally allocated
- * can be ignored.)
+ * First, update the item to the new child page. (Just append the entry for simplicity, the
+ * previous skip list pointers originally allocated can be ignored.)
*/
tmp_ins_head = type == WT_PAGE_ROW_LEAF ? right->modify->mod_row_insert[0] :
right->modify->mod_col_append[0];
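
A standalone sketch of the "append to the level-0 list and ignore the upper levels" shortcut (types hypothetical, not the WT_INSERT structures):

    #include <stddef.h>

    #define MAX_DEPTH 10

    struct insert {
        struct insert *next[MAX_DEPTH];
    };

    struct insert_head {
        struct insert *head[MAX_DEPTH];
        struct insert *tail[MAX_DEPTH];
    };

    static void
    insert_append_level0(struct insert_head *h, struct insert *ins)
    {
        ins->next[0] = NULL;
        if (h->tail[0] != NULL)
            h->tail[0]->next[0] = ins; /* normal case: non-empty list */
        else
            h->head[0] = ins;          /* first element on the level */
        h->tail[0] = ins;
    }
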
@@ -1991,14 +1945,13 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
page->modify->mod_col_split_recno = WT_RECNO_OOB;
/*
- * Clear the allocated page's reference to the moved insert list element
- * so it's not freed when we discard the page.
+ * Clear the allocated page's reference to the moved insert list element so it's not freed when
+ * we discard the page.
*
- * Move the element back to the original page list. For simplicity, the
- * previous skip list pointers originally allocated can be ignored, just
- * append the entry to the end of the level 0 list. As before, we depend
- * on the list having multiple elements and ignore the edge cases small
- * lists have.
+ * Move the element back to the original page list. For simplicity, the previous skip list
+ * pointers originally allocated can be ignored, just append the entry to the end of the level 0
+ * list. As before, we depend on the list having multiple elements and ignore the edge cases
+ * small lists have.
*/
if (type == WT_PAGE_ROW_LEAF)
right->modify->mod_row_insert[0]->head[0] = right->modify->mod_row_insert[0]->tail[0] =
@@ -2124,8 +2077,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
/*
* The split succeeded, we can no longer fail.
*
- * Finalize the move, discarding moved update lists from the original
- * page.
+ * Finalize the move, discarding moved update lists from the original page.
*/
for (i = 0; i < new_entries; ++i)
__split_multi_inmem_final(page, &mod->mod_multi[i]);
@@ -2242,18 +2194,16 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi)
__wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref);
/*
- * This isn't a split: a reconciliation failed because we couldn't write
- * something, and in the case of forced eviction, we need to stop this
- * page from being such a problem. We have exclusive access, rewrite the
- * page in memory. The code lives here because the split code knows how
- * to re-create a page in memory after it's been reconciled, and that's
- * exactly what we want to do.
+ * This isn't a split: a reconciliation failed because we couldn't write something, and in the
+ * case of forced eviction, we need to stop this page from being such a problem. We have
+ * exclusive access, rewrite the page in memory. The code lives here because the split code
+ * knows how to re-create a page in memory after it's been reconciled, and that's exactly what
+ * we want to do.
*
* Build the new page.
*
- * Allocate a WT_REF, the error path calls routines that free memory.
- * The only field we need to set is the record number, as it's used by
- * the search routines.
+ * Allocate a WT_REF, the error path calls routines that free memory. The only field we need to
+ * set is the record number, as it's used by the search routines.
*/
WT_RET(__wt_calloc_one(session, &new));
new->ref_recno = ref->ref_recno;
@@ -2263,19 +2213,17 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi)
/*
* The rewrite succeeded, we can no longer fail.
*
- * Finalize the move, discarding moved update lists from the original
- * page.
+ * Finalize the move, discarding moved update lists from the original page.
*/
__split_multi_inmem_final(page, multi);
/*
* Discard the original page.
*
- * Pages with unresolved changes are not marked clean during
- * reconciliation, do it now.
+ * Pages with unresolved changes are not marked clean during reconciliation, do it now.
*
- * Don't count this as eviction making progress, we did a one-for-one
- * rewrite of a page in memory, typical in the case of cache pressure.
+ * Don't count this as eviction making progress, we did a one-for-one rewrite of a page in
+ * memory, typical in the case of cache pressure.
*/
__wt_page_modify_clear(session, page);
F_SET_ATOMIC(page, WT_PAGE_EVICT_NO_PROGRESS);
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index 5873e611189..faf0fc0e7ac 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -290,13 +290,11 @@ __stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **st
}
/*
- * Overflow keys are hard: we have to walk the disk image to count them,
- * the in-memory representation of the page doesn't necessarily contain
- * a reference to the original cell.
+ * Overflow keys are hard: we have to walk the disk image to count them, the in-memory
+ * representation of the page doesn't necessarily contain a reference to the original cell.
*
- * Zero-length values are the same, we have to look at the disk image to
- * know. They aren't stored but we know they exist if there are two keys
- * in a row, or a key as the last item.
+ * Zero-length values are the same, we have to look at the disk image to know. They aren't
+ * stored but we know they exist if there are two keys in a row, or a key as the last item.
*/
if (page->dsk != NULL) {
key = false;
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index 3fdaf9c240e..6ede60b97e0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -41,16 +41,14 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page)
return (false);
/*
- * The problematic case is when a page was evicted but when there were
- * unresolved updates and not every block associated with the page has
- * a disk address. We can't skip such pages because we need a checkpoint
- * write with valid addresses.
+ * The problematic case is when a page was evicted but when there were unresolved updates and
+ * not every block associated with the page has a disk address. We can't skip such pages because
+ * we need a checkpoint write with valid addresses.
*
- * The page's modification information can change underfoot if the page
- * is being reconciled, so we'd normally serialize with reconciliation
- * before reviewing page-modification information. However, checkpoint
- * is the only valid writer of dirty leaf pages at this point, we skip
- * the lock.
+ * The page's modification information can change underfoot if the page is being reconciled, so
+ * we'd normally serialize with reconciliation before reviewing page-modification information.
+ * However, checkpoint is the only valid writer of dirty leaf pages at this point, we skip the
+ * lock.
*/
if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i)
@@ -152,9 +150,8 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
/*
* Write all immediately available, dirty in-cache leaf pages.
*
- * Writing the leaf pages is done without acquiring a high-level
- * lock, serialize so multiple threads don't walk the tree at
- * the same time.
+ * Writing the leaf pages is done without acquiring a high-level lock, serialize so multiple
+ * threads don't walk the tree at the same time.
*/
if (!btree->modified)
return (0);
@@ -195,27 +192,23 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
break;
case WT_SYNC_CHECKPOINT:
/*
- * If we are flushing a file at read-committed isolation, which
- * is of particular interest for flushing the metadata to make
- * a schema-changing operation durable, get a transactional
- * snapshot now.
+ * If we are flushing a file at read-committed isolation, which is of particular interest
+ * for flushing the metadata to make a schema-changing operation durable, get a
+ * transactional snapshot now.
*
- * All changes committed up to this point should be included.
- * We don't update the snapshot in between pages because the
- * metadata shouldn't have many pages. Instead, read-committed
- * isolation ensures that all metadata updates completed before
- * the checkpoint are included.
+ * All changes committed up to this point should be included. We don't update the snapshot
+ * in between pages because the metadata shouldn't have many pages. Instead, read-committed
+ * isolation ensures that all metadata updates completed before the checkpoint are included.
*/
if (txn->isolation == WT_ISO_READ_COMMITTED)
__wt_txn_get_snapshot(session);
/*
- * We cannot check the tree modified flag in the case of a
- * checkpoint, the checkpoint code has already cleared it.
+ * We cannot check the tree modified flag in the case of a checkpoint, the checkpoint code
+ * has already cleared it.
*
- * Writing the leaf pages is done without acquiring a high-level
- * lock, serialize so multiple threads don't walk the tree at
- * the same time. We're holding the schema lock, but need the
+ * Writing the leaf pages is done without acquiring a high-level lock, serialize so multiple
+ * threads don't walk the tree at the same time. We're holding the schema lock, but need the
* lower-level lock as well.
*/
__wt_spin_lock(session, &btree->flush_lock);
@@ -284,36 +277,20 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
}
/*
- * If the page was pulled into cache by our read, try
- * to evict it now.
+ * If the page was pulled into cache by our read, try to evict it now.
*
- * For eviction to have a chance, we first need to move
- * the walk point to the next page checkpoint will
- * visit. We want to avoid this code being too special
- * purpose, so try to reuse the ordinary eviction path.
+ * For eviction to have a chance, we first need to move the walk point to the next page
+ * checkpoint will visit. We want to avoid this code being too special purpose, so try
+ * to reuse the ordinary eviction path.
*
- * Regardless of whether eviction succeeds or fails,
- * the walk continues from the previous location. We
- * remember whether we tried eviction, and don't try
- * again. Even if eviction fails (the page may stay in
- * cache clean but with history that cannot be
- * discarded), that is not wasted effort because
- * checkpoint doesn't need to write the page again.
- *
- * Once the transaction has given up it's snapshot it
- * is no longer safe to reconcile pages. That happens
- * prior to the final metadata checkpoint.
- *
- * XXX Only attempt this eviction when there are no
- * readers older than the checkpoint. Otherwise, a bug
- * in eviction can mark the page clean and discard
- * history, causing those reads to incorrectly see
- * newer versions of data than they should.
+ * Regardless of whether eviction succeeds or fails, the walk continues from the
+ * previous location. We remember whether we tried eviction, and don't try again. Even
+ * if eviction fails (the page may stay in cache clean but with history that cannot be
+ * discarded), that is not wasted effort because checkpoint doesn't need to write the
+ * page again.
*/
if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED &&
- !tried_eviction && F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT) &&
- (!F_ISSET(txn, WT_TXN_HAS_TS_READ) ||
- txn->read_timestamp == conn->txn_global.pinned_timestamp)) {
+ !tried_eviction) {
WT_ERR_BUSY_OK(__wt_page_release_evict(session, walk, 0));
walk = prev;
prev = NULL;
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
index f1aed89572a..7685547b351 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -537,12 +537,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack
entry = 0;
WT_INTL_FOREACH_BEGIN (session, page, child_ref) {
/*
- * It's a depth-first traversal: this entry's starting
- * key should be larger than the largest key previously
- * reviewed.
+ * It's a depth-first traversal: this entry's starting key should be larger than the
+ * largest key previously reviewed.
*
- * The 0th key of any internal page is magic, and we
- * can't test against it.
+ * The 0th key of any internal page is magic, and we can't test against it.
*/
++entry;
if (entry != 1)
@@ -638,12 +636,10 @@ __verify_row_leaf_key_order(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs
/*
* Compare the key against the largest key we've seen so far.
*
- * If we're comparing against a key taken from an internal page,
- * we can compare equal (which is an expected path, the internal
- * page key is often a copy of the leaf page's first key). But,
- * in the case of the 0th slot on an internal page, the last key
- * we've seen was a key from a previous leaf page, and it's not
- * OK to compare equally in that case.
+ * If we're comparing against a key taken from an internal page, we can compare equal (which
+ * is an expected path, the internal page key is often a copy of the leaf page's first key).
+ * But, in the case of the 0th slot on an internal page, the last key we've seen was a key
+ * from a previous leaf page, and it's not OK to compare equally in that case.
*/
WT_RET(__wt_compare(session, btree->collator, vs->tmp1, (WT_ITEM *)vs->max_key, &cmp));
if (cmp < 0)
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
index 0e4bbf2f92d..2d6654ebd43 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -264,14 +264,13 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t
char ts_string[2][WT_TS_INT_STRING_SIZE];
/*
- * Check timestamp and transaction order, and optionally against parent
- * values. Timestamps and transactions in the parent address aren't
- * necessarily an exact match, but should be within the boundaries of
- * the parent's information.
+ * Check timestamp and transaction order, and optionally against parent values. Timestamps and
+ * transactions in the parent address aren't necessarily an exact match, but should be within
+ * the boundaries of the parent's information.
*
- * There's no checking if validity information should appear on a page
- * because the cell-unpacking code hides it by always returning durable
- * values if they don't appear on the page.
+ * There's no checking if validity information should appear on a page because the
+ * cell-unpacking code hides it by always returning durable values if they don't appear on the
+ * page.
*/
switch (unpack->type) {
case WT_CELL_ADDR_DEL:
@@ -507,8 +506,7 @@ __verify_dsk_row(
/*
* Prefix compression checks.
*
- * Confirm the first non-overflow key on a page has a zero
- * prefix compression count.
+ * Confirm the first non-overflow key on a page has a zero prefix compression count.
*/
prefix = unpack->prefix;
if (last_pfx->size == 0 && prefix != 0)
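
Prefix compression stores each key as a count of bytes shared with the previous key plus a suffix, which is why the first key on a page must carry a zero prefix: there is no previous key to share with. An illustrative reconstruction sketch:

    #include <stddef.h>
    #include <string.h>

    static size_t
    key_rebuild(char *out, const char *prev_key, size_t prefix,
        const char *suffix, size_t suffix_len)
    {
        memcpy(out, prev_key, prefix);            /* bytes shared with the prior key */
        memcpy(out + prefix, suffix, suffix_len); /* bytes stored in the cell */
        return (prefix + suffix_len);
    }
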
@@ -563,9 +561,8 @@ __verify_dsk_row(
/*
* Compare the current key against the last key.
*
- * Be careful about the 0th key on internal pages: we only store
- * the first byte and custom collators may not be able to handle
- * truncated keys.
+ * Be careful about the 0th key on internal pages: we only store the first byte and custom
+ * collators may not be able to handle truncated keys.
*/
if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
(dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index f6cc0267a72..d1efbb2533d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -34,15 +34,12 @@ __ref_index_slot(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp,
entries = pindex->entries;
/*
- * Use the page's reference hint: it should be correct unless
- * there was a split or delete in the parent before our slot.
- * If the hint is wrong, it can be either too big or too small,
- * but often only by a small amount. Search up and down the
- * index starting from the hint.
+ * Use the page's reference hint: it should be correct unless there was a split or delete in
+ * the parent before our slot. If the hint is wrong, it can be either too big or too small,
+ * but often only by a small amount. Search up and down the index starting from the hint.
*
- * It's not an error for the reference hint to be wrong, it
- * just means the first retrieval (which sets the hint for
- * subsequent retrievals), is slower.
+ * It's not an error for the reference hint to be wrong, it just means the first retrieval
+ * (which sets the hint for subsequent retrievals) is slower.
*/
slot = ref->pindex_hint;
if (slot >= entries)
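
A self-contained sketch of the hint search described above: try the hinted slot first, then scan outward in both directions until a match is found or both ends are exhausted (illustrative only):

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    find_from_hint(void **index, uint32_t entries, void *want, uint32_t hint,
        uint32_t *slotp)
    {
        uint32_t i, j;

        if (hint >= entries)
            hint = 0;
        if (index[hint] == want) {
            *slotp = hint;
            return (true);
        }
        for (i = hint, j = hint + 1; i > 0 || j < entries;) {
            if (i > 0 && index[--i] == want) {
                *slotp = i;
                return (true);
            }
            if (j < entries) {
                if (index[j] == want) {
                    *slotp = j;
                    return (true);
                }
                ++j;
            }
        }
        return (false);
    }
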
@@ -175,28 +172,25 @@ __split_prev_race(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp
WT_PAGE_INDEX *pindex;
/*
- * Handle a cursor moving backwards through the tree or setting up at
- * the end of the tree. We're passed the child page into which we're
- * descending, and the parent page's page-index we used to find that
- * child page.
+ * Handle a cursor moving backwards through the tree or setting up at the end of the tree. We're
+ * passed the child page into which we're descending, and the parent page's page-index we used
+ * to find that child page.
*
- * When splitting an internal page into its parent, we move the split
- * pages WT_REF structures, then update the parent's page index, then
- * update the split page's page index, and nothing is atomic. A thread
- * can read the parent page's replacement page index and then the split
- * page's original index, or vice-versa, and either change can cause a
- * cursor moving backwards through the tree to skip pages.
+ * When splitting an internal page into its parent, we move the split page's WT_REF structures,
+ * then update the parent's page index, then update the split page's page index, and nothing is
+ * atomic. A thread can read the parent page's replacement page index and then the split page's
+ * original index, or vice-versa, and either change can cause a cursor moving backwards through
+ * the tree to skip pages.
*
- * This isn't a problem for a cursor setting up at the start of the tree
- * or moving forward through the tree because we do right-hand splits on
- * internal pages and the initial part of the split page's namespace
- * won't change as part of a split (in other words, a thread reading the
- * parent page's and split page's indexes will move to the same slot no
- * matter what order of indexes are read.
+ * This isn't a problem for a cursor setting up at the start of the tree or moving forward
+ * through the tree because we do right-hand splits on internal pages and the initial part of
+ * the split page's namespace won't change as part of a split (in other words, a thread reading
+ * the parent page's and split page's indexes will move to the same slot no matter what order of
+ * indexes are read).
*
- * Acquire the child's page index, then confirm the parent's page index
- * hasn't changed, to check for reading an old version of the parent's
- * page index and then reading a new version of the child's page index.
+ * Acquire the child's page index, then confirm the parent's page index hasn't changed, to check
+ * for reading an old version of the parent's page index and then reading a new version of the
+ * child's page index.
*/
WT_INTL_INDEX_GET(session, ref->page, pindex);
if (__wt_split_descent_race(session, ref, *pindexp))
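
The race check boils down to: snapshot the child's index, then confirm the parent's index pointer hasn't moved since the descent; if it has, retry. A sketch of that validation with hypothetical atomic index pointers:

    #include <stdatomic.h>
    #include <stdbool.h>

    struct pindex;                                  /* hypothetical page index */
    struct page { _Atomic(struct pindex *) index; };

    /*
     * Sketch: after descending, read the child's index, then confirm the
     * parent's index pointer is still the one the descent used; if not, we
     * may have mixed an old parent index with a new child index, so retry.
     */
    static bool
    descent_raced(struct page *parent, struct page *child,
        struct pindex *parent_seen, struct pindex **child_indexp)
    {
        *child_indexp = atomic_load(&child->index);
        return (atomic_load(&parent->index) != parent_seen);
    }
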
@@ -406,16 +400,13 @@ restart:
for (;;) {
/*
- * Swap our previous hazard pointer for the page
- * we'll return.
+ * Swap our previous hazard pointer for the page we'll return.
*
- * Not-found is an expected return, as eviction
- * might have been attempted. The page can't be
- * evicted, we're holding a hazard pointer on a
- * child, spin until we're successful.
+ * Not-found is an expected return, as eviction might have been attempted. The page
+ * can't be evicted, we're holding a hazard pointer on a child, spin until we're
+ * successful.
*
- * Restart is not expected, our parent WT_REF
- * should not have split.
+ * Restart is not expected, our parent WT_REF should not have split.
*/
ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | flags);
if (ret == 0) {
@@ -528,8 +519,8 @@ descend:
}
/*
- * Not-found is an expected return when walking only
- * in-cache pages, or if we see a deleted page.
+ * Not-found is an expected return when walking only in-cache pages, or if we see a
+ * deleted page.
*
* An expected error, so "couple" is unchanged.
*/
@@ -540,8 +531,7 @@ descend:
}
/*
- * The page we're moving to might have split, in which
- * case restart the movement.
+ * The page we're moving to might have split, in which case restart the movement.
*
* An expected error, so "couple" is unchanged.
*/
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
index 8bbda44d706..8a6c6e8aa2e 100644
--- a/src/third_party/wiredtiger/src/btree/col_modify.c
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -109,12 +109,11 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno,
/*
* Delete, insert or update a column-store entry.
*
- * If modifying a previously modified record, cursor.ins will be set to
- * point to the correct update list. Create a new update entry and link
- * it into the existing list.
+ * If modifying a previously modified record, cursor.ins will be set to point to the correct
+ * update list. Create a new update entry and link it into the existing list.
*
- * Else, allocate an insert array as necessary, build an insert/update
- * structure pair, and link it into place.
+ * Else, allocate an insert array as necessary, build an insert/update structure pair, and link
+ * it into place.
*/
if (cbt->compare == 0 && cbt->ins != NULL) {
/*
@@ -190,17 +189,15 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno,
ins_size += upd_size;
/*
- * If there was no insert list during the search, or there was
- * no search because the record number has not been allocated
- * yet, the cursor's information cannot be correct, search
+ * If there was no insert list during the search, or there was no search because the record
+ * number has not been allocated yet, the cursor's information cannot be correct, search
* couldn't have initialized it.
*
- * Otherwise, point the new WT_INSERT item's skiplist to the
- * next elements in the insert list (which we will check are
- * still valid inside the serialization function).
+ * Otherwise, point the new WT_INSERT item's skiplist to the next elements in the insert
+ * list (which we will check are still valid inside the serialization function).
*
- * The serial mutex acts as our memory barrier to flush these
- * writes before inserting them into the list.
+ * The serial mutex acts as our memory barrier to flush these writes before inserting them
+ * into the list.
*/
if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
for (i = 0; i < skipdepth; i++) {
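
The splice itself is ordinary skiplist linking driven by the insert stack the search saved. A condensed sketch with hypothetical node and stack types:

    #define SKIP_MAXDEPTH 10

    struct snode { struct snode *next[SKIP_MAXDEPTH]; };

    /*
     * Sketch: splice a new node into a skiplist using the insert stack the
     * search saved (ins_stack[i] points at the pointer to update on level i).
     * The first loop's writes must reach memory before the second loop
     * publishes the node; in the code above the serial mutex supplies that
     * barrier.
     */
    static void
    skiplist_link(struct snode **ins_stack[], struct snode *node, unsigned depth)
    {
        unsigned i;

        for (i = 0; i < depth; ++i)
            node->next[i] = *ins_stack[i];  /* adopt current successors */
        for (i = 0; i < depth; ++i)
            *ins_stack[i] = node;           /* make the node visible per level */
    }
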
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index f202dbd7f7b..160f19ffc2a 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -176,16 +176,13 @@ descend:
WT_DIAGNOSTIC_YIELD;
/*
- * Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search at the root.
- * We cannot restart in the "current" page; for example, if a
- * thread is appending to the tree, the page it's waiting for
- * did an insert-split into the parent, then the parent split
- * into its parent, the name space we are searching for may have
- * moved above the current page in the tree.
+ * Swap the current page for the child page. If the page splits while we're retrieving it,
+ * restart the search at the root. We cannot restart in the "current" page; for example, if
+ * a thread is appending to the tree, the page it's waiting for did an insert-split into the
+ * parent, then the parent split into its parent, the name space we are searching for may
+ * have moved above the current page in the tree.
*
- * On other error, simply return, the swap call ensures we're
- * holding nothing on failure.
+ * On other error, simply return, the swap call ensures we're holding nothing on failure.
*/
read_flags = WT_READ_RESTART_OK;
if (F_ISSET(cbt, WT_CBT_READ_ONCE))
@@ -220,15 +217,13 @@ leaf_only:
/*
* Search the leaf page.
*
- * Search after a page is pinned does a search of the pinned page before
- * doing a full tree search, in which case we might be searching for a
- * record logically before the page. Return failure, and there's nothing
- * else to do, the record isn't going to be on this page.
+ * Search after a page is pinned does a search of the pinned page before doing a full tree
+ * search, in which case we might be searching for a record logically before the page. Return
+ * failure, and there's nothing else to do, the record isn't going to be on this page.
*
- * We don't check inside the search path for a record greater than the
- * maximum record in the tree; in that case, we get here with a record
- * that's impossibly large for the page. We do have additional setup to
- * do in that case, the record may be appended to the page.
+ * We don't check inside the search path for a record greater than the maximum record in the
+ * tree; in that case, we get here with a record that's impossibly large for the page. We do
+ * have additional setup to do in that case, the record may be appended to the page.
*/
if (page->type == WT_PAGE_COL_FIX) {
if (recno < current->ref_recno) {
diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c
index c017e7c8a9c..d0524dfe5a3 100644
--- a/src/third_party/wiredtiger/src/btree/row_key.c
+++ b/src/third_party/wiredtiger/src/btree/row_key.c
@@ -180,17 +180,15 @@ __wt_row_leaf_key_work(
copy = WT_ROW_KEY_COPY(rip);
#ifdef HAVE_DIAGNOSTIC
/*
- * Debugging added to detect and gather information for rare hang. Detect and abort if the
- * current operation takes too long.
+ * Debugging added to detect and gather information for rare hang, WT-5043. Detect and abort
+ * if the current function call or operation takes too long (and 5 minutes is an eternity).
*/
__wt_seconds32(session, &current);
WT_ERR_ASSERT(session, (current - start) < WT_MINUTE * 5, EINVAL,
- "Current function call taking too long: current %" PRIu32 " func started %" PRIu32,
- current, start);
+ "call tracking for WT-5043: %s took longer than 5 minutes", __func__);
WT_ERR_ASSERT(session,
- session->op_start == 0 || ((current - session->op_start) < WT_MINUTE * 5), EINVAL,
- "Operation taking too long: current %" PRIu32 " started %" PRIu32, current,
- session->op_start);
+ (session->op_5043_seconds == 0 || (current - session->op_5043_seconds) < WT_MINUTE * 5),
+ EINVAL, "operation tracking for WT-5043: %s took longer than 5 minutes", session->name);
#endif
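
The diagnostic is a coarse wall-clock watchdog: record a start time and assert the elapsed time stays under the bound. A portable sketch, not the __wt_seconds32-based code above:

    #include <assert.h>
    #include <time.h>

    #define WATCHDOG_SECS (5 * 60)          /* the same five-minute bound */

    /* Sketch: a coarse diagnostic watchdog around a long-running call. */
    static void
    watchdog_check(time_t start)
    {
        assert(time(NULL) - start < WATCHDOG_SECS);
    }
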
/*
@@ -204,15 +202,12 @@ __wt_row_leaf_key_work(
keyb->size = size;
/*
- * If this is the key we originally wanted, we don't
- * care if we're rolling forward or backward, or if
- * it's an overflow key or not, it's what we wanted.
- * This shouldn't normally happen, the fast-path code
- * that front-ends this function will have figured it
- * out before we were called.
+ * If this is the key we originally wanted, we don't care if we're rolling forward or
+ * backward, or if it's an overflow key or not, it's what we wanted. This shouldn't
+ * normally happen, the fast-path code that front-ends this function will have figured
+ * it out before we were called.
*
- * The key doesn't need to be instantiated, skip past
- * that test.
+ * The key doesn't need to be instantiated, skip past that test.
*/
if (slot_offset == 0)
goto done;
@@ -231,13 +226,11 @@ __wt_row_leaf_key_work(
/* 2: the test for an instantiated off-page key. */
if (ikey != NULL) {
/*
- * If this is the key we originally wanted, we don't
- * care if we're rolling forward or backward, or if
- * it's an overflow key or not, it's what we wanted.
- * Take a copy and wrap up.
+ * If this is the key we originally wanted, we don't care if we're rolling forward or
+ * backward, or if it's an overflow key or not, it's what we wanted. Take a copy and
+ * wrap up.
*
- * The key doesn't need to be instantiated, skip past
- * that test.
+ * The key doesn't need to be instantiated, skip past that test.
*/
if (slot_offset == 0) {
keyb->data = p;
@@ -283,19 +276,15 @@ __wt_row_leaf_key_work(
/* 3: the test for an on-page reference to an overflow key. */
if (unpack->type == WT_CELL_KEY_OVFL) {
/*
- * If this is the key we wanted from the start, we don't
- * care if it's an overflow key, get a copy and wrap up.
+ * If this is the key we wanted from the start, we don't care if it's an overflow key,
+ * get a copy and wrap up.
*
- * Avoid racing with reconciliation deleting overflow
- * keys. Deleted overflow keys must be instantiated
- * first, acquire the overflow lock and check. Read
- * the key if we still need to do so, but holding the
- * overflow lock. Note we are not using the version of
- * the cell-data-ref calls that acquire the overflow
- * lock and do a look-aside into the tracking cache:
- * this is an overflow key, not a value, meaning it's
- * instantiated before being deleted, not copied into
- * the tracking cache.
+ * Avoid racing with reconciliation deleting overflow keys. Deleted overflow keys must
+ * be instantiated first, acquire the overflow lock and check. Read the key if we still
+ * need to do so, but holding the overflow lock. Note we are not using the version of
+ * the cell-data-ref calls that acquire the overflow lock and do a look-aside into the
+ * tracking cache: this is an overflow key, not a value, meaning it's instantiated
+ * before being deleted, not copied into the tracking cache.
*/
if (slot_offset == 0) {
__wt_readlock(session, &btree->ovfl_lock);
@@ -364,16 +353,13 @@ __wt_row_leaf_key_work(
*/
if (direction == BACKWARD) {
/*
- * If there's a set of keys with identical prefixes, we
- * don't want to instantiate each one, the prefixes are
- * all the same.
+ * If there's a set of keys with identical prefixes, we don't want to instantiate each
+ * one, the prefixes are all the same.
*
- * As we roll backward through the page, track the last
- * time the prefix decreased in size, so we can start
- * with that key during our roll-forward. For a page
- * populated with a single key prefix, we'll be able to
- * instantiate the key we want as soon as we find a key
- * without a prefix.
+ * As we roll backward through the page, track the last time the prefix decreased in
+ * size, so we can start with that key during our roll-forward. For a page populated
+ * with a single key prefix, we'll be able to instantiate the key we want as soon as we
+ * find a key without a prefix.
*/
if (slot_offset == 0)
last_prefix = unpack->prefix;
@@ -400,13 +386,11 @@ __wt_row_leaf_key_work(
}
/*
- * Grow the buffer as necessary as well as ensure data
- * has been copied into local buffer space, then append
- * the suffix to the prefix already in the buffer.
+ * Grow the buffer as necessary as well as ensure data has been copied into local buffer
+ * space, then append the suffix to the prefix already in the buffer.
*
- * Don't grow the buffer unnecessarily or copy data we
- * don't need, truncate the item's data length to the
- * prefix bytes.
+ * Don't grow the buffer unnecessarily or copy data we don't need, truncate the item's
+ * data length to the prefix bytes.
*/
keyb->size = unpack->prefix;
WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size));
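
Reconstructing a prefix-compressed key is a copy of the retained prefix followed by the cell's suffix bytes. A sketch over plain buffers (hypothetical helper, not the WT_ITEM API):

    #include <string.h>

    /*
     * Sketch: rebuild a prefix-compressed key. "buf" already holds at least
     * "prefix" bytes of the previous key; append the cell's suffix bytes and
     * return the reconstructed length.
     */
    static size_t
    key_rebuild(char *buf, size_t prefix, const char *suffix, size_t suffix_len)
    {
        memcpy(buf + prefix, suffix, suffix_len);
        return (prefix + suffix_len);
    }
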
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index 7298dee90a9..7a5b7fa2f91 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -68,13 +68,11 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *k
mod = page->modify;
/*
- * Modify: allocate an update array as necessary, build a WT_UPDATE
- * structure, and call a serialized function to insert the WT_UPDATE
- * structure.
+ * Modify: allocate an update array as necessary, build a WT_UPDATE structure, and call a
+ * serialized function to insert the WT_UPDATE structure.
*
- * Insert: allocate an insert array as necessary, build a WT_INSERT
- * and WT_UPDATE structure pair, and call a serialized function to
- * insert the WT_INSERT structure.
+ * Insert: allocate an insert array as necessary, build a WT_INSERT and WT_UPDATE structure
+ * pair, and call a serialized function to insert the WT_INSERT structure.
*/
if (cbt->compare == 0) {
if (cbt->ins == NULL) {
@@ -125,13 +123,11 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *k
/*
* Allocate the insert array as necessary.
*
- * We allocate an additional insert array slot for insert keys
- * sorting less than any key on the page. The test to select
- * that slot is baroque: if the search returned the first page
- * slot, we didn't end up processing an insert list, and the
- * comparison value indicates the search key was smaller than
- * the returned slot, then we're using the smallest-key insert
- * slot. That's hard, so we set a flag.
+ * We allocate an additional insert array slot for insert keys sorting less than any key on
+ * the page. The test to select that slot is baroque: if the search returned the first page
+ * slot, we didn't end up processing an insert list, and the comparison value indicates the
+ * search key was smaller than the returned slot, then we're using the smallest-key insert
+ * slot. That's hard, so we set a flag.
*/
WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_row_insert, ins_headp, page->entries + 1);
@@ -167,16 +163,14 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *k
ins_size += upd_size;
/*
- * If there was no insert list during the search, the cursor's
- * information cannot be correct, search couldn't have
- * initialized it.
+ * If there was no insert list during the search, the cursor's information cannot be
+ * correct, search couldn't have initialized it.
*
- * Otherwise, point the new WT_INSERT item's skiplist to the
- * next elements in the insert list (which we will check are
- * still valid inside the serialization function).
+ * Otherwise, point the new WT_INSERT item's skiplist to the next elements in the insert
+ * list (which we will check are still valid inside the serialization function).
*
- * The serial mutex acts as our memory barrier to flush these
- * writes before inserting them into the list.
+ * The serial mutex acts as our memory barrier to flush these writes before inserting them
+ * into the list.
*/
if (cbt->ins_stack[0] == NULL)
for (i = 0; i < skipdepth; i++) {
@@ -303,20 +297,17 @@ __wt_update_obsolete_check(
oldest = txn_global->has_oldest_timestamp ? txn_global->oldest_timestamp : WT_TS_NONE;
stable = txn_global->has_stable_timestamp ? txn_global->stable_timestamp : WT_TS_NONE;
/*
- * This function identifies obsolete updates, and truncates them from
- * the rest of the chain; because this routine is called from inside
- * a serialization function, the caller has responsibility for actually
- * freeing the memory.
+ * This function identifies obsolete updates, and truncates them from the rest of the chain;
+ * because this routine is called from inside a serialization function, the caller has
+ * responsibility for actually freeing the memory.
*
* Walk the list of updates, looking for obsolete updates at the end.
*
- * Only updates with globally visible, self-contained data can terminate
- * update chains.
+ * Only updates with globally visible, self-contained data can terminate update chains.
*
- * Birthmarks are a special case: once a birthmark becomes obsolete, it
- * can be discarded and subsequent reads will see the on-page value (as
- * expected). Inserting updates into the lookaside table relies on
- * this behavior to avoid creating update chains with multiple
+ * Birthmarks are a special case: once a birthmark becomes obsolete, it can be discarded and
+ * subsequent reads will see the on-page value (as expected). Inserting updates into the
+ * lookaside table relies on this behavior to avoid creating update chains with multiple
* birthmarks.
*/
for (first = prev = NULL, count = 0; upd != NULL; prev = upd, upd = upd->next, count++) {
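
Truncation is a single-pointer cut: keep the chain up to the first globally visible, self-contained update and detach the rest for the caller to free. A sketch over a singly linked list (the visibility test is a hypothetical stand-in for WiredTiger's rules):

    #include <stdbool.h>
    #include <stddef.h>

    struct upd_node { struct upd_node *next; };

    /* Hypothetical stand-in for the globally-visible, self-contained test. */
    extern bool upd_visible_all(struct upd_node *);

    /*
     * Sketch: keep the first update that can terminate the chain as its new
     * end, and hand the detached tail back for the caller to free.
     */
    static struct upd_node *
    obsolete_truncate(struct upd_node *head)
    {
        struct upd_node *u, *tail;

        for (u = head; u != NULL; u = u->next)
            if (upd_visible_all(u)) {
                tail = u->next;
                u->next = NULL;
                return (tail);
            }
        return (NULL);
    }
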
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 52057ad56b9..9b69c0aa9ed 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -30,10 +30,9 @@ __search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT
if ((ins = WT_SKIP_LAST(ins_head)) == NULL)
return (0);
/*
- * Since the head of the skip list doesn't get mutated within this
- * function, the compiler may move this assignment above within the
- * loop below if it needs to (and may read a different value on each
- * loop due to other threads mutating the skip list).
+ * Since the head of the skip list doesn't get mutated within this function, the compiler may
+ * move the assignment above into the loop below if it needs to (and may read a different
+ * value on each loop due to other threads mutating the skip list).
*
* Place a read barrier here to avoid this issue.
*/
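
A sketch of the barrier's effect using C11 atomics in place of WiredTiger's read-barrier macro (types hypothetical):

    #include <stdatomic.h>

    struct insert { struct insert *next; };

    /*
     * Sketch: an acquire load pins the read of the list tail here, before
     * the loop that follows, instead of letting the compiler re-read the
     * pointer on every iteration.
     */
    static struct insert *
    read_tail_once(_Atomic(struct insert *) *tailp)
    {
        return (atomic_load_explicit(tailp, memory_order_acquire));
    }
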
@@ -171,11 +170,10 @@ __check_leaf_key_range(
return (0);
/*
- * Check if the search key is smaller than the parent's starting key for
- * this page.
+ * Check if the search key is smaller than the parent's starting key for this page.
*
- * We can't compare against slot 0 on a row-store internal page because
- * reconciliation doesn't build it, it may not be a valid key.
+ * We can't compare against slot 0 on a row-store internal page because reconciliation doesn't
+ * build it, it may not be a valid key.
*/
if (indx != 0) {
__wt_ref_key(leaf->home, leaf, &item->data, &item->size);
@@ -241,12 +239,11 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CU
skiphigh = skiplow = 0;
/*
- * If a cursor repeatedly appends to the tree, compare the search key
- * against the last key on each internal page during insert before
- * doing the full binary search.
+ * If a cursor repeatedly appends to the tree, compare the search key against the last key on
+ * each internal page during insert before doing the full binary search.
*
- * Track if the descent is to the right-side of the tree, used to set
- * the cursor's append history.
+ * Track if the descent is to the right-side of the tree, used to set the cursor's append
+ * history.
*/
append_check = insert && cbt->append_tree;
descend_right = true;
@@ -297,17 +294,14 @@ restart:
/*
* Fast-path appends.
*
- * The 0th key on an internal page is a problem for a couple of
- * reasons. First, we have to force the 0th key to sort less
- * than any application key, so internal pages don't have to be
- * updated if the application stores a new, "smallest" key in
- * the tree. Second, reconciliation is aware of this and will
- * store a byte of garbage in the 0th key, so the comparison of
- * an application key and a 0th key is meaningless (but doing
- * the comparison could still incorrectly modify our tracking
- * of the leading bytes in each key that we can skip during the
- * comparison). For these reasons, special-case the 0th key, and
- * never pass it to a collator.
+ * The 0th key on an internal page is a problem for a couple of reasons. First, we have to
+ * force the 0th key to sort less than any application key, so internal pages don't have to
+ * be updated if the application stores a new, "smallest" key in the tree. Second,
+ * reconciliation is aware of this and will store a byte of garbage in the 0th key, so the
+ * comparison of an application key and a 0th key is meaningless (but doing the comparison
+ * could still incorrectly modify our tracking of the leading bytes in each key that we can
+ * skip during the comparison). For these reasons, special-case the 0th key, and never pass
+ * it to a collator.
*/
if (append_check) {
descent = pindex->index[pindex->entries - 1];
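
In practice this means the binary search over an internal page starts at slot 1 and never hands slot 0 to a comparator. A condensed sketch with a hypothetical comparator callback:

    #include <stddef.h>

    /*
     * Sketch: binary search that skips the 0th (garbage) key on internal
     * pages. "cmp" is a hypothetical application comparator.
     */
    static size_t
    search_internal(void **keys, size_t entries, void *want,
        int (*cmp)(void *, void *))
    {
        size_t base = 1, limit, indx;   /* slot 0 sorts first by fiat */

        for (limit = entries - 1; limit != 0; limit >>= 1) {
            indx = base + (limit >> 1);
            if (cmp(want, keys[indx]) >= 0) {
                base = indx + 1;
                --limit;
            }
        }
        return (base - 1);              /* slot of the largest key <= want */
    }
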
@@ -420,16 +414,13 @@ descend:
WT_DIAGNOSTIC_YIELD;
/*
- * Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search at the root.
- * We cannot restart in the "current" page; for example, if a
- * thread is appending to the tree, the page it's waiting for
- * did an insert-split into the parent, then the parent split
- * into its parent, the name space we are searching for may have
- * moved above the current page in the tree.
+ * Swap the current page for the child page. If the page splits while we're retrieving it,
+ * restart the search at the root. We cannot restart in the "current" page; for example, if
+ * a thread is appending to the tree, the page it's waiting for did an insert-split into the
+ * parent, then the parent split into its parent, the name space we are searching for may
+ * have moved above the current page in the tree.
*
- * On other error, simply return, the swap call ensures we're
- * holding nothing on failure.
+ * On other error, simply return, the swap call ensures we're holding nothing on failure.
*/
read_flags = WT_READ_RESTART_OK;
if (F_ISSET(cbt, WT_CBT_READ_ONCE))
@@ -458,21 +449,17 @@ leaf_only:
current = NULL;
/*
- * In the case of a right-side tree descent during an insert, do a fast
- * check for an append to the page, try to catch cursors appending data
- * into the tree.
+ * In the case of a right-side tree descent during an insert, do a fast check for an append to
+ * the page, try to catch cursors appending data into the tree.
*
- * It's tempting to make this test more rigorous: if a cursor inserts
- * randomly into a two-level tree (a root referencing a single child
- * that's empty except for an insert list), the right-side descent flag
- * will be set and this comparison wasted. The problem resolves itself
- * as the tree grows larger: either we're no longer doing right-side
- * descent, or we'll avoid additional comparisons in internal pages,
- * making up for the wasted comparison here. Similarly, the cursor's
- * history is set any time it's an insert and a right-side descent,
- * both to avoid a complicated/expensive test, and, in the case of
- * multiple threads appending to the tree, we want to mark them all as
- * appending, even if this test doesn't work.
+ * It's tempting to make this test more rigorous: if a cursor inserts randomly into a two-level
+ * tree (a root referencing a single child that's empty except for an insert list), the
+ * right-side descent flag will be set and this comparison wasted. The problem resolves itself
+ * as the tree grows larger: either we're no longer doing right-side descent, or we'll avoid
+ * additional comparisons in internal pages, making up for the wasted comparison here.
+ * Similarly, the cursor's history is set any time it's an insert and a right-side descent, both
+ * to avoid a complicated/expensive test, and, in the case of multiple threads appending to the
+ * tree, we want to mark them all as appending, even if this test doesn't work.
*/
if (insert && descend_right) {
cbt->append_tree = 1;
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c
index e1edcb596fa..5f4b4d20c9d 100644
--- a/src/third_party/wiredtiger/src/cache/cache_las.c
+++ b/src/third_party/wiredtiger/src/cache/cache_las.c
@@ -168,9 +168,8 @@ __wt_las_create(WT_SESSION_IMPL *session, const char **cfg)
return (0);
/*
- * Done at startup: we cannot do it on demand because we require the
- * schema lock to create and drop the table, and it may not always be
- * available.
+ * Done at startup: we cannot do it on demand because we require the schema lock to create and
+ * drop the table, and it may not always be available.
*
* Discard any previous incarnation of the table.
*/
@@ -262,13 +261,11 @@ __wt_las_cursor_open(WT_SESSION_IMPL *session)
S2C(session)->cache->las_fileid = btree->id;
/*
- * Set special flags for the lookaside table: the lookaside flag (used,
- * for example, to avoid writing records during reconciliation), also
- * turn off checkpoints and logging.
+ * Set special flags for the lookaside table: the lookaside flag (used, for example, to avoid
+ * writing records during reconciliation), also turn off checkpoints and logging.
*
- * Test flags before setting them so updates can't race in subsequent
- * opens (the first update is safe because it's single-threaded from
- * wiredtiger_open).
+ * Test flags before setting them so updates can't race in subsequent opens (the first update is
+ * safe because it's single-threaded from wiredtiger_open).
*/
if (!F_ISSET(btree, WT_BTREE_LOOKASIDE))
F_SET(btree, WT_BTREE_LOOKASIDE);
@@ -296,13 +293,11 @@ __wt_las_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session
*cursorp = NULL;
/*
- * We don't want to get tapped for eviction after we start using the
- * lookaside cursor; save a copy of the current eviction state, we'll
- * turn eviction off before we return.
+ * We don't want to get tapped for eviction after we start using the lookaside cursor; save a
+ * copy of the current eviction state, we'll turn eviction off before we return.
*
- * Don't cache lookaside table pages, we're here because of eviction
- * problems and there's no reason to believe lookaside pages will be
- * useful more than once.
+ * Don't cache lookaside table pages, we're here because of eviction problems and there's no
+ * reason to believe lookaside pages will be useful more than once.
*/
*session_flags = F_MASK(session, WT_LAS_SESSION_FLAGS);
@@ -396,26 +391,23 @@ bool
__wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_TXN *txn;
- wt_timestamp_t unstable_timestamp;
txn = &session->txn;
/*
- * Skip lookaside pages if reading without a timestamp and all the
- * updates in lookaside are in the past.
+ * Skip lookaside pages if reading without a timestamp and all the updates in lookaside are in
+ * the past.
*
- * Lookaside eviction preferentially chooses the newest updates when
- * creating page images with no stable timestamp. If a stable timestamp
- * has been set, we have to visit the page because eviction chooses old
- * version of records in that case.
+ * Lookaside eviction preferentially chooses the newest updates when creating page images with
+ * no stable timestamp. If a stable timestamp has been set, we have to visit the page because
+ * eviction chooses old versions of records in that case.
*
- * One case where we may need to visit the page is if lookaside eviction
- * is active in tree 2 when a checkpoint has started and is working its
- * way through tree 1. In that case, lookaside may have created a page
- * image with updates in the future of the checkpoint.
+ * One case where we may need to visit the page is if lookaside eviction is active in tree 2
+ * when a checkpoint has started and is working its way through tree 1. In that case, lookaside
+ * may have created a page image with updates in the future of the checkpoint.
*
- * We also need to instantiate a lookaside page if this is an update
- * operation in progress or transaction is in prepared state.
+ * We also need to instantiate a lookaside page if this is an update operation in progress or
+ * transaction is in prepared state.
*/
if (F_ISSET(txn, WT_TXN_PREPARE | WT_TXN_UPDATE))
return (false);
@@ -425,35 +417,34 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* If some of the page's history overlaps with the reader's snapshot then we have to read it.
- * This is only relevant if we chose versions that were unstable when the page was written.
*/
- if (ref->page_las->skew_newest && WT_TXNID_LE(txn->snap_min, ref->page_las->unstable_txn))
+ if (WT_TXNID_LE(txn->snap_min, ref->page_las->max_txn))
return (false);
+ /*
+ * Otherwise, if not reading at a timestamp, the page's history is in the past, so the page
+ * image is correct if it contains the most recent versions of everything and nothing was
+ * prepared.
+ */
if (!F_ISSET(txn, WT_TXN_HAS_TS_READ))
- return (ref->page_las->skew_newest);
+ return (!ref->page_las->has_prepares && ref->page_las->min_skipped_ts == WT_TS_MAX);
/*
- * Skip lookaside history if reading as of a timestamp, we evicted new
- * versions of data and all the updates are in the past. This is not
- * possible for prepared updates, because the commit timestamp was not
- * known when the page was evicted.
+ * Skip lookaside history if reading as of a timestamp, we evicted new versions of data and all
+ * the updates are in the past. This is not possible for prepared updates, because the commit
+ * timestamp was not known when the page was evicted.
*
- * Skip lookaside pages if reading as of a timestamp, we evicted old
- * versions of data and all the unstable updates are in the future.
- *
- * Checkpoint should respect durable timestamps, other reads should
- * respect ordinary visibility. Checking for just the unstable updates
- * during checkpoint would end up reading more content from lookaside
- * than necessary.
+ * Otherwise, skip reading lookaside history if everything on the page is older than the read
+ * timestamp, and the oldest update in lookaside newer than the page is in the future of the
+ * reader. This seems unlikely, but is exactly what eviction tries to do when a checkpoint is
+ * running.
*/
- unstable_timestamp = WT_SESSION_IS_CHECKPOINT(session) ?
- ref->page_las->unstable_durable_timestamp :
- ref->page_las->unstable_timestamp;
- if (ref->page_las->skew_newest && !ref->page_las->has_prepares &&
- txn->read_timestamp > unstable_timestamp)
+ if (!ref->page_las->has_prepares && ref->page_las->min_skipped_ts == WT_TS_MAX &&
+ txn->read_timestamp >= ref->page_las->max_ondisk_ts)
return (true);
- if (!ref->page_las->skew_newest && txn->read_timestamp < unstable_timestamp)
+
+ if (txn->read_timestamp >= ref->page_las->max_ondisk_ts &&
+ txn->read_timestamp < ref->page_las->min_skipped_ts)
return (true);
return (false);
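
Restated as a predicate: read the page if its history overlaps the snapshot; otherwise skip when the image is complete (no prepares, nothing skipped) or, for timestamp readers, when the read timestamp falls between the newest on-disk value and the oldest skipped one. A sketch with hypothetical field names mirroring the code above:

    #include <stdbool.h>
    #include <stdint.h>

    #define TS_MAX UINT64_MAX

    struct las_state {      /* hypothetical mirror of the page's lookaside state */
        uint64_t max_txn, max_ondisk_ts, min_skipped_ts;
        bool has_prepares;
    };

    /* Sketch: can a reader use the on-disk image without lookaside history? */
    static bool
    can_skip(const struct las_state *s, uint64_t snap_min, bool has_read_ts,
        uint64_t read_ts)
    {
        if (snap_min <= s->max_txn)     /* history overlaps the snapshot */
            return (false);
        if (!has_read_ts)               /* no timestamp: image must be complete */
            return (!s->has_prepares && s->min_skipped_ts == TS_MAX);
        if (!s->has_prepares && s->min_skipped_ts == TS_MAX &&
            read_ts >= s->max_ondisk_ts)
            return (true);              /* everything on the page is old enough */
        return (read_ts >= s->max_ondisk_ts && read_ts < s->min_skipped_ts);
    }
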
@@ -502,6 +493,7 @@ __las_remove_block(WT_CURSOR *cursor, uint64_t pageid, bool lock_wait, uint64_t
bool local_txn;
*remove_cntp = 0;
+ saved_isolation = 0; /* [-Wconditional-uninitialized] */
session = (WT_SESSION_IMPL *)cursor->session;
conn = S2C(session);
@@ -513,8 +505,8 @@ __las_remove_block(WT_CURSOR *cursor, uint64_t pageid, bool lock_wait, uint64_t
else
WT_RET(__wt_try_writelock(session, &conn->cache->las_sweepwalk_lock));
- __las_set_isolation(session, &saved_isolation);
WT_ERR(__wt_txn_begin(session, NULL));
+ __las_set_isolation(session, &saved_isolation);
local_txn = true;
/*
@@ -539,9 +531,9 @@ err:
ret = __wt_txn_commit(session, NULL);
else
WT_TRET(__wt_txn_rollback(session, NULL));
+ __las_restore_isolation(session, saved_isolation);
}
- __las_restore_isolation(session, saved_isolation);
__wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
return (ret);
}
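
The reordering ties the isolation override to the transaction's lifetime: override only once begin succeeds, restore on every path that ends the transaction. A sketch of the pairing (all names hypothetical):

    struct sess;
    extern int txn_begin(struct sess *), txn_commit(struct sess *);
    extern void txn_rollback(struct sess *);
    extern int set_isolation(struct sess *, int);
    extern void restore_isolation(struct sess *, int);
    extern int do_work(struct sess *);
    #define ISO_READ_UNCOMMITTED 1

    /* Sketch: scope an isolation override to the lifetime of a local txn. */
    static int
    with_local_txn(struct sess *s)
    {
        int ret, saved;

        if ((ret = txn_begin(s)) != 0)
            return (ret);               /* nothing to restore if begin fails */
        saved = set_isolation(s, ISO_READ_UNCOMMITTED);

        ret = do_work(s);

        if (ret == 0)
            ret = txn_commit(s);
        else
            txn_rollback(s);
        restore_isolation(s, saved);    /* restored on every txn exit path */
        return (ret);
    }
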
@@ -586,16 +578,15 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_MULTI *
"file ID %" PRIu32 ", page ID %" PRIu64
". "
"Max txn ID %" PRIu64
- ", unstable timestamp %s,"
- " unstable durable timestamp %s, %s. "
+ ", max ondisk timestamp %s, "
+ "first skipped ts %s. "
"Entries now in lookaside file: %" PRId64
", "
"cache dirty: %2.3f%% , "
"cache use: %2.3f%%",
btree_id, multi->page_las.las_pageid, multi->page_las.max_txn,
- __wt_timestamp_to_string(multi->page_las.unstable_timestamp, ts_string[0]),
- __wt_timestamp_to_string(multi->page_las.unstable_durable_timestamp, ts_string[1]),
- multi->page_las.skew_newest ? "newest" : "not newest",
+ __wt_timestamp_to_string(multi->page_las.max_ondisk_ts, ts_string[0]),
+ __wt_timestamp_to_string(multi->page_las.min_skipped_ts, ts_string[1]),
WT_STAT_READ(conn->stats, cache_lookaside_entries), pct_dirty, pct_full);
}
@@ -629,6 +620,7 @@ __wt_las_insert_block(
session = (WT_SESSION_IMPL *)cursor->session;
conn = S2C(session);
WT_CLEAR(las_value);
+ saved_isolation = 0; /* [-Wconditional-uninitialized] */
insert_cnt = prepared_insert_cnt = 0;
btree_id = btree->id;
local_txn = false;
@@ -650,8 +642,8 @@ __wt_las_insert_block(
#endif
/* Wrap all the updates in a transaction. */
- __las_set_isolation(session, &saved_isolation);
WT_ERR(__wt_txn_begin(session, NULL));
+ __las_set_isolation(session, &saved_isolation);
local_txn = true;
/* Enter each update in the boundary's list into the lookaside store. */
@@ -746,18 +738,14 @@ __wt_las_insert_block(
if (upd == list->onpage_upd && upd->size > 0 &&
(upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_MODIFY)) {
las_value.size = 0;
- WT_ASSERT(session, upd != first_upd || multi->page_las.skew_newest);
cursor->set_value(cursor, upd->txnid, upd->start_ts, upd->durable_ts,
upd->prepare_state, WT_UPDATE_BIRTHMARK, &las_value);
} else
cursor->set_value(cursor, upd->txnid, upd->start_ts, upd->durable_ts,
upd->prepare_state, upd->type, &las_value);
- /*
- * Using update looks a little strange because the keys are guaranteed to not exist, but
- * since we're appending, we want the cursor to stay positioned in between inserts.
- */
- WT_ERR(cursor->update(cursor));
+ /* Using insert so we don't keep the page pinned longer than necessary. */
+ WT_ERR(cursor->insert(cursor));
++insert_cnt;
if (upd->prepare_state == WT_PREPARE_INPROGRESS)
++prepared_insert_cnt;
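
For reference, the insert pattern against the public cursor API looks like this (a sketch assuming string key/value formats, error handling elided):

    #include <wiredtiger.h>

    /*
     * Sketch: add a record with WT_CURSOR.insert, which doesn't leave the
     * cursor positioned (and so doesn't pin the page between calls).
     */
    static int
    append_one(WT_CURSOR *cursor, const char *key, const char *value)
    {
        cursor->set_key(cursor, key);
        cursor->set_value(cursor, value);
        return (cursor->insert(cursor));
    }
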
@@ -780,6 +768,7 @@ err:
ret = __wt_txn_commit(session, NULL);
else
WT_TRET(__wt_txn_rollback(session, NULL));
+ __las_restore_isolation(session, saved_isolation);
/* Adjust the entry count. */
if (ret == 0) {
@@ -789,8 +778,6 @@ err:
}
}
- __las_restore_isolation(session, saved_isolation);
-
if (ret == 0 && insert_cnt > 0) {
multi->page_las.las_pageid = las_pageid;
multi->page_las.has_prepares = prepared_insert_cnt > 0;
@@ -834,13 +821,12 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint64_t pageid)
WT_RET(cursor->next(cursor));
/*
- * Because of the special visibility rules for lookaside, a new
- * block can appear in between our search and the block of
- * interest. Keep trying while we have a key lower than we
+ * Because of the special visibility rules for lookaside, a new block can appear in between
+ * our search and the block of interest. Keep trying while we have a key lower than we
* expect.
*
- * There may be no block of lookaside entries if they have been
- * removed by WT_CONNECTION::rollback_to_stable.
+ * There may be no block of lookaside entries if they have been removed by
+ * WT_CONNECTION::rollback_to_stable.
*/
WT_RET(cursor->get_key(cursor, &las_pageid, &las_id, &las_counter, &las_key));
if (las_pageid >= pageid)
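
The positioning loop is a bounded scan: land near the key with search_near, then step forward while still before the block of interest. A sketch against the public API, assuming a cursor with a uint64_t ("Q") key format:

    #include <stdint.h>
    #include <wiredtiger.h>

    /* Sketch: position "cursor" at the first record with key >= target. */
    static int
    position_at(WT_CURSOR *cursor, uint64_t target)
    {
        uint64_t key;
        int exact, ret;

        cursor->set_key(cursor, target);
        if ((ret = cursor->search_near(cursor, &exact)) != 0)
            return (ret);
        while (exact < 0) {             /* positioned before the target */
            if ((ret = cursor->next(cursor)) != 0)
                return (ret);
            if ((ret = cursor->get_key(cursor, &key)) != 0)
                return (ret);
            exact = key >= target ? 0 : -1;
        }
        return (0);
    }
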
@@ -939,20 +925,17 @@ __las_sweep_count(WT_CACHE *cache)
uint64_t las_entry_count;
/*
- * The sweep server is a slow moving thread. Try to review the entire
- * lookaside table once every 5 minutes.
+ * The sweep server is a slow-moving thread. Try to review the entire lookaside table once every
+ * 5 minutes.
*
- * The reason is because the lookaside table exists because we're seeing
- * cache/eviction pressure (it allows us to trade performance and disk
- * space for cache space), and it's likely lookaside blocks are being
- * evicted, and reading them back in doesn't help things. A trickier,
- * but possibly better, alternative might be to review all lookaside
- * blocks in the cache in order to get rid of them, and slowly review
- * lookaside blocks that have already been evicted.
+ * The reason is that the lookaside table exists because we're seeing cache/eviction pressure
+ * (it allows us to trade performance and disk space for cache space), and it's likely lookaside
+ * blocks are being evicted, and reading them back in doesn't help things. A trickier, but
+ * possibly better, alternative might be to review all lookaside blocks in the cache in order to
+ * get rid of them, and slowly review lookaside blocks that have already been evicted.
*
- * Put upper and lower bounds on the calculation: since reads of pages
- * with lookaside entries are blocked during sweep, make sure we do
- * some work but don't block reads for too long.
+ * Put upper and lower bounds on the calculation: since reads of pages with lookaside entries
+ * are blocked during sweep, make sure we do some work but don't block reads for too long.
*/
las_entry_count = __las_entry_count(cache);
return (
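
The bounds computation reduces to a division and a clamp. A sketch (bound parameters hypothetical):

    #include <stdint.h>

    /* Sketch: how many entries to review per sweep pass. */
    static uint64_t
    sweep_count(uint64_t entry_count, uint64_t passes_per_5min,
        uint64_t min_per_pass, uint64_t max_per_pass)
    {
        uint64_t n;

        if (passes_per_5min == 0)
            passes_per_5min = 1;
        n = entry_count / passes_per_5min;
        if (n < min_per_pass)           /* make sure we do some work */
            n = min_per_pass;
        if (n > max_per_pass)           /* but don't block reads for too long */
            n = max_per_pass;
        return (n);
    }
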
@@ -977,22 +960,17 @@ __las_sweep_init(WT_SESSION_IMPL *session)
/*
* If no files have been dropped and the lookaside file is empty, there's nothing to do.
*/
- if (cache->las_dropped_next == 0) {
- if (__wt_las_empty(session))
- ret = WT_NOTFOUND;
- goto err;
- }
+ if (cache->las_dropped_next == 0 && __wt_las_empty(session))
+ WT_ERR(WT_NOTFOUND);
/*
* Record the current page ID: sweep will stop after this point.
*
- * Since the btree IDs we're scanning are closed, any eviction must
- * have already completed, so we won't miss anything with this
- * approach.
+ * Since the btree IDs we're scanning are closed, any eviction must have already completed, so
+ * we won't miss anything with this approach.
*
- * Also, if a tree is reopened and there is lookaside activity before
- * this sweep completes, it will have a higher page ID and should not
- * be removed.
+ * Also, if a tree is reopened and there is lookaside activity before this sweep completes, it
+ * will have a higher page ID and should not be removed.
*/
cache->las_sweep_max_pageid = cache->las_pageid;
@@ -1044,6 +1022,7 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
cache = S2C(session)->cache;
cursor = NULL;
sweep_key = &cache->las_sweep_key;
+ saved_isolation = 0; /* [-Wconditional-uninitialized] */
remove_cnt = 0;
session_flags = 0; /* [-Werror=maybe-uninitialized] */
local_txn = locked = removing_key_block = false;
@@ -1063,20 +1042,18 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
*/
__wt_las_cursor(session, &cursor, &session_flags);
WT_ASSERT(session, cursor->session == &session->iface);
- __las_set_isolation(session, &saved_isolation);
WT_ERR(__wt_txn_begin(session, NULL));
+ __las_set_isolation(session, &saved_isolation);
local_txn = true;
/* Encourage a race */
__wt_timing_stress(session, WT_TIMING_STRESS_LOOKASIDE_SWEEP);
/*
- * When continuing a sweep, position the cursor using the key from the
- * last call (we don't care if we're before or after the key, either
- * side is fine).
+ * When continuing a sweep, position the cursor using the key from the last call (we don't care
+ * if we're before or after the key, either side is fine).
*
- * Otherwise, we're starting a new sweep, gather the list of trees to
- * sweep.
+ * Otherwise, we're starting a new sweep, gather the list of trees to sweep.
*/
if (sweep_key->size != 0) {
__wt_cursor_set_raw_key(cursor, sweep_key);
@@ -1137,10 +1114,9 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
/*
* If the entry belongs to a dropped tree, discard it.
*
- * Cursor opened overwrite=true: won't return WT_NOTFOUND
- * should another thread remove the record before we do (not
- * expected for dropped trees), and the cursor remains
- * positioned in that case.
+ * Cursor opened overwrite=true: won't return WT_NOTFOUND should another thread remove the
+ * record before we do (not expected for dropped trees), and the cursor remains positioned
+ * in that case.
*/
if (las_id >= cache->las_sweep_dropmin && las_id <= cache->las_sweep_dropmax &&
__bit_test(cache->las_sweep_dropmap, las_id - cache->las_sweep_dropmin)) {
@@ -1161,13 +1137,11 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
&prepare_state, &upd_type, &las_value));
/*
- * Check to see if the page or key has changed this iteration,
- * and if they have, setup context for safely removing obsolete
- * updates.
+ * Check to see if the page or key has changed this iteration, and if they have, setup
+ * context for safely removing obsolete updates.
*
- * It's important to check for page boundaries explicitly
- * because it is possible for the same key to be at the start
- * of the next block. See WT-3982 for details.
+ * It's important to check for page boundaries explicitly because it is possible for the
+ * same key to be at the start of the next block. See WT-3982 for details.
*/
if (las_pageid != saved_pageid || saved_key->size != las_key.size ||
memcmp(saved_key->data, las_key.data, las_key.size) != 0) {
@@ -1246,11 +1220,11 @@ err:
ret = __wt_txn_commit(session, NULL);
else
WT_TRET(__wt_txn_rollback(session, NULL));
+ __las_restore_isolation(session, saved_isolation);
if (ret == 0)
(void)__wt_atomic_add64(&cache->las_remove_count, remove_cnt);
}
- __las_restore_isolation(session, saved_isolation);
WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
if (locked)
diff --git a/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c b/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c
index db0e01c35fc..7484e9e72ce 100644
--- a/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c
+++ b/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c
@@ -32,12 +32,12 @@
#include <stddef.h>
/*
- * The hardware-accelerated checksum code that originally shipped on Windows
- * did not correctly handle memory that wasn't 8B aligned and a multiple of 8B.
- * It's likely that calculations were always 8B aligned, but there's some risk.
+ * The hardware-accelerated checksum code that originally shipped on Windows did not correctly
+ * handle memory that wasn't 8B aligned and a multiple of 8B. It's likely that calculations were
+ * always 8B aligned, but there's some risk.
*
- * What we do is always write the correct checksum, and if a checksum test
- * fails, check it against the alternate version have before failing.
+ * What we do is always write the correct checksum, and if a checksum test fails, check it against
+ * the alternate version we have before failing.
*/
#if defined(_M_AMD64) && !defined(HAVE_NO_CRC32_HARDWARE)
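
Verification with a fallback is one extra comparison. A sketch with hypothetical functions standing in for the correct and legacy implementations:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    extern uint32_t crc32c(const void *, size_t);        /* correct implementation */
    extern uint32_t crc32c_legacy(const void *, size_t); /* old 8B-aligned variant */

    /* Sketch: accept blocks written with either checksum implementation. */
    static bool
    checksum_match(const void *buf, size_t len, uint32_t stored)
    {
        if (crc32c(buf, len) == stored)
            return (true);
        return (crc32c_legacy(buf, len) == stored);
    }
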
diff --git a/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c b/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
index 3fcfcf69887..dfa1d9e03b2 100644
--- a/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
+++ b/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
@@ -1,9 +1,7 @@
/*
- * CRC-32 algorithms implemented with the z/Architecture
- * Vector Extension Facility.
+ * CRC-32 algorithms implemented with the z/Architecture Vector Extension Facility.
*
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ * Copyright IBM Corp. 2015 Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*
*/
@@ -48,10 +46,9 @@ __wt_crc32c_le(unsigned int crc, const unsigned char *buf, size_t len)
/*
* DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
*
- * Creates a function to perform a particular CRC-32 computation. Depending
- * on the message buffer, the hardware-accelerated or software implementation
- * is used. Note that the message buffer is aligned to improve fetch
- * operations of VECTOR LOAD MULTIPLE instructions.
+ * Creates a function to perform a particular CRC-32 computation. Depending on the message buffer,
+ * the hardware-accelerated or software implementation is used. Note that the message buffer is
+ * aligned to improve fetch operations of VECTOR LOAD MULTIPLE instructions.
*
*/
#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \
diff --git a/src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h b/src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h
index dae4b9d1c1e..88ddc900243 100644
--- a/src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h
+++ b/src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h
@@ -1,4 +1,5 @@
/* CRC-32 and CRC-32C slicing-by-8 constants, for use on big-endian systems. */
+#if 0
static const unsigned int __attribute__((aligned(128))) crc32table_le[8][256] = {
{0x00000000, 0x96300777, 0x2c610eee, 0xba510999, 0x19c46d07, 0x8ff46a70, 0x35a563e9, 0xa395649e,
0x3288db0e, 0xa4b8dc79, 0x1ee9d5e0, 0x88d9d297, 0x2b4cb609, 0xbd7cb17e, 0x072db8e7, 0x911dbf90,
@@ -257,7 +258,9 @@ static const unsigned int __attribute__((aligned(128))) crc32table_le[8][256] =
0x4a146bff, 0xd414c133, 0x37134ebd, 0xa913e471, 0xb01a217b, 0x2e1a8bb7, 0xcd1d0439, 0x531daef5,
0xff0f8e2c, 0x610f24e0, 0x8208ab6e, 0x1c0801a2, 0x0501c4a8, 0x9b016e64, 0x7806e1ea,
0xe6064b26}};
+#endif /* NOT CURRENTLY USED */
+#if 0
static const unsigned int __attribute__((aligned(128))) crc32table_be[8][256] = {
{0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
@@ -516,6 +519,7 @@ static const unsigned int __attribute__((aligned(128))) crc32table_be[8][256] =
0x3548049b, 0x6ee9d851, 0x820bbd0f, 0xd9aa61c5, 0x5f0e6a04, 0x04afb6ce, 0xe84dd390, 0xb3ec0f5a,
0xe1c4d9a5, 0xba65056f, 0x56876031, 0x0d26bcfb, 0x8b82b73a, 0xd0236bf0, 0x3cc10eae,
0x6760d264}};
+#endif /* NOT CURRENTLY USED */
static const unsigned int __attribute__((aligned(128))) crc32ctable_le[8][256] = {
{0x00000000, 0x03836bf2, 0xf7703be1, 0xf4f35013, 0x1f979ac7, 0x1c14f135, 0xe8e7a126, 0xeb64cad4,
@@ -776,6 +780,7 @@ static const unsigned int __attribute__((aligned(128))) crc32ctable_le[8][256] =
0xa1354ce5, 0x864870ac, 0xefcf3477, 0xc8b2083e, 0xccb751c4, 0xebca6d8d, 0x824d2956,
0xa530151f}};
+#if 0
static const unsigned int __attribute__((aligned(128))) crc32ctable_be[8][256] = {
{0x00000000, 0x1edc6f41, 0x3db8de82, 0x2364b1c3, 0x7b71bd04, 0x65add245, 0x46c96386, 0x58150cc7,
0xf6e37a08, 0xe83f1549, 0xcb5ba48a, 0xd587cbcb, 0x8d92c70c, 0x934ea84d, 0xb02a198e, 0xaef676cf,
@@ -1034,3 +1039,4 @@ static const unsigned int __attribute__((aligned(128))) crc32ctable_be[8][256] =
0x7b80461d, 0x5de9c631, 0x37534645, 0x113ac669, 0xe22646ad, 0xc44fc681, 0xaef546f5, 0x889cc6d9,
0x5610283c, 0x7079a810, 0x1ac32864, 0x3caaa848, 0xcfb6288c, 0xe9dfa8a0, 0x836528d4,
0xa50ca8f8}};
+#endif /* NOT CURRENTLY USED */
diff --git a/src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h b/src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h
index bf022d5ad9d..b8726e1f76f 100644
--- a/src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h
+++ b/src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h
@@ -1,11 +1,10 @@
/*
* Support for Vector Instructions
*
- * Assembler macros to generate .byte/.word code for particular
- * vector instructions that are supported by recent binutils (>= 2.26) only.
+ * Assembler macros to generate .byte/.word code for particular vector instructions that are
+ * supported by recent binutils (>= 2.26) only.
*
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ * Copyright IBM Corp. 2015 Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
#ifndef __ASM_S390_VX_INSN_H
diff --git a/src/third_party/wiredtiger/src/config/config_api.c b/src/third_party/wiredtiger/src/config/config_api.c
index e489e932247..a50754a59a6 100644
--- a/src/third_party/wiredtiger/src/config/config_api.c
+++ b/src/third_party/wiredtiger/src/config/config_api.c
@@ -160,8 +160,7 @@ __conn_foc_add(WT_SESSION_IMPL *session, const void *p)
conn = S2C(session);
/*
- * Callers of this function are expected to be holding the connection's
- * api_lock.
+ * Callers of this function are expected to be holding the connection's api_lock.
*
* All callers of this function currently ignore errors.
*/
@@ -255,8 +254,7 @@ __wt_configure_method(WT_SESSION_IMPL *session, const char *method, const char *
/*
* Allocate new configuration entry and fill it in.
*
- * The new base value is the previous base value, a separator and the
- * new configuration string.
+ * The new base value is the previous base value, a separator and the new configuration string.
*/
WT_ERR(__wt_calloc_one(session, &entry));
entry->method = (*epp)->method;
diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c
index 292f3fcbe4a..e1bdc202b2b 100644
--- a/src/third_party/wiredtiger/src/config/config_collapse.c
+++ b/src/third_party/wiredtiger/src/config/config_collapse.c
@@ -56,9 +56,8 @@ __wt_config_collapse(WT_SESSION_IMPL *session, const char **cfg, char **config_r
goto err;
/*
- * If the caller passes us no valid configuration strings, we get here
- * with no bytes to copy -- that's OK, the underlying string copy can
- * handle empty strings.
+ * If the caller passes us no valid configuration strings, we get here with no bytes to copy --
+ * that's OK, the underlying string copy can handle empty strings.
*
* Strip any trailing comma.
*/
@@ -145,9 +144,8 @@ keep:
goto err;
/*
- * If the caller passes us only default configuration strings, we get
- * here with no bytes to copy -- that's OK, the underlying string copy
- * can handle empty strings.
+ * If the caller passes us only default configuration strings, we get here with no bytes to copy
+ * -- that's OK, the underlying string copy can handle empty strings.
*
* Strip any trailing comma.
*/
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index e23c4dd4c5e..958c267a7ce 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -118,6 +118,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{"log", "category", NULL, NULL, confchk_WT_CONNECTION_reconfigure_log_subconfigs, 4},
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"lsm_merge", "boolean", NULL, NULL, NULL, 0},
+ {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
{"shared_cache", "category", NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5},
@@ -190,7 +191,8 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_begin_transaction[] = {
"choices=[\"read-uncommitted\",\"read-committed\","
"\"snapshot\"]",
NULL, 0},
- {"name", "string", NULL, NULL, NULL, 0}, {"priority", "int", NULL, "min=-100,max=100", NULL, 0},
+ {"name", "string", NULL, NULL, NULL, 0}, {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
+ {"priority", "int", NULL, "min=-100,max=100", NULL, 0},
{"read_timestamp", "string", NULL, NULL, NULL, 0},
{"roundup_timestamps", "category", NULL, NULL,
confchk_WT_SESSION_begin_transaction_roundup_timestamps_subconfigs, 2},
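
The new operation_timeout_ms setting is an ordinary integer configuration string that applications can pass at wiredtiger_open or per transaction. A sketch against the public API (error handling elided; operations exceeding the cap are expected to fail, per the option's intent):

    #include <wiredtiger.h>

    /* Sketch: cap operations in one transaction at two seconds. */
    static int
    timed_txn(WT_SESSION *session)
    {
        int ret;

        if ((ret = session->begin_transaction(session,
                 "operation_timeout_ms=2000")) != 0)
            return (ret);
        /* ... cursor work; calls may fail once the cap is exceeded ... */
        return (session->commit_transaction(session, NULL));
    }
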
@@ -551,6 +553,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"lsm_merge", "boolean", NULL, NULL, NULL, 0}, {"mmap", "boolean", NULL, NULL, NULL, 0},
{"multiprocess", "boolean", NULL, NULL, NULL, 0},
+ {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
{"readonly", "boolean", NULL, NULL, NULL, 0}, {"salvage", "boolean", NULL, NULL, NULL, 0},
@@ -618,6 +621,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"lsm_merge", "boolean", NULL, NULL, NULL, 0}, {"mmap", "boolean", NULL, NULL, NULL, 0},
{"multiprocess", "boolean", NULL, NULL, NULL, 0},
+ {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
{"readonly", "boolean", NULL, NULL, NULL, 0}, {"salvage", "boolean", NULL, NULL, NULL, 0},
@@ -685,6 +689,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"lsm_merge", "boolean", NULL, NULL, NULL, 0}, {"mmap", "boolean", NULL, NULL, NULL, 0},
{"multiprocess", "boolean", NULL, NULL, NULL, 0},
+ {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
{"readonly", "boolean", NULL, NULL, NULL, 0}, {"salvage", "boolean", NULL, NULL, NULL, 0},
@@ -750,6 +755,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"lsm_merge", "boolean", NULL, NULL, NULL, 0}, {"mmap", "boolean", NULL, NULL, NULL, 0},
{"multiprocess", "boolean", NULL, NULL, NULL, 0},
+ {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
{"readonly", "boolean", NULL, NULL, NULL, 0}, {"salvage", "boolean", NULL, NULL, NULL, 0},
@@ -815,12 +821,12 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"close_scan_interval=10),io_capacity=(total=0),log=(archive=true,"
"os_cache_dirty_pct=0,prealloc=true,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
- "operation_tracking=(enabled=false,path=\".\"),"
- "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
- "statistics=none,statistics_log=(json=false,on_close=false,"
- "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
+ "operation_timeout_ms=0,operation_tracking=(enabled=false,"
+ "path=\".\"),shared_cache=(chunk=10MB,name=,quota=0,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(json=false,"
+ "on_close=false,sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,verbose=",
- confchk_WT_CONNECTION_reconfigure, 26},
+ confchk_WT_CONNECTION_reconfigure, 27},
{"WT_CONNECTION.rollback_to_stable", "", NULL, 0}, {"WT_CONNECTION.set_file_system", "", NULL, 0},
{"WT_CONNECTION.set_timestamp",
"commit_timestamp=,durable_timestamp=,force=false,"
@@ -836,9 +842,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
",os_cache_max=0",
confchk_WT_SESSION_alter, 8},
{"WT_SESSION.begin_transaction",
- "ignore_prepare=false,isolation=,name=,priority=0,read_timestamp="
- ",roundup_timestamps=(prepared=false,read=false),snapshot=,sync=",
- confchk_WT_SESSION_begin_transaction, 8},
+ "ignore_prepare=false,isolation=,name=,operation_timeout_ms=0,"
+ "priority=0,read_timestamp=,roundup_timestamps=(prepared=false,"
+ "read=false),snapshot=,sync=",
+ confchk_WT_SESSION_begin_transaction, 9},
{"WT_SESSION.checkpoint", "drop=,force=false,name=,target=,use_timestamp=true",
confchk_WT_SESSION_checkpoint, 5},
{"WT_SESSION.close", "", NULL, 0},
@@ -989,16 +996,16 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\","
"prealloc=true,recover=on,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
- "mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
- "path=\".\"),readonly=false,salvage=false,session_max=100,"
- "session_scratch_max=2MB,session_table_cache=true,"
- "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
- "statistics=none,statistics_log=(json=false,on_close=false,"
- "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "timing_stress_for_test=,transaction_sync=(enabled=false,"
- "method=fsync),use_environment=true,use_environment_priv=false,"
+ "mmap=true,multiprocess=false,operation_timeout_ms=0,"
+ "operation_tracking=(enabled=false,path=\".\"),readonly=false,"
+ "salvage=false,session_max=100,session_scratch_max=2MB,"
+ "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
+ ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),use_environment=true,use_environment_priv=false,"
"verbose=,write_through=",
- confchk_wiredtiger_open, 50},
+ confchk_wiredtiger_open, 51},
{"wiredtiger_open_all",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
@@ -1019,16 +1026,16 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\","
"prealloc=true,recover=on,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
- "mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
- "path=\".\"),readonly=false,salvage=false,session_max=100,"
- "session_scratch_max=2MB,session_table_cache=true,"
- "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
- "statistics=none,statistics_log=(json=false,on_close=false,"
- "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "timing_stress_for_test=,transaction_sync=(enabled=false,"
- "method=fsync),use_environment=true,use_environment_priv=false,"
+ "mmap=true,multiprocess=false,operation_timeout_ms=0,"
+ "operation_tracking=(enabled=false,path=\".\"),readonly=false,"
+ "salvage=false,session_max=100,session_scratch_max=2MB,"
+ "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
+ ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),use_environment=true,use_environment_priv=false,"
"verbose=,version=(major=0,minor=0),write_through=",
- confchk_wiredtiger_open_all, 51},
+ confchk_wiredtiger_open_all, 52},
{"wiredtiger_open_basecfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
@@ -1047,15 +1054,15 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\","
"prealloc=true,recover=on,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
- "mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
- "path=\".\"),readonly=false,salvage=false,session_max=100,"
- "session_scratch_max=2MB,session_table_cache=true,"
- "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
- "statistics=none,statistics_log=(json=false,on_close=false,"
- "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "timing_stress_for_test=,transaction_sync=(enabled=false,"
- "method=fsync),verbose=,version=(major=0,minor=0),write_through=",
- confchk_wiredtiger_open_basecfg, 45},
+ "mmap=true,multiprocess=false,operation_timeout_ms=0,"
+ "operation_tracking=(enabled=false,path=\".\"),readonly=false,"
+ "salvage=false,session_max=100,session_scratch_max=2MB,"
+ "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
+ ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),verbose=,version=(major=0,minor=0),write_through=",
+ confchk_wiredtiger_open_basecfg, 46},
{"wiredtiger_open_usercfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
@@ -1074,15 +1081,15 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\","
"prealloc=true,recover=on,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
- "mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
- "path=\".\"),readonly=false,salvage=false,session_max=100,"
- "session_scratch_max=2MB,session_table_cache=true,"
- "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
- "statistics=none,statistics_log=(json=false,on_close=false,"
- "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "timing_stress_for_test=,transaction_sync=(enabled=false,"
- "method=fsync),verbose=,write_through=",
- confchk_wiredtiger_open_usercfg, 44},
+ "mmap=true,multiprocess=false,operation_timeout_ms=0,"
+ "operation_tracking=(enabled=false,path=\".\"),readonly=false,"
+ "salvage=false,session_max=100,session_scratch_max=2MB,"
+ "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
+ ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),verbose=,write_through=",
+ confchk_wiredtiger_open_usercfg, 45},
{NULL, NULL, NULL, 0}};
int
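
The checkers above only admit explicit timeouts of 1ms and up, while the new default strings leave operation_timeout_ms=0, meaning no timeout. A minimal usage sketch of the new key, assuming a hypothetical home directory and illustrative values:

    #include <wiredtiger.h>

    int
    example_operation_timeout(void)
    {
        WT_CONNECTION *conn;
        WT_SESSION *session;
        int ret;

        /* Connection-wide setting: time out operations after two seconds. */
        if ((ret = wiredtiger_open(
               "WT_HOME", NULL, "create,operation_timeout_ms=2000", &conn)) != 0)
            return (ret);
        if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
            return (ret);

        /* Per-transaction setting: 500ms for this transaction only. */
        if ((ret = session->begin_transaction(session, "operation_timeout_ms=500")) != 0)
            return (ret);
        if ((ret = session->rollback_transaction(session, NULL)) != 0)
            return (ret);
        return (conn->close(conn, NULL));
    }
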
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index c7e776c62c4..56b3febfeb1 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1010,7 +1010,6 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config)
WT_SESSION *wt_session;
WT_SESSION_IMPL *s, *session;
uint32_t i;
- const char *ckpt_cfg;
conn = (WT_CONNECTION_IMPL *)wt_conn;
@@ -1074,47 +1073,24 @@ err:
WT_TRET(__wt_lsm_manager_destroy(session));
/*
- * After the async and LSM threads have exited, we shouldn't opening any more files.
+ * After the async and LSM threads have exited, we won't open more files for the application.
+ * However, the sweep server is still running and it can close file handles at the same time the
+ * final checkpoint is reviewing open data handles (forcing checkpoint to reopen handles). Shut
+ * down the sweep server and then flag that the system should not open anything new.
*/
+ WT_TRET(__wt_sweep_destroy(session));
F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS);
WT_FULL_BARRIER();
- /* The default session is used to access data handles during close. */
- F_CLR(session, WT_SESSION_NO_DATA_HANDLES);
-
/*
- * Perform a system-wide checkpoint so that all tables are consistent with each other. All
- * transactions are resolved but ignore timestamps to make sure all data gets to disk. Do this
- * before shutting down all the subsystems. We have shut down all user sessions, but send in
- * true for waiting for internal races.
+ * Shut down the checkpoint and capacity server threads: we don't want to throttle writes and
+ * we're about to do a final checkpoint separately from the checkpoint server.
*/
- WT_TRET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
- ckpt_cfg = "use_timestamp=false";
- if (cval.val != 0) {
- ckpt_cfg = "use_timestamp=true";
- if (conn->txn_global.has_stable_timestamp)
- F_SET(conn, WT_CONN_CLOSING_TIMESTAMP);
- }
- if (!F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY)) {
- s = NULL;
- WT_TRET(__wt_open_internal_session(conn, "close_ckpt", true, 0, &s));
- if (s != NULL) {
- const char *checkpoint_cfg[] = {
- WT_CONFIG_BASE(session, WT_SESSION_checkpoint), ckpt_cfg, NULL};
- wt_session = &s->iface;
- WT_TRET(__wt_txn_checkpoint(s, checkpoint_cfg, true));
+ WT_TRET(__wt_capacity_server_destroy(session));
+ WT_TRET(__wt_checkpoint_server_destroy(session));
- /*
- * Mark the metadata dirty so we flush it on close, allowing recovery to be skipped.
- */
- WT_WITH_DHANDLE(s, WT_SESSION_META_DHANDLE(s), __wt_tree_modify_set(s));
-
- WT_TRET(wt_session->close(wt_session, config));
- }
- }
-
- /* Shut down the global transaction state. */
- __wt_txn_global_shutdown(session);
+ /* Perform a final checkpoint and shut down the global transaction state. */
+ WT_TRET(__wt_txn_global_shutdown(session, config, cfg));
if (ret != 0) {
__wt_err(session, ret, "failure during close, disabling further writes");
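
The reworked close path still honors the close-time checkpoint configuration read by the removed code above; a minimal sketch of the public call it serves, using the pre-existing use_timestamp setting:

    #include <wiredtiger.h>

    int
    example_close(WT_CONNECTION *conn)
    {
        /* The final checkpoint now runs inside __wt_txn_global_shutdown(). */
        return (conn->close(conn, "use_timestamp=true"));
    }
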
@@ -1375,12 +1351,11 @@ __conn_config_file(
len = (size_t)size;
/*
- * Copy the configuration file into memory, with a little slop, I'm not
- * interested in debugging off-by-ones.
+ * Copy the configuration file into memory, with a little slop, I'm not interested in debugging
+ * off-by-ones.
*
- * The beginning of a file is the same as if we run into an unquoted
- * newline character, simplify the parsing loop by pretending that's
- * what we're doing.
+ * The beginning of a file is the same as if we run into an unquoted newline character, simplify
+ * the parsing loop by pretending that's what we're doing.
*/
WT_ERR(__wt_buf_init(session, cbuf, len + 10));
WT_ERR(__wt_read(session, fh, (wt_off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
@@ -1429,11 +1404,10 @@ __conn_config_file(
}
/*
- * Replace any newline characters with commas (and strings of
- * commas are safe).
+ * Replace any newline characters with commas (and strings of commas are safe).
*
- * After any newline, skip to a non-white-space character; if
- * the next character is a hash mark, skip to the next newline.
+ * After any newline, skip to a non-white-space character; if the next character is a hash
+ * mark, skip to the next newline.
*/
for (;;) {
for (*t++ = ','; --len > 0 && __wt_isspace((u_char) * ++p);)
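
A standalone sketch of the newline-to-comma transform that comment describes, using a hypothetical helper; unlike the real parser, this version ignores quoted strings and does not treat start-of-file as a newline:

    #include <ctype.h>

    static void
    flatten_config(char *buf)
    {
        char *p, *out;
        int in_comment;

        in_comment = 0;
        for (p = out = buf; *p != '\0'; ++p) {
            if (in_comment) {       /* Discard through the end of a comment line. */
                if (*p == '\n')
                    in_comment = 0;
                continue;
            }
            if (*p == '\n') {       /* Newlines become commas; runs of commas are safe. */
                *out++ = ',';
                /* Skip whitespace; a following hash mark starts a comment line. */
                while (p[1] != '\0' && p[1] != '\n' && isspace((unsigned char)p[1]))
                    ++p;
                if (p[1] == '#')
                    in_comment = 1;
                continue;
            }
            *out++ = *p;
        }
        *out = '\0';
    }
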
@@ -1496,8 +1470,8 @@ __conn_env_var(WT_SESSION_IMPL *session, const char *cfg[], const char *name, co
/*
* Security stuff:
*
- * Don't use the environment variable if the process has additional
- * privileges, unless "use_environment_priv" is configured.
+ * Don't use the environment variable if the process has additional privileges, unless
+ * "use_environment_priv" is configured.
*/
if (!__wt_has_priv())
return (0);
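
A sketch of the opt-in this check protects, with a hypothetical home path: an unprivileged process gets WIREDTIGER_CONFIG by default, while a privileged one must pass use_environment_priv in the wiredtiger_open string:

    #include <stdlib.h>
    #include <wiredtiger.h>

    int
    example_env(void)
    {
        WT_CONNECTION *conn;
        int ret;

        (void)setenv("WIREDTIGER_CONFIG", "cache_size=1GB", 1);
        /* A privileged process must explicitly trust the environment. */
        if ((ret = wiredtiger_open(
               "WT_HOME", NULL, "create,use_environment_priv=true", &conn)) != 0)
            return (ret);
        return (conn->close(conn, NULL));
    }
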
@@ -1656,14 +1630,12 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[])
is_create || exist ? WT_FS_OPEN_CREATE : 0, &conn->lock_fh);
/*
- * If this is a read-only connection and we cannot grab the lock file,
- * check if it is because there's no write permission or if the file
- * does not exist. If so, then ignore the error.
- * XXX Ignoring the error does allow multiple read-only connections to
- * exist at the same time on a read-only directory.
+ * If this is a read-only connection and we cannot grab the lock file, check if it is because
+ * there's no write permission or if the file does not exist. If so, then ignore the error. XXX
+ * Ignoring the error does allow multiple read-only connections to exist at the same time on a
+ * read-only directory.
*
- * If we got an expected permission or non-existence error then skip
- * the byte lock.
+ * If we got an expected permission or non-existence error then skip the byte lock.
*/
if (F_ISSET(conn, WT_CONN_READONLY) && (ret == EACCES || ret == ENOENT)) {
bytelock = false;
@@ -1682,15 +1654,13 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[])
"another process");
/*
- * If the size of the lock file is non-zero, we created it (or
- * won a locking race with the thread that created it, it
- * doesn't matter).
+ * If the size of the lock file is non-zero, we created it (or won a locking race with the thread
+ * that created it, it doesn't matter).
*
- * Write something into the file, zero-length files make me
- * nervous.
+ * Write something into the file, zero-length files make me nervous.
*
- * The test against the expected length is sheer paranoia (the
- * length should be 0 or correct), but it shouldn't hurt.
+ * The test against the expected length is sheer paranoia (the length should be 0 or correct), but
+ * it shouldn't hurt.
*/
#define WT_SINGLETHREAD_STRING "WiredTiger lock file\n"
WT_ERR(__wt_filesize(session, conn->lock_fh, &size));
@@ -2051,26 +2021,22 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
"# or create a WiredTiger.config file to override them."));
/*
- * The base configuration file contains all changes to default settings
- * made at create, and we include the user-configuration file in that
- * list, even though we don't expect it to change. Of course, an
- * application could leave that file as it is right now and not remove
- * a configuration we need, but applications can also guarantee all
- * database users specify consistent environment variables and
- * wiredtiger_open configuration arguments -- if we protect against
- * those problems, might as well include the application's configuration
- * file in that protection.
+ * The base configuration file contains all changes to default settings made at create, and we
+ * include the user-configuration file in that list, even though we don't expect it to change.
+ * Of course, an application could leave that file as it is right now and not remove a
+ * configuration we need, but applications can also guarantee all database users specify
+ * consistent environment variables and wiredtiger_open configuration arguments -- if we protect
+ * against those problems, might as well include the application's configuration file in that
+ * protection.
*
- * We were passed the configuration items specified by the application.
- * That list includes configuring the default settings, presumably if
- * the application configured it explicitly, that setting should survive
- * even if the default changes.
+ * We were passed the configuration items specified by the application. That list includes
+ * configuring the default settings, presumably if the application configured it explicitly,
+ * that setting should survive even if the default changes.
*
- * When writing the base configuration file, we write the version and
- * any configuration information set by the application (in other words,
- * the stack except for cfg[0]). However, some configuration values need
- * to be stripped out from the base configuration file; do that now, and
- * merge the rest to be written.
+ * When writing the base configuration file, we write the version and any configuration
+ * information set by the application (in other words, the stack except for cfg[0]). However,
+ * some configuration values need to be stripped out from the base configuration file; do that
+ * now, and merge the rest to be written.
*/
WT_ERR(__wt_config_merge(session, cfg + 1,
"compatibility=(release=),"
@@ -2337,14 +2303,13 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_ERR(__conn_config_env(session, cfg, i1));
/*
- * We need to know if configured for read-only or in-memory behavior
- * before reading/writing the filesystem. The only way the application
- * can configure that before we touch the filesystem is the wiredtiger
- * config string or the WIREDTIGER_CONFIG environment variable.
+ * We need to know if configured for read-only or in-memory behavior before reading/writing the
+ * filesystem. The only way the application can configure that before we touch the filesystem is
+ * the wiredtiger config string or the WIREDTIGER_CONFIG environment variable.
*
- * The environment isn't trusted by default, for security reasons; if
- * the application wants us to trust the environment before reading
- * the filesystem, the wiredtiger_open config string is the only way.
+ * The environment isn't trusted by default, for security reasons; if the application wants us
+ * to trust the environment before reading the filesystem, the wiredtiger_open config string is
+ * the only way.
*/
WT_ERR(__wt_config_gets(session, cfg, "in_memory", &cval));
if (cval.val != 0)
@@ -2469,14 +2434,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
/*
* Configuration ...
*
- * We can't open sessions yet, so any configurations that cause
- * sessions to be opened must be handled inside __wt_connection_open.
+ * We can't open sessions yet, so any configurations that cause sessions to be opened must be
+ * handled inside __wt_connection_open.
*
- * The error message configuration might have changed (if set in a
- * configuration file, and not in the application's configuration
- * string), get it again. Do it first, make error messages correct.
- * Ditto verbose configuration so we dump everything the application
- * wants to see.
+ * The error message configuration might have changed (if set in a configuration file, and not
+ * in the application's configuration string), get it again. Do it first, make error messages
+ * correct. Ditto verbose configuration so we dump everything the application wants to see.
*/
WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
if (cval.len != 0) {
@@ -2574,6 +2537,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
conn->mmap = cval.val != 0;
+ WT_ERR(__wt_config_gets(session, cfg, "operation_timeout_ms", &cval));
+ conn->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND);
+
WT_ERR(__wt_config_gets(session, cfg, "salvage", &cval));
if (cval.val) {
if (F_ISSET(conn, WT_CONN_READONLY))
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
index 1bb9bf887ff..f1f3bdf8ee9 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -473,9 +473,8 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
cache = entry->cache;
/*
- * Figure out a delta since the last time we did an assessment
- * for each metric we are tracking. Watch out for wrapping
- * of values.
+ * Figure out a delta since the last time we did an assessment for each metric we are
+ * tracking. Watch out for wrapping of values.
*
* Count pages read, assuming pages are 4KB.
*/
@@ -652,15 +651,12 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, uint64_t highest, uint64_t bump_th
cache->cp_quota - entry->cache_size);
}
/*
- * Bounds checking: don't go over the pool size or under the
- * reserved size for this cache.
+ * Bounds checking: don't go over the pool size or under the reserved size for this cache.
*
- * Shrink by a chunk size if that doesn't drop us
- * below the reserved size.
+ * Shrink by a chunk size if that doesn't drop us below the reserved size.
*
- * Limit the reduction to half of the free space in the
- * connection's cache. This should reduce cache sizes
- * gradually without stalling application threads.
+ * Limit the reduction to half of the free space in the connection's cache. This should
+ * reduce cache sizes gradually without stalling application threads.
*/
if (adjustment > 0) {
*adjustedp = true;
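
The pool being rebalanced here is the connection-level shared cache whose shape (chunk, name, quota, reserve, size) appears in the generated default strings earlier in this patch. A sketch with hypothetical sizes; chunk bounds each adjustment step:

    #include <wiredtiger.h>

    int
    example_shared_cache(WT_CONNECTION **connp)
    {
        /* Join a named cache pool shared across connections in this process. */
        return (wiredtiger_open("WT_HOME", NULL,
            "create,shared_cache=(name=pool,size=500MB,chunk=10MB,reserve=20MB)", connp));
    }
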
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
index 68a437be046..70b93a164c9 100644
--- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -140,8 +140,8 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn)
/*
* The checkpoint server gets its own session.
*
- * Checkpoint does enough I/O it may be called upon to perform slow
- * operations for the block manager.
+ * Checkpoint does enough I/O it may be called upon to perform slow operations for the block
+ * manager.
*/
session_flags = WT_SESSION_CAN_WAIT;
WT_RET(__wt_open_internal_session(
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 8884fa5c23b..7c60e0d8239 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -50,18 +50,15 @@ __conn_dhandle_config_set(WT_SESSION_IMPL *session)
}
/*
- * The defaults are included because persistent configuration
- * information is stored in the metadata file and it may be from an
- * earlier version of WiredTiger. If defaults are included in the
- * configuration, we can add new configuration strings without
- * upgrading the metadata file or writing special code in case a
- * configuration string isn't initialized, as long as the new
- * configuration string has an appropriate default value.
+ * The defaults are included because persistent configuration information is stored in the
+ * metadata file and it may be from an earlier version of WiredTiger. If defaults are included
+ * in the configuration, we can add new configuration strings without upgrading the metadata
+ * file or writing special code in case a configuration string isn't initialized, as long as the
+ * new configuration string has an appropriate default value.
*
- * The error handling is a little odd, but be careful: we're holding a
- * chunk of allocated memory in metaconf. If we fail before we copy a
- * reference to it into the object's configuration array, we must free
- * it, after the copy, we don't want to free it.
+ * The error handling is a little odd, but be careful: we're holding a chunk of allocated memory
+ * in metaconf. If we fail before we copy a reference to it into the object's configuration
+ * array, we must free it; after the copy, we don't want to free it.
*/
WT_ERR(__wt_calloc_def(session, 3, &dhandle->cfg));
switch (dhandle->type) {
@@ -155,12 +152,11 @@ __wt_conn_dhandle_alloc(WT_SESSION_IMPL *session, const char *uri, const char *c
WT_ERR(__wt_spin_init(session, &dhandle->close_lock, "data handle close"));
/*
- * We are holding the data handle list lock, which protects most
- * threads from seeing the new handle until that lock is released.
+ * We are holding the data handle list lock, which protects most threads from seeing the new
+ * handle until that lock is released.
*
- * However, the sweep server scans the list of handles without holding
- * that lock, so we need a write barrier here to ensure the sweep
- * server doesn't see a partially filled in structure.
+ * However, the sweep server scans the list of handles without holding that lock, so we need a
+ * write barrier here to ensure the sweep server doesn't see a partially filled in structure.
*/
WT_WRITE_BARRIER();
@@ -294,19 +290,15 @@ __wt_conn_dhandle_close(WT_SESSION_IMPL *session, bool final, bool mark_dead)
marked_dead = true;
/*
- * Flush dirty data from any durable trees we couldn't mark
- * dead. That involves writing a checkpoint, which can fail if
- * an update cannot be written, causing the close to fail: if
- * not the final close, return the EBUSY error to our caller
- * for eventual retry.
+ * Flush dirty data from any durable trees we couldn't mark dead. That involves writing a
+ * checkpoint, which can fail if an update cannot be written, causing the close to fail: if
+ * not the final close, return the EBUSY error to our caller for eventual retry.
*
- * We can't discard non-durable trees yet: first we have to
- * close the underlying btree handle, then we can mark the
- * data handle dead.
+ * We can't discard non-durable trees yet: first we have to close the underlying btree
+ * handle, then we can mark the data handle dead.
*
- * If we are closing with timestamps enforced, then we have
- * already checkpointed as of the timestamp as needed and any
- * remaining dirty data should be discarded.
+ * If we are closing with timestamps enforced, then we have already checkpointed as of the
+ * timestamp as needed and any remaining dirty data should be discarded.
*/
if (!discard && !marked_dead) {
if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP) || F_ISSET(conn, WT_CONN_IN_MEMORY) ||
@@ -407,16 +399,14 @@ __wt_conn_dhandle_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t fla
WT_RET(__wt_evict_file_exclusive_on(session));
/*
- * If the handle is already open, it has to be closed so it can be
- * reopened with a new configuration.
+ * If the handle is already open, it has to be closed so it can be reopened with a new
+ * configuration.
*
- * This call can return EBUSY if there's an update in the tree that's
- * not yet globally visible. That's not a problem because it can only
- * happen when we're switching from a normal handle to a "special" one,
- * so we're returning EBUSY to an attempt to verify or do other special
- * operations. The reverse won't happen because when the handle from a
- * verify or other special operation is closed, there won't be updates
- * in the tree that can block the close.
+ * This call can return EBUSY if there's an update in the tree that's not yet globally visible.
+ * That's not a problem because it can only happen when we're switching from a normal handle to
+ * a "special" one, so we're returning EBUSY to an attempt to verify or do other special
+ * operations. The reverse won't happen because when the handle from a verify or other special
+ * operation is closed, there won't be updates in the tree that can block the close.
*/
if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
WT_ERR(__wt_conn_dhandle_close(session, false, false));
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index 9d5e5e75041..83dbe69f4b2 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -97,13 +97,11 @@ __logmgr_version(WT_SESSION_IMPL *session, bool reconfig)
return (0);
/*
- * Set the log file format versions based on compatibility versions
- * set in the connection. We must set this before we call log_open
- * to open or create a log file.
+ * Set the log file format versions based on compatibility versions set in the connection. We
+ * must set this before we call log_open to open or create a log file.
*
- * Note: downgrade in this context means the new version is not the
- * latest possible version. It does not mean the direction of change
- * from the release we may be running currently.
+ * Note: downgrade in this context means the new version is not the latest possible version. It
+ * does not mean the direction of change from the release we may be running currently.
*/
if (conn->compat_major < WT_LOG_V2_MAJOR) {
new_version = 1;
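
The version selection is driven by the connection's compatibility setting; a sketch, with hypothetical home path and release value, that would land in the pre-3.0 branch above and keep version 1 log files:

    #include <wiredtiger.h>

    int
    example_compat_v1_logs(WT_CONNECTION **connp)
    {
        /* A compatibility release older than 3.0 selects the V1 log format. */
        return (wiredtiger_open("WT_HOME", NULL,
            "create,log=(enabled),compatibility=(release=\"2.9\")", connp));
    }
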
@@ -148,15 +146,12 @@ __logmgr_version(WT_SESSION_IMPL *session, bool reconfig)
if (log->log_version == new_version)
return (0);
/*
- * If we are reconfiguring and at a new version we need to force
- * the log file to advance so that we write out a log file at the
- * correct version. When we are downgrading we must force a checkpoint
- * and finally archive, even if disabled, so that all new version log
- * files are gone.
+ * If we are reconfiguring and at a new version we need to force the log file to advance so that
+ * we write out a log file at the correct version. When we are downgrading we must force a
+ * checkpoint and finally archive, even if disabled, so that all new version log files are gone.
*
- * All of the version changes must be handled with locks on reconfigure
- * because other threads may be changing log files, using pre-allocated
- * files.
+ * All of the version changes must be handled with locks on reconfigure because other threads
+ * may be changing log files, using pre-allocated files.
*/
/*
* Set the version. If it is a live change the logging subsystem will do other work as well to
@@ -180,22 +175,20 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec
bool enabled;
/*
- * A note on reconfiguration: the standard "is this configuration string
- * allowed" checks should fail if reconfiguration has invalid strings,
- * for example, "log=(enabled)", or "statistics_log=(path=XXX)", because
- * the connection reconfiguration method doesn't allow those strings.
- * Additionally, the base configuration values during reconfiguration
- * are the currently configured values (so we don't revert to default
- * values when repeatedly reconfiguring), and configuration processing
- * of a currently set value should not change the currently set value.
+ * A note on reconfiguration: the standard "is this configuration string allowed" checks should
+ * fail if reconfiguration has invalid strings, for example, "log=(enabled)", or
+ * "statistics_log=(path=XXX)", because the connection reconfiguration method doesn't allow
+ * those strings. Additionally, the base configuration values during reconfiguration are the
+ * currently configured values (so we don't revert to default values when repeatedly
+ * reconfiguring), and configuration processing of a currently set value should not change the
+ * currently set value.
*
- * In this code path, log server reconfiguration does not stop/restart
- * the log server, so there's no point in re-evaluating configuration
- * strings that cannot be reconfigured, risking bugs in configuration
- * setup, and depending on evaluation of currently set values to always
- * result in the currently set value. Skip tests for any configuration
- * strings which don't make sense during reconfiguration, but don't
- * worry about error reporting because it should never happen.
+ * In this code path, log server reconfiguration does not stop/restart the log server, so
+ * there's no point in re-evaluating configuration strings that cannot be reconfigured, risking
+ * bugs in configuration setup, and depending on evaluation of currently set values to always
+ * result in the currently set value. Skip tests for any configuration strings which don't make
+ * sense during reconfiguration, but don't worry about error reporting because it should never
+ * happen.
*/
conn = S2C(session);
@@ -204,11 +197,10 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec
enabled = cval.val != 0;
/*
- * If we're reconfiguring, enabled must match the already
- * existing setting.
+ * If we're reconfiguring, enabled must match the already existing setting.
*
- * If it is off and the user it turning it on, or it is on
- * and the user is turning it off, return an error.
+ * If it is off and the user is turning it on, or it is on and the user is turning it off,
+ * return an error.
*
* See above: should never happen.
*/
@@ -230,9 +222,8 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec
*runp = enabled;
/*
- * Setup a log path and compression even if logging is disabled in case
- * we are going to print a log. Only do this on creation. Once a
- * compressor or log path are set they cannot be changed.
+ * Set up a log path and compression even if logging is disabled in case we are going to print a
+ * log. Only do this on creation. Once a compressor or log path are set they cannot be changed.
*
* See above: should never happen.
*/
@@ -254,9 +245,9 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec
FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE);
/*
- * The file size cannot be reconfigured. The amount of memory allocated
- * to the log slots may be based on the log file size at creation and we
- * don't want to re-allocate that memory while running.
+ * The file size cannot be reconfigured. The amount of memory allocated to the log slots may be
+ * based on the log file size at creation and we don't want to re-allocate that memory while
+ * running.
*
* See above: should never happen.
*/
@@ -286,8 +277,8 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec
conn->log_prealloc = 1;
/*
- * Note it's meaningless to reconfigure this value during runtime, it
- * only matters on create before recovery runs.
+ * Note it's meaningless to reconfigure this value during runtime, it only matters on create
+ * before recovery runs.
*
* See above: should never happen.
*/
@@ -598,13 +589,11 @@ __log_file_server(void *arg)
*/
min_lsn = log->write_lsn;
/*
- * We have to wait until the LSN we asked for is
- * written. If it isn't signal the wrlsn thread
- * to get it written.
+ * We have to wait until the LSN we asked for is written. If it isn't, signal the wrlsn
+ * thread to get it written.
*
- * We also have to wait for the written LSN and the
- * sync LSN to be in the same file so that we know we
- * have synchronized all earlier log files.
+ * We also have to wait for the written LSN and the sync LSN to be in the same file so
+ * that we know we have synchronized all earlier log files.
*/
if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
/*
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index fc352bbf821..f7e338ac9bb 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -82,10 +82,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
F_SET(conn, WT_CONN_CLOSING);
WT_FULL_BARRIER();
+ /* The default session is used to access data handles during close. */
+ F_CLR(session, WT_SESSION_NO_DATA_HANDLES);
+
/*
- * Shut down server threads other than the eviction server, which is needed later to close btree
- * handles. Some of these threads access btree handles, so take care in ordering shutdown to
- * make sure they exit before files are closed.
+ * Shut down server threads. Some of these threads access btree handles and eviction; shut them
+ * down before the eviction server, and shut all servers down before closing open data handles.
*/
WT_TRET(__wt_capacity_server_destroy(session));
WT_TRET(__wt_checkpoint_server_destroy(session));
diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c
index 3cc46618a4a..315ae099c51 100644
--- a/src/third_party/wiredtiger/src/conn/conn_reconfig.c
+++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c
@@ -397,30 +397,27 @@ __wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg)
F_SET(conn, WT_CONN_RECONFIGURING);
/*
- * The configuration argument has been checked for validity, update the
- * previous connection configuration.
+ * The configuration argument has been checked for validity, update the previous connection
+ * configuration.
*
- * DO NOT merge the configuration before the reconfigure calls. Some
- * of the underlying reconfiguration functions do explicit checks with
- * the second element of the configuration array, knowing the defaults
- * are in slot #1 and the application's modifications are in slot #2.
+ * DO NOT merge the configuration before the reconfigure calls. Some of the underlying
+ * reconfiguration functions do explicit checks with the second element of the configuration
+ * array, knowing the defaults are in slot #1 and the application's modifications are in slot
+ * #2.
*
- * Replace the base configuration set up by CONNECTION_API_CALL with
- * the current connection configuration, otherwise reconfiguration
- * functions will find the base value instead of previously configured
- * value.
+ * Replace the base configuration set up by CONNECTION_API_CALL with the current connection
+ * configuration, otherwise reconfiguration functions will find the base value instead of
+ * previously configured value.
*/
cfg[0] = conn->cfg;
/*
* Reconfigure the system.
*
- * The compatibility version check is special: upgrade / downgrade
- * cannot be done with transactions active, and checkpoints must not
- * span a version change. Hold the checkpoint lock to avoid conflicts
- * with WiredTiger's checkpoint thread, and rely on the documentation
- * specifying that no new operations can start until the upgrade /
- * downgrade completes.
+ * The compatibility version check is special: upgrade / downgrade cannot be done with
+ * transactions active, and checkpoints must not span a version change. Hold the checkpoint lock
+ * to avoid conflicts with WiredTiger's checkpoint thread, and rely on the documentation
+ * specifying that no new operations can start until the upgrade / downgrade completes.
*/
WT_WITH_CHECKPOINT_LOCK(session, ret = __wt_conn_compat_config(session, cfg, true));
WT_ERR(ret);
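
As an illustration of the slot convention that comment depends on (configuration strings hypothetical): a stack is a NULL-terminated array where later entries override earlier ones, so merging early would hide the per-slot distinction the reconfigure functions inspect:

    /* Slot #1 holds the base/previous configuration, slot #2 the application's changes. */
    static const char *cfg_stack[] = {
        "cache_size=100MB,eviction=(threads_max=8)", /* slot #1 */
        "eviction=(threads_max=4)",                  /* slot #2: overrides threads_max */
        NULL};
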
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
index 24397ed0666..4649fc9ef4d 100644
--- a/src/third_party/wiredtiger/src/conn/conn_stat.c
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -101,19 +101,17 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp)
char **sources;
/*
- * A note on reconfiguration: the standard "is this configuration string
- * allowed" checks should fail if reconfiguration has invalid strings,
- * for example, "log=(enabled)", or "statistics_log=(path=XXX)", because
- * the connection reconfiguration method doesn't allow those strings.
- * Additionally, the base configuration values during reconfiguration
- * are the currently configured values (so we don't revert to default
- * values when repeatedly reconfiguring), and configuration processing
- * of a currently set value should not change the currently set value.
+ * A note on reconfiguration: the standard "is this configuration string allowed" checks should
+ * fail if reconfiguration has invalid strings, for example, "log=(enabled)", or
+ * "statistics_log=(path=XXX)", because the connection reconfiguration method doesn't allow
+ * those strings. Additionally, the base configuration values during reconfiguration are the
+ * currently configured values (so we don't revert to default values when repeatedly
+ * reconfiguring), and configuration processing of a currently set value should not change the
+ * currently set value.
*
- * In this code path, a previous statistics log server reconfiguration
- * may have stopped the server (and we're about to restart it). Because
- * stopping the server discarded the configured information stored in
- * the connection structure, we have to re-evaluate all configuration
+ * In this code path, a previous statistics log server reconfiguration may have stopped the
+ * server (and we're about to restart it). Because stopping the server discarded the configured
+ * information stored in the connection structure, we have to re-evaluate all configuration
* values, reconfiguration can't skip any of them.
*/
@@ -336,8 +334,8 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats)
/*
* Open the statistics cursor and dump the statistics.
*
- * If we don't find an underlying object, silently ignore it, the object
- * may exist only intermittently.
+ * If we don't find an underlying object, silently ignore it, the object may exist only
+ * intermittently.
*/
if ((ret = __wt_curstat_open(session, uri, NULL, cfg, &cursor)) != 0) {
if (ret == EBUSY || ret == ENOENT || ret == WT_NOTFOUND)
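
A sketch of the same open-and-ignore pattern from the application side, assuming a hypothetical statistics URI and a connection opened with statistics enabled:

    #include <errno.h>
    #include <wiredtiger.h>

    static int
    dump_stats(WT_SESSION *session)
    {
        WT_CURSOR *cursor;
        const char *desc, *pvalue;
        int64_t value;
        int ret;

        ret = session->open_cursor(session, "statistics:table:access", NULL, NULL, &cursor);
        if (ret == EBUSY || ret == ENOENT || ret == WT_NOTFOUND)
            return (0); /* The object may exist only intermittently. */
        if (ret != 0)
            return (ret);
        while ((ret = cursor->next(cursor)) == 0)
            if ((ret = cursor->get_value(cursor, &desc, &pvalue, &value)) != 0)
                break;
        (void)cursor->close(cursor);
        return (ret == WT_NOTFOUND ? 0 : ret);
    }
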
@@ -420,18 +418,13 @@ __statlog_lsm_apply(WT_SESSION_IMPL *session)
cnt = locked = 0;
/*
- * Walk the list of LSM trees, checking for a match on the set of
- * sources.
+ * Walk the list of LSM trees, checking for a match on the set of sources.
*
- * XXX
- * We can't hold the schema lock for the traversal because the LSM
- * statistics code acquires the tree lock, and the LSM cursor code
- * acquires the tree lock and then acquires the schema lock, it's a
- * classic deadlock. This is temporary code so I'm not going to do
- * anything fancy.
- * It is OK to not keep holding the schema lock after populating
- * the list of matching LSM trees, since the __wt_lsm_tree_get call
- * will bump a reference count, so the tree won't go away.
+ * XXX We can't hold the schema lock for the traversal because the LSM statistics code acquires
+ * the tree lock, and the LSM cursor code acquires the tree lock and then acquires the schema
+ * lock, it's a classic deadlock. This is temporary code so I'm not going to do anything fancy.
+ * It is OK to not keep holding the schema lock after populating the list of matching LSM trees,
+ * since the __wt_lsm_tree_get call will bump a reference count, so the tree won't go away.
*/
__wt_spin_lock(session, &S2C(session)->schema_lock);
locked = true;
@@ -512,12 +505,9 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
WT_RET(__wt_conn_btree_apply(session, NULL, __statlog_apply, NULL, NULL));
/*
- * Walk the list of open LSM trees, dumping any that match the
- * the list of object sources.
+ * Walk the list of open LSM trees, dumping any that match the list of object sources.
*
- * XXX
- * This code should be removed when LSM objects are converted to
- * data handles.
+ * XXX This code should be removed when LSM objects are converted to data handles.
*/
if (conn->stat_sources != NULL)
WT_RET(__statlog_lsm_apply(session));
@@ -584,11 +574,11 @@ __statlog_server(void *arg)
WT_CLEAR(tmp);
/*
- * We need a temporary place to build a path and an entry prefix.
- * The length of the path plus 128 should be more than enough.
+ * We need a temporary place to build a path and an entry prefix. The length of the path plus
+ * 128 should be more than enough.
*
- * We also need a place to store the current path, because that's
- * how we know when to close/re-open the file.
+ * We also need a place to store the current path, because that's how we know when to
+ * close/re-open the file.
*/
WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128));
WT_ERR(__wt_buf_setstr(session, &path, ""));
@@ -640,12 +630,11 @@ __statlog_start(WT_CONNECTION_IMPL *conn)
/*
* Start the thread.
*
- * Statistics logging creates a thread per database, rather than using
- * a single thread to do logging for all of the databases. If we ever
- * see lots of databases at a time, doing statistics logging, and we
- * want to reduce the number of threads, there's no reason we have to
- * have more than one thread, I just didn't feel like writing the code
- * to figure out the scheduling.
+ * Statistics logging creates a thread per database, rather than using a single thread to do
+ * logging for all of the databases. If we ever see lots of databases at a time, doing
+ * statistics logging, and we want to reduce the number of threads, there's no reason we have to
+ * have more than one thread, I just didn't feel like writing the code to figure out the
+ * scheduling.
*/
WT_RET(__wt_thread_create(session, &conn->stat_tid, __statlog_server, session));
conn->stat_tid_set = true;
@@ -666,17 +655,15 @@ __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
/*
- * Stop any server that is already running. This means that each time
- * reconfigure is called we'll bounce the server even if there are no
- * configuration changes. This makes our life easier as the underlying
- * configuration routine doesn't have to worry about freeing objects
- * in the connection structure (it's guaranteed to always start with a
- * blank slate), and we don't have to worry about races where a running
- * server is reading configuration information that we're updating, and
- * it's not expected that reconfiguration will happen a lot.
+ * Stop any server that is already running. This means that each time reconfigure is called
+ * we'll bounce the server even if there are no configuration changes. This makes our life
+ * easier as the underlying configuration routine doesn't have to worry about freeing objects in
+ * the connection structure (it's guaranteed to always start with a blank slate), and we don't
+ * have to worry about races where a running server is reading configuration information that
+ * we're updating, and it's not expected that reconfiguration will happen a lot.
*
- * If there's no server running, discard any configuration information
- * so we don't leak memory during reconfiguration.
+ * If there's no server running, discard any configuration information so we don't leak memory
+ * during reconfiguration.
*/
if (conn->stat_session == NULL)
WT_RET(__stat_config_discard(session));
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index a9c3775ae39..b762a4d8f42 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -89,8 +89,8 @@ __sweep_expire_one(WT_SESSION_IMPL *session)
/*
* Mark the handle dead and close the underlying handle.
*
- * For btree handles, closing the handle decrements the open file
- * count, meaning the close loop won't overrun the configured minimum.
+ * For btree handles, closing the handle decrements the open file count, meaning the close loop
+ * won't overrun the configured minimum.
*/
ret = __wt_conn_dhandle_close(session, false, true);
@@ -299,15 +299,13 @@ __sweep_server(void *arg)
__wt_seconds(session, &now);
/*
- * Sweep the lookaside table. If the lookaside table hasn't yet
- * been written, there's no work to do.
+ * Sweep the lookaside table. If the lookaside table hasn't yet been written, there's no
+ * work to do.
*
- * Don't sweep the lookaside table if the cache is stuck full.
- * The sweep uses the cache and can exacerbate the problem.
- * If we try to sweep when the cache is full or we aren't
- * making progress in eviction, sweeping can wind up constantly
- * bringing in and evicting pages from the lookaside table,
- * which will stop the cache from moving into the stuck state.
+ * Don't sweep the lookaside table if the cache is stuck full. The sweep uses the cache and
+ * can exacerbate the problem. If we try to sweep when the cache is full or we aren't making
+ * progress in eviction, sweeping can wind up constantly bringing in and evicting pages from
+ * the lookaside table, which will stop the cache from moving into the stuck state.
*/
if ((FLD_ISSET(conn->timing_stress_flags, WT_TIMING_STRESS_AGGRESSIVE_SWEEP) ||
now - last >= WT_LAS_SWEEP_SEC) &&
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index 656cb3ac3a1..4869bcb3b71 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -242,20 +242,16 @@ __backup_start(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, bool is_dup, cons
if (!is_dup) {
/*
- * The hot backup copy is done outside of WiredTiger, which
- * means file blocks can't be freed and re-allocated until the
- * backup completes. The checkpoint code checks the backup flag,
- * and if a backup cursor is open checkpoints aren't discarded.
- * We release the lock as soon as we've set the flag, we don't
- * want to block checkpoints, we just want to make sure no
- * checkpoints are deleted. The checkpoint code holds the lock
- * until it's finished the checkpoint, otherwise we could start
- * a hot backup that would race with an already-started
+ * The hot backup copy is done outside of WiredTiger, which means file blocks can't be freed
+ * and re-allocated until the backup completes. The checkpoint code checks the backup flag,
+ * and if a backup cursor is open checkpoints aren't discarded. We release the lock as soon
+ * as we've set the flag, we don't want to block checkpoints, we just want to make sure no
+ * checkpoints are deleted. The checkpoint code holds the lock until it's finished the
+ * checkpoint, otherwise we could start a hot backup that would race with an already-started
* checkpoint.
*
- * We are holding the checkpoint and schema locks so schema
- * operations will not see the backup file list until it is
- * complete and valid.
+ * We are holding the checkpoint and schema locks so schema operations will not see the
+ * backup file list until it is complete and valid.
*/
WT_WITH_HOTBACKUP_WRITE_LOCK(session, WT_CONN_HOTBACKUP_START(conn));
@@ -313,15 +309,13 @@ __backup_start(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, bool is_dup, cons
/* Add the hot backup and standard WiredTiger files to the list. */
if (log_only) {
/*
- * If this is not a duplicate cursor, using the log target is an
- * incremental backup. If this is a duplicate cursor then using
- * the log target on an existing backup cursor means this cursor
- * returns the current list of log files. That list was set up
- * when parsing the URI so we don't have anything to do here.
+ * If this is not a duplicate cursor, using the log target is an incremental backup. If this
+ * is a duplicate cursor then using the log target on an existing backup cursor means this
+ * cursor returns the current list of log files. That list was set up when parsing the URI
+ * so we don't have anything to do here.
*
- * We also open an incremental backup source file so that we can
- * detect a crash with an incremental backup existing in the
- * source directory versus an improper destination.
+ * We also open an incremental backup source file so that we can detect a crash with an
+ * incremental backup existing in the source directory versus an improper destination.
*/
dest = WT_INCREMENTAL_BACKUP;
WT_ERR(__wt_fopen(session, WT_INCREMENTAL_SRC, WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &srcfs));
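
A sketch of the cursor usage this code backs, with hypothetical copy logic: a full backup cursor lists the files to copy, and duplicating it with the log target returns just the current log files, as described in the comment above:

    #include <wiredtiger.h>

    static int
    backup_file_list(WT_SESSION *session)
    {
        WT_CURSOR *backup, *logs;
        const char *filename;
        int ret;

        if ((ret = session->open_cursor(session, "backup:", NULL, NULL, &backup)) != 0)
            return (ret);
        while ((ret = backup->next(backup)) == 0) {
            if ((ret = backup->get_key(backup, &filename)) != 0)
                break;
            /* Copy "filename" out of the database directory here. */
        }
        if (ret != WT_NOTFOUND)
            goto err;
        ret = 0;

        /* Duplicate cursor with the log target: just the current log files. */
        if ((ret = session->open_cursor(session, NULL, backup, "target=(\"log:\")", &logs)) != 0)
            goto err;
        while ((ret = logs->next(logs)) == 0)
            ;
        if (ret == WT_NOTFOUND)
            ret = 0;
        (void)logs->close(logs);
    err:
        (void)backup->close(backup);
        return (ret);
    }
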
diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c
index bf90ad7238e..84a39e9292d 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_ds.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c
@@ -87,15 +87,13 @@ __curds_cursor_resolve(WT_CURSOR *cursor, int ret)
source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
/*
- * Update the cursor's key, value and flags. (We use the _INT flags in
- * the same way as file objects: there's some chance the underlying data
- * source is passing us a reference to data only pinned per operation,
- * might as well be safe.)
+ * Update the cursor's key, value and flags. (We use the _INT flags in the same way as file
+ * objects: there's some chance the underlying data source is passing us a reference to data
+ * only pinned per operation, might as well be safe.)
*
- * There's also a requirement the underlying data-source never returns
- * with the cursor/source key referencing application memory: it'd be
- * great to do a copy as necessary here so the data-source doesn't have
- * to worry about copying the key, but we don't have enough information
+ * There's also a requirement the underlying data-source never returns with the cursor/source
+ * key referencing application memory: it'd be great to do a copy as necessary here so the
+ * data-source doesn't have to worry about copying the key, but we don't have enough information
* to know if a cursor is pointing at application or data-source memory.
*/
if (ret == 0) {
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
index 8ab7c58f263..e675392939c 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_index.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -222,15 +222,14 @@ __curindex_search(WT_CURSOR *cursor)
WT_ERR(child->next(child));
/*
- * We expect partial matches, and want the smallest record with a key
- * greater than or equal to the search key.
+ * We expect partial matches, and want the smallest record with a key greater than or equal to
+ * the search key.
*
- * If the key we find is shorter than the search key, it can't possibly
- * match.
+ * If the key we find is shorter than the search key, it can't possibly match.
*
- * The only way for the key to be exactly equal is if there is an index
- * on the primary key, because otherwise the primary key columns will
- * be appended to the index key, but we don't disallow that (odd) case.
+ * The only way for the key to be exactly equal is if there is an index on the primary key,
+ * because otherwise the primary key columns will be appended to the index key, but we don't
+ * disallow that (odd) case.
*/
found_key = child->key;
if (found_key.size < cursor->key.size)
@@ -301,14 +300,14 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact)
}
/*
- * We expect partial matches, and want the smallest record with a key
- * greater than or equal to the search key.
+ * We expect partial matches, and want the smallest record with a key greater than or equal to
+ * the search key.
*
- * If the found key starts with the search key, we indicate a match by
- * setting exact equal to zero.
+ * If the found key starts with the search key, we indicate a match by setting exact equal to
+ * zero.
*
- * The compare function expects application-supplied keys to come first
- * so we flip the sign of the result to match what callers expect.
+ * The compare function expects application-supplied keys to come first so we flip the sign of
+ * the result to match what callers expect.
*/
found_key = child->key;
if (found_key.size > cursor->key.size) {
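
A sketch of how those semantics surface through the API, assuming a hypothetical index with an integer key format:

    #include <wiredtiger.h>

    static int
    probe_index(WT_SESSION *session)
    {
        WT_CURSOR *idx;
        int exact, ret;

        if ((ret = session->open_cursor(session, "index:people:age", NULL, NULL, &idx)) != 0)
            return (ret);
        idx->set_key(idx, 30);
        if ((ret = idx->search_near(idx, &exact)) == 0) {
            /* exact == 0: prefix match; > 0: larger key found; < 0: smaller key found. */
        }
        (void)idx->close(idx);
        return (ret);
    }
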
diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c
index c58e032cb80..5b2dc711a7e 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_join.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_join.c
@@ -590,14 +590,12 @@ __curjoin_entry_member(
if (entry->bloom != NULL) {
/*
- * If the item is not in the Bloom filter, we return
- * immediately, otherwise, we still may need to check the
- * long way, since it may be a false positive.
+ * If the item is not in the Bloom filter, we return immediately, otherwise, we still may
+ * need to check the long way, since it may be a false positive.
*
- * If we don't own the Bloom filter, we must be sharing one
- * in a previous entry. So the shared filter has already
- * been checked and passed, we don't need to check it again.
- * We'll still need to check the long way.
+ * If we don't own the Bloom filter, we must be sharing one in a previous entry. So the
+ * shared filter has already been checked and passed, we don't need to check it again. We'll
+ * still need to check the long way.
*/
if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
WT_ERR(__wt_bloom_inmem_get(entry->bloom, key));
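
A sketch of requesting the Bloom filter from the application side, with hypothetical table, index, and sizing; a filter miss rejects a key immediately, while a hit still falls through to the "long way" check described above:

    #include <wiredtiger.h>

    static int
    join_with_bloom(WT_SESSION *session)
    {
        WT_CURSOR *join_cursor, *ref_cursor;
        int ret;

        if ((ret = session->open_cursor(
               session, "join:table:people", NULL, NULL, &join_cursor)) != 0)
            return (ret);
        if ((ret = session->open_cursor(
               session, "index:people:age", NULL, NULL, &ref_cursor)) != 0) {
            (void)join_cursor->close(join_cursor);
            return (ret);
        }
        /* Position the endpoint, then join it with a ~1000-entry Bloom filter. */
        ref_cursor->set_key(ref_cursor, 30);
        if ((ret = ref_cursor->search(ref_cursor)) == 0)
            ret = session->join(session, join_cursor, ref_cursor,
                "compare=ge,strategy=bloom,count=1000");
        while (ret == 0 && (ret = join_cursor->next(join_cursor)) == 0)
            ;
        if (ret == WT_NOTFOUND)
            ret = 0;
        (void)join_cursor->close(join_cursor);
        return (ret);
    }
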
diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
index 9933122f13c..14e295cddd5 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_metadata.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
@@ -264,13 +264,11 @@ __curmetadata_next(WT_CURSOR *cursor)
WT_ERR(__curmetadata_metadata_search(session, cursor));
else {
/*
- * When applications open metadata cursors, they expect to see
- * all schema-level operations reflected in the results. Query
- * at read-uncommitted to avoid confusion caused by the current
- * transaction state.
+ * When applications open metadata cursors, they expect to see all schema-level operations
+ * reflected in the results. Query at read-uncommitted to avoid confusion caused by the
+ * current transaction state.
*
- * Don't exit from the scan if we find an incomplete entry:
- * just skip over it.
+ * Don't exit from the scan if we find an incomplete entry: just skip over it.
*/
for (;;) {
WT_WITH_TXN_ISOLATION(
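
A sketch of the cursor those semantics serve, listing every object recorded in the metadata:

    #include <wiredtiger.h>

    static int
    list_metadata(WT_SESSION *session)
    {
        WT_CURSOR *cursor;
        const char *key, *value;
        int ret;

        if ((ret = session->open_cursor(session, "metadata:", NULL, NULL, &cursor)) != 0)
            return (ret);
        while ((ret = cursor->next(cursor)) == 0)
            if (cursor->get_key(cursor, &key) == 0 && cursor->get_value(cursor, &value) == 0) {
                /* "key" is an object URI, "value" its stored configuration. */
            }
        (void)cursor->close(cursor);
        return (ret == WT_NOTFOUND ? 0 : ret);
    }
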
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index fa2b52d254d..6140b453f86 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -980,17 +980,15 @@ __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor)
WT_ITEM key;
/*
- * Get a copy of the cursor's raw key, and set it in the new cursor,
- * then search for that key to position the cursor.
+ * Get a copy of the cursor's raw key, and set it in the new cursor, then search for that key to
+ * position the cursor.
*
- * We don't clear the WT_ITEM structure: all that happens when getting
- * and setting the key is the data/size fields are reset to reference
- * the original cursor's key.
+ * We don't clear the WT_ITEM structure: all that happens when getting and setting the key is
+ * that the data/size fields are reset to reference the original cursor's key.
*
- * That said, we're playing games with the cursor flags: setting the key
- * sets the key/value application-set flags in the new cursor, which may
- * or may not be correct, but there's nothing simple that fixes it. We
- * depend on the subsequent cursor search to clean things up, as search
+ * That said, we're playing games with the cursor flags: setting the key sets the key/value
+ * application-set flags in the new cursor, which may or may not be correct, but there's nothing
+ * simple that fixes it. We depend on the subsequent cursor search to clean things up, as search
* is required to copy and/or reference private memory after success.
*/
WT_RET(__wt_cursor_get_raw_key(to_dup, &key));
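At the public API level the same positioning is available by passing an existing, positioned cursor as the to_dup argument of WT_SESSION.open_cursor (a sketch, error handling omitted):

    WT_CURSOR *orig, *dup;
    /* ... orig is open and positioned on a record ... */
    ret = session->open_cursor(session, NULL, orig, NULL, &dup);
    /* dup is now positioned on the same record via this copy-key-then-search path. */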
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
index fdf10a558a4..94acee0592e 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_table.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -495,11 +495,10 @@ __curtable_insert(WT_CURSOR *cursor)
/*
* Split out the first insert, it may be allocating a recno.
*
- * If the table has indices, we also need to know whether this record
- * is replacing an existing record so that the existing index entries
- * can be removed. We discover if this is an overwrite by configuring
- * the primary cursor for no-overwrite, and checking if the insert
- * detects a duplicate key.
+ * If the table has indices, we also need to know whether this record is replacing an existing
+ * record so that the existing index entries can be removed. We discover if this is an overwrite
+ * by configuring the primary cursor for no-overwrite, and checking if the insert detects a
+ * duplicate key.
*/
cp = ctable->cg_cursors;
primary = *cp++;
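The duplicate-detection trick is the same one visible at the public API: with "overwrite=false" configured on a cursor, insert reports an existing key rather than replacing it (a sketch; key and value formats are placeholders):

    cursor->set_key(cursor, key);
    cursor->set_value(cursor, value);
    if ((ret = cursor->insert(cursor)) == WT_DUPLICATE_KEY) {
        /* The record already exists: old index entries must be removed before updating. */
    }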
@@ -675,12 +674,12 @@ __curtable_reserve(WT_CURSOR *cursor)
JOINABLE_CURSOR_UPDATE_API_CALL(cursor, session, update);
/*
- * We don't have to open the indices here, but it makes the code similar
- * to other cursor functions, and it's odd for a reserve call to succeed
- * but the subsequent update fail opening indices.
+ * We don't have to open the indices here, but it makes the code similar to other cursor
+ * functions, and it's odd for a reserve call to succeed but the subsequent update to fail
+ * opening the indices.
*
- * Check for a transaction before index open, opening the indices will
- * start a transaction if one isn't running.
+ * Check for a transaction before opening the indices: opening them will start a transaction if
+ * one isn't running.
*/
WT_ERR(__wt_txn_context_check(session, true));
WT_ERR(__curtable_open_indices(ctable));
@@ -731,10 +730,9 @@ __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop)
/*
* Step through the cursor range, removing the index entries.
*
- * If there are indices, copy the key we're using to step through the
- * cursor range (so we can reset the cursor to its original position),
- * then remove all of the index records in the truncated range. Copy
- * the raw key because the memory is only valid until the cursor moves.
+ * If there are indices, copy the key we're using to step through the cursor range (so we can
+ * reset the cursor to its original position), then remove all of the index records in the
+ * truncated range. Copy the raw key because the memory is only valid until the cursor moves.
*/
if (ctable->table->nindices > 0) {
if (start == NULL) {
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index 7f916ca4a1e..b8ec59372dc 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -49,24 +49,20 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
page = ref->page;
/*
- * Eviction can fail when a page in the evicted page's subtree
- * switches state. For example, if we don't evict a page marked
- * empty, because we expect it to be merged into its parent, it
- * might no longer be empty after it's reconciled, in which case
- * eviction of its parent would fail. We can either walk the
- * tree multiple times (until it's finally empty), or reconcile
- * each page to get it to its final state before considering if
- * it's an eviction target or will be merged into its parent.
+ * Eviction can fail when a page in the evicted page's subtree switches state. For example,
+ * if we don't evict a page marked empty, because we expect it to be merged into its parent,
+ * it might no longer be empty after it's reconciled, in which case eviction of its parent
+ * would fail. We can either walk the tree multiple times (until it's finally empty), or
+ * reconcile each page to get it to its final state before considering if it's an eviction
+ * target or will be merged into its parent.
*
- * Don't limit this test to any particular page type, that tends
- * to introduce bugs when the reconciliation of other page types
- * changes, and there's no advantage to doing so.
+ * Don't limit this test to any particular page type: that tends to introduce bugs when the
+ * reconciliation of other page types changes, and there's no advantage to doing so.
*
- * Eviction can also fail because an update cannot be written.
- * If sessions have disjoint sets of files open, updates in a
- * no-longer-referenced file may not yet be globally visible,
- * and the write will fail with EBUSY. Our caller handles that
- * error, retrying later.
+ * Eviction can also fail because an update cannot be written. If sessions have disjoint
+ * sets of files open, updates in a no-longer-referenced file may not yet be globally
+ * visible, and the write will fail with EBUSY. Our caller handles that error, retrying
+ * later.
*/
if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
WT_ERR(__wt_reconcile(session, ref, NULL, WT_REC_EVICT | WT_REC_VISIBLE_ALL, NULL));
@@ -85,8 +81,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
/*
* Evict the page.
*
- * Ensure the ref state is restored to the previous
- * value if eviction fails.
+ * Ensure the ref state is restored to the previous value if eviction fails.
*/
WT_ERR(__wt_evict(session, ref, ref->state, WT_EVICT_CALL_CLOSING));
break;
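The caller's retry contract mentioned above is the usual internal pattern (a sketch; the backoff interval is illustrative):

    for (;;) {
        if ((ret = __wt_evict_file(session, syncop)) != EBUSY)
            break;
        __wt_sleep(0, 100 * WT_THOUSAND); /* Updates not yet globally visible: back off, retry. */
    }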
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index c224a3b7b11..2f9f3220106 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -277,10 +277,12 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
conn = S2C(session);
cache = conn->cache;
- /*
- * The thread group code calls us repeatedly. So each call is one pass through eviction.
- */
- WT_TRACK_TIME(session);
+ /*
+ * The thread group code calls us repeatedly. So each call is one pass through eviction.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ __wt_seconds32(session, &session->op_5043_seconds);
+#endif
if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) {
/*
* Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We
@@ -426,15 +428,14 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work)
return (0);
#endif
/*
- * If we're stuck for 5 minutes in diagnostic mode, or the verbose
- * evict_stuck flag is configured, log the cache and transaction state.
+ * If we're stuck for 5 minutes in diagnostic mode, or the verbose evict_stuck flag is
+ * configured, log the cache and transaction state.
*
* If we're stuck for 5 minutes in diagnostic mode, give up.
*
- * We don't do this check for in-memory workloads because application
- * threads are not blocked by the cache being full. If the cache becomes
- * full of clean pages, we can be servicing reads while the cache
- * appears stuck to eviction.
+ * We don't do this check for in-memory workloads because application threads are not blocked by
+ * the cache being full. If the cache becomes full of clean pages, we can be servicing reads
+ * while the cache appears stuck to eviction.
*/
if (F_ISSET(conn, WT_CONN_IN_MEMORY))
return (0);
@@ -578,8 +579,7 @@ __evict_update_work(WT_SESSION_IMPL *session)
/*
* If we need space in the cache, try to find clean pages to evict.
*
- * Avoid division by zero if the cache size has not yet been set in a
- * shared cache.
+ * Avoid division by zero if the cache size has not yet been set in a shared cache.
*/
bytes_max = conn->cache_size + 1;
bytes_inuse = __wt_cache_bytes_inuse(cache);
@@ -679,14 +679,12 @@ __evict_pass(WT_SESSION_IMPL *session)
++cache->evict_pass_gen;
/*
- * Update the oldest ID: we use it to decide whether pages are
- * candidates for eviction. Without this, if all threads are
- * blocked after a long-running transaction (such as a
+ * Update the oldest ID: we use it to decide whether pages are candidates for eviction.
+ * Without this, if all threads are blocked after a long-running transaction (such as a
* checkpoint) completes, we may never start evicting again.
*
- * Do this every time the eviction server wakes up, regardless
- * of whether the cache is full, to prevent the oldest ID
- * falling too far behind. Don't wait to lock the table: with
+ * Do this every time the eviction server wakes up, regardless of whether the cache is full,
+ * to prevent the oldest ID falling too far behind. Don't wait to lock the table: with
* highly threaded workloads, that creates a bottleneck.
*/
WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT));
@@ -702,14 +700,12 @@ __evict_pass(WT_SESSION_IMPL *session)
WT_RET(__evict_lru_walk(session));
/*
- * If the queue has been empty recently, keep queuing more
- * pages to evict. If the rate of queuing pages is high
- * enough, this score will go to zero, in which case the
- * eviction server might as well help out with eviction.
+ * If the queue has been empty recently, keep queuing more pages to evict. If the rate of
+ * queuing pages is high enough, this score will go to zero, in which case the eviction
+ * server might as well help out with eviction.
*
- * Also, if there is a single eviction server thread with no
- * workers, it must service the urgent queue in case all
- * application threads are busy.
+ * Also, if there is a single eviction server thread with no workers, it must service the
+ * urgent queue in case all application threads are busy.
*/
if (!WT_EVICT_HAS_WORKERS(session) &&
(cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF ||
@@ -720,16 +716,13 @@ __evict_pass(WT_SESSION_IMPL *session)
break;
/*
- * If we're making progress, keep going; if we're not making
- * any progress at all, mark the cache "stuck" and go back to
- * sleep, it's not something we can fix.
+ * If we're making progress, keep going; if we're not making any progress at all, mark the
+ * cache "stuck" and go back to sleep, it's not something we can fix.
*
- * We check for progress every 20ms, the idea being that the
- * aggressive score will reach 10 after 200ms if we aren't
- * making progress and eviction will start considering more
- * pages. If there is still no progress after 2s, we will
- * treat the cache as stuck and start rolling back
- * transactions and writing updates to the lookaside table.
+ * We check for progress every 20ms, the idea being that the aggressive score will reach 10
+ * after 200ms if we aren't making progress and eviction will start considering more pages.
+ * If there is still no progress after 2s, we will treat the cache as stuck and start
+ * rolling back transactions and writing updates to the lookaside table.
*/
if (eviction_progress == cache->eviction_progress) {
if (WT_CLOCKDIFF_MS(time_now, time_prev) >= 20 &&
@@ -750,14 +743,11 @@ __evict_pass(WT_SESSION_IMPL *session)
*/
if (loop < 100 || cache->evict_aggressive_score < 100) {
/*
- * Back off if we aren't making progress: walks
- * hold the handle list lock, blocking other
- * operations that can free space in cache,
- * such as LSM discarding handles.
+ * Back off if we aren't making progress: walks hold the handle list lock, blocking
+ * other operations that can free space in cache, such as LSM discarding handles.
*
- * Allow this wait to be interrupted (e.g. if a
- * checkpoint completes): make sure we wait for
- * a non-zero number of microseconds).
+ * Allow this wait to be interrupted (e.g. if a checkpoint completes): make sure we
+ * wait for a non-zero number of microseconds.
*/
WT_STAT_CONN_INCR(session, cache_eviction_server_slept);
__wt_cond_wait(session, cache->evict_cond, WT_THOUSAND, NULL);
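Here WT_THOUSAND microseconds (1ms) is the non-zero wait the comment calls for, and signalling evict_cond (for example, when a checkpoint completes) is what makes the wait interruptible.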
@@ -1181,8 +1171,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
/*
* Get some more pages to consider for eviction.
*
- * If the walk is interrupted, we still need to sort the queue: the
- * next walk assumes there are no entries beyond WT_EVICT_WALK_BASE.
+ * If the walk is interrupted, we still need to sort the queue: the next walk assumes there are
+ * no entries beyond WT_EVICT_WALK_BASE.
*/
if ((ret = __evict_walk(cache->walk_session, queue)) == EBUSY)
ret = 0;
@@ -1264,15 +1254,12 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
queue->evict_candidates = candidates;
else {
/*
- * Take all of the urgent pages plus a third of
- * ordinary candidates (which could be expressed as
- * WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the
- * steady state, we want to get as many candidates as
- * the eviction walk adds to the queue.
+ * Take all of the urgent pages plus a third of ordinary candidates (which could be
+ * expressed as WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the steady state, we want
+ * to get as many candidates as the eviction walk adds to the queue.
*
- * That said, if there is only one entry, which is
- * normal when populating an empty file, don't exclude
- * it.
+ * That said, if there is only one entry, which is normal when populating an empty file,
+ * don't exclude it.
*/
queue->evict_candidates = 1 + candidates + ((entries - candidates) - 1) / 3;
cache->read_gen_oldest = read_gen_oldest;
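As a concrete instance of the formula: with entries == 100 and candidates == 10 urgent pages, the queue keeps 1 + 10 + (100 - 10 - 1) / 3 == 40 entries, i.e. the urgent pages plus roughly a third of the ordinary candidates.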
@@ -1468,11 +1455,9 @@ retry:
/*
* Skip files if we have too many active walks.
*
- * This used to be limited by the configured maximum number of
- * hazard pointers per session. Even though that ceiling has
- * been removed, we need to test eviction with huge numbers of
- * active trees before allowing larger numbers of hazard
- * pointers in the walk session.
+ * This used to be limited by the configured maximum number of hazard pointers per session.
+ * Even though that ceiling has been removed, we need to test eviction with huge numbers of
+ * active trees before allowing larger numbers of hazard pointers in the walk session.
*/
if (btree->evict_ref == NULL && session->nhazard > WT_EVICT_MAX_TREES)
continue;
@@ -1490,16 +1475,14 @@ retry:
dhandle_locked = false;
/*
- * Re-check the "no eviction" flag, used to enforce exclusive
- * access when a handle is being closed.
+ * Re-check the "no eviction" flag, used to enforce exclusive access when a handle is being
+ * closed.
*
- * Only try to acquire the lock and simply continue if we fail;
- * the lock is held while the thread turning off eviction clears
- * the tree's current eviction point, and part of the process is
- * waiting on this thread to acknowledge that action.
+ * Only try to acquire the lock and simply continue if we fail; the lock is held while the
+ * thread turning off eviction clears the tree's current eviction point, and part of the
+ * process is waiting on this thread to acknowledge that action.
*
- * If a handle is being discarded, it will still be marked open,
- * but won't have a root page.
+ * If a handle is being discarded, it will still be marked open, but won't have a root page.
*/
if (btree->evict_disabled == 0 && !__wt_spin_trylock(session, &cache->evict_walk_lock)) {
if (btree->evict_disabled == 0 && btree->root.page != NULL) {
@@ -1888,9 +1871,8 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_ent
/*
* Pages that are empty or from dead trees are fast-tracked.
*
- * Also evict lookaside table pages without further filtering:
- * the cache is under pressure by definition and we want to
- * free space.
+ * Also evict lookaside table pages without further filtering: the cache is under pressure
+ * by definition and we want to free space.
*/
if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
F_ISSET(btree, WT_BTREE_LOOKASIDE))
@@ -1920,15 +1902,12 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_ent
continue;
/*
- * Don't attempt eviction of internal pages with children in
- * cache (indicated by seeing an internal page that is the
- * parent of the last page we saw).
+ * Don't attempt eviction of internal pages with children in cache (indicated by seeing an
+ * internal page that is the parent of the last page we saw).
*
- * Also skip internal page unless we get aggressive, the tree
- * is idle (indicated by the tree being skipped for walks),
- * or we are in eviction debug mode.
- * The goal here is that if trees become completely idle, we
- * eventually push them out of cache completely.
+ * Also skip internal pages unless we get aggressive, the tree is idle (indicated by the tree
+ * being skipped for walks), or we are in eviction debug mode. The goal here is that if
+ * trees become completely idle, we eventually push them out of cache completely.
*/
if (!F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) && WT_PAGE_IS_INTERNAL(page)) {
if (page == last_parent)
@@ -1987,18 +1966,15 @@ fast:
/*
* Give up the walk occasionally.
*
- * If we happen to end up on the root page or a page requiring urgent
- * eviction, clear it. We have to track hazard pointers, and the root
- * page complicates that calculation.
+ * If we happen to end up on the root page or a page requiring urgent eviction, clear it. We
+ * have to track hazard pointers, and the root page complicates that calculation.
*
- * Likewise if we found no new candidates during the walk: there is no
- * point keeping a page pinned, since it may be the only candidate in
- * an idle tree.
+ * Likewise if we found no new candidates during the walk: there is no point keeping a page
+ * pinned, since it may be the only candidate in an idle tree.
*
- * If we land on a page requiring forced eviction, or that isn't an
- * ordinary in-memory page (e.g., WT_REF_LIMBO), move until we find an
- * ordinary page: we should not prevent exclusive access to the page
- * until the next walk.
+ * If we land on a page requiring forced eviction, or that isn't an ordinary in-memory page
+ * (e.g., WT_REF_LIMBO), move until we find an ordinary page: we should not prevent exclusive
+ * access to the page until the next walk.
*/
if (ref != NULL) {
if (__wt_ref_is_root(ref) || evict == start || give_up ||
@@ -2064,13 +2040,12 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_
}
/*
- * The server repopulates whenever the other queue is not full, as long
- * as at least one page has been evicted out of the current queue.
+ * The server repopulates whenever the other queue is not full, as long as at least one page has
+ * been evicted out of the current queue.
*
- * Note that there are pathological cases where there are only enough
- * eviction candidates in the cache to fill one queue. In that case,
- * we will continually evict one page and attempt to refill the queues.
- * Such cases are extremely rare in real applications.
+ * Note that there are pathological cases where there are only enough eviction candidates in the
+ * cache to fill one queue. In that case, we will continually evict one page and attempt to
+ * refill the queues. Such cases are extremely rare in real applications.
*/
if (is_server && (!urgent_ok || __evict_queue_empty(urgent_queue, false)) &&
!__evict_queue_full(cache->evict_current_queue) &&
@@ -2088,9 +2063,8 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_
/*
* Check if the current queue needs to change.
*
- * The server will only evict half of the pages before looking
- * for more, but should only switch queues if there are no
- * other eviction workers.
+ * The server will only evict half of the pages before looking for more, but should only
+ * switch queues if there are no other eviction workers.
*/
queue = cache->evict_current_queue;
other_queue = cache->evict_other_queue;
@@ -2136,14 +2110,13 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_
WT_ASSERT(session, evict->btree != NULL);
/*
- * Evicting a dirty page in the server thread could stall
- * during a write and prevent eviction from finding new work.
+ * Evicting a dirty page in the server thread could stall during a write and prevent
+ * eviction from finding new work.
*
- * However, we can't skip entries in the urgent queue or they
- * may never be found again.
+ * However, we can't skip entries in the urgent queue or they may never be found again.
*
- * Don't force application threads to evict dirty pages if they
- * aren't stalled by the amount of dirty data in cache.
+ * Don't force application threads to evict dirty pages if they aren't stalled by the amount
+ * of dirty data in cache.
*/
if (!urgent_ok && (is_server || !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) &&
__wt_page_is_modified(evict->ref->page)) {
@@ -2233,13 +2206,11 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
}
/*
- * In case something goes wrong, don't pick the same set of pages every
- * time.
+ * In case something goes wrong, don't pick the same set of pages every time.
*
- * We used to bump the page's read generation only if eviction failed,
- * but that isn't safe: at that point, eviction has already unlocked
- * the page and some other thread may have evicted it by the time we
- * look at it.
+ * We used to bump the page's read generation only if eviction failed, but that isn't safe: at
+ * that point, eviction has already unlocked the page and some other thread may have evicted it
+ * by the time we look at it.
*/
__wt_cache_read_gen_bump(session, ref->page);
@@ -2295,31 +2266,32 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
if (timer)
time_start = __wt_clock(session);
- WT_TRACK_TIME(session);
+#ifdef HAVE_DIAGNOSTIC
+ __wt_seconds32(session, &session->op_5043_seconds);
+#endif
for (initial_progress = cache->eviction_progress;; ret = 0) {
/*
- * A pathological case: if we're the oldest transaction in the
- * system and the eviction server is stuck trying to find space
- * (and we're not in recovery, because those transactions can't
- * be rolled back), abort the transaction to give up all hazard
- * pointers before trying again.
+ * If eviction is stuck, check if this thread is likely causing problems and should be
+ * rolled back. Ignore this check if we're in recovery: those transactions can't be rolled back.
*/
- if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session) &&
- !F_ISSET(conn, WT_CONN_RECOVERING)) {
- --cache->evict_aggressive_score;
- WT_STAT_CONN_INCR(session, txn_fail_cache);
- WT_ERR(
- __wt_txn_rollback_required(session, "oldest transaction rolled back for eviction"));
+ if (!F_ISSET(conn, WT_CONN_RECOVERING) && __wt_cache_stuck(session)) {
+ ret = __wt_txn_is_blocking_old(session);
+ if (ret == 0)
+ ret = __wt_txn_is_blocking_pin(session);
+ if (ret == WT_ROLLBACK) {
+ --cache->evict_aggressive_score;
+ WT_STAT_CONN_INCR(session, txn_fail_cache);
+ }
+ WT_ERR(ret);
}
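When either check fires, WT_ROLLBACK propagates out of the API call; the application-side contract is the standard one (a sketch of the caller's retry loop):

    /* WT_ROLLBACK from any operation: roll the transaction back and retry it. */
    if (ret == WT_ROLLBACK) {
        ret = session->rollback_transaction(session, NULL);
        /* ... retry the transaction from the top ... */
    }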
/*
* Check if we have become busy.
*
- * If we're busy (because of the transaction check we just did
- * or because our caller is waiting on a longer-than-usual event
- * such as a page read), and the cache level drops below 100%,
- * limit the work to 5 evictions and return. If that's not the
- * case, we can do more.
+ * If we're busy (because of the transaction check we just did or because our caller is
+ * waiting on a longer-than-usual event such as a page read), and the cache level drops
+ * below 100%, limit the work to 5 evictions and return. If that's not the case, we can do
+ * more.
*/
if (!busy && txn_state->pinned_id != WT_TXN_NONE &&
txn_global->current != txn_global->oldest_id)
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 41ecfb40242..a13526302a2 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -260,14 +260,12 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
/*
- * If more than 10% of the parent references are deleted, try a
- * reverse split. Don't bother if there is a single deleted
- * reference: the internal page is empty and we have to wait
+ * If more than 10% of the parent references are deleted, try a reverse split. Don't bother
+ * if there is a single deleted reference: the internal page is empty and we have to wait
* for eviction to notice.
*
- * This will consume the deleted ref (and eventually free it).
- * If the reverse split can't get the access it needs because
- * something is busy, be sure that the page still ends up
+ * This will consume the deleted ref (and eventually free it). If the reverse split can't
+ * get the access it needs because something is busy, be sure that the page still ends up
* marked deleted.
*/
if (ndeleted > pindex->entries / 10 && pindex->entries > 1) {
@@ -361,20 +359,19 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
/*
- * Either a split where we reconciled a page and it turned into
- * a lot of pages or an in-memory page that got too large, we
- * forcibly evicted it, and there wasn't anything to write.
+ * Either a split where we reconciled a page and it turned into a lot
+ * of pages or an in-memory page that got too large, we forcibly
+ * evicted it, and there wasn't anything to write.
*
- * The latter is a special case of forced eviction. Imagine a
- * thread updating a small set keys on a leaf page. The page
- * is too large or has too many deleted items, so we try and
- * evict it, but after reconciliation there's only a small
- * amount of live data (so it's a single page we can't split),
- * and if there's an older reader somewhere, there's data on
- * the page we can't write (so the page can't be evicted). In
- * that case, we end up here with a single block that we can't
- * write. Take advantage of the fact we have exclusive access
- * to the page and rewrite it in memory.
+ * The latter is a special case of forced eviction. Imagine a thread
+ * updating a small set of keys on a leaf page. The page is too large or
+ * has too many deleted items, so we try and evict it, but after
+ * reconciliation there's only a small amount of live data (so it's a
+ * single page we can't split), and if there's an older reader
+ * somewhere, there's data on the page we can't write (so the page
+ * can't be evicted). In that case, we end up here with a single
+ * block that we can't write. Take advantage of the fact we have
+ * exclusive access to the page and rewrite it in memory.
*/
if (mod->mod_multi_entries == 1) {
WT_ASSERT(session, closing == false);
@@ -386,11 +383,11 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_
/*
* Update the parent to reference the replacement page.
*
- * A page evicted with lookaside entries may not have an
- * address, if no updates were visible to reconciliation.
+ * A page evicted with lookaside entries may not have an address, if no
+ * updates were visible to reconciliation.
*
- * Publish: a barrier to ensure the structure fields are set
- * before the state change makes the page available to readers.
+ * Publish: a barrier to ensure the structure fields are set before the
+ * state change makes the page available to readers.
*/
if (mod->mod_replace.addr != NULL) {
WT_RET(__wt_calloc_one(session, &addr));
@@ -486,19 +483,16 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
break;
case WT_REF_DELETED: /* On-disk, deleted */
/*
- * If the child page was part of a truncate,
- * transaction rollback might switch this page into its
- * previous state at any time, so the delete must be
- * resolved before the parent can be evicted.
+ * If the child page was part of a truncate, transaction rollback might
+ * switch this page into its previous state at any time, so the delete
+ * must be resolved before the parent can be evicted.
*
- * We have the internal page locked, which prevents a
- * search from descending into it. However, a walk
- * from an adjacent leaf page could attempt to hazard
- * couple into a child page and free the page_del
- * structure as we are examining it. Flip the state to
- * locked to make this check safe: if that fails, we
- * have raced with a read and should give up on
- * evicting the parent.
+ * We have the internal page locked, which prevents a search from
+ * descending into it. However, a walk from an adjacent leaf page could
+ * attempt to hazard couple into a child page and free the page_del
+ * structure as we are examining it. Flip the state to locked to make
+ * this check safe: if that fails, we have raced with a read and should
+ * give up on evicting the parent.
*/
if (!__wt_atomic_casv32(&child->state, WT_REF_DELETED, WT_REF_LOCKED))
return (__wt_set_return(session, EBUSY));
@@ -613,32 +607,29 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
/*
* If the page is dirty, reconcile it to decide if we can evict it.
*
- * If we have an exclusive lock (we're discarding the tree), assert
- * there are no updates we cannot read.
+ * If we have an exclusive lock (we're discarding the tree), assert there are no updates we
+ * cannot read.
*
- * Don't set any other flags for internal pages: there are no update
- * lists to be saved and restored, changes can't be written into the
- * lookaside table, nor can we re-create internal pages in memory.
+ * Don't set any other flags for internal pages: there are no update lists to be saved and
+ * restored, changes can't be written into the lookaside table, nor can we re-create internal
+ * pages in memory.
*
* For leaf pages:
*
* In-memory pages are a known configuration.
*
- * Set the update/restore flag, so reconciliation will write blocks it
- * can write and create a list of skipped updates for blocks it cannot
- * write, along with disk images. This is how eviction of active, huge
- * pages works: we take a big page and reconcile it into blocks, some of
- * which we write and discard, the rest of which we re-create as smaller
- * in-memory pages, (restoring the updates that stopped us from writing
- * the block), and inserting the whole mess into the page's parent. Set
- * the flag in all cases because the incremental cost of update/restore
- * in reconciliation is minimal, eviction shouldn't have picked a page
- * where update/restore is necessary, absent some cache pressure. It's
- * possible updates occurred after we selected this page for eviction,
- * but it's unlikely and we don't try and manage that risk.
+ * Set the update/restore flag, so reconciliation will write blocks it can write and create a
+ * list of skipped updates for blocks it cannot write, along with disk images. This is how
+ * eviction of active, huge pages works: we take a big page and reconcile it into blocks, some
+ * of which we write and discard, the rest of which we re-create as smaller in-memory pages,
+ * (restoring the updates that stopped us from writing the block), and inserting the whole mess
+ * into the page's parent. Set the flag in all cases because the incremental cost of
+ * update/restore in reconciliation is minimal, eviction shouldn't have picked a page where
+ * update/restore is necessary, absent some cache pressure. It's possible updates occurred after
+ * we selected this page for eviction, but it's unlikely and we don't try and manage that risk.
*
- * Additionally, if we aren't trying to free space in the cache, scrub
- * the page and keep it in memory.
+ * Additionally, if we aren't trying to free space in the cache, scrub the page and keep it in
+ * memory.
*/
cache = conn->cache;
lookaside_retry = false;
@@ -678,15 +669,8 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
/* Reconcile the page. */
ret = __wt_reconcile(session, ref, NULL, flags, lookaside_retryp);
-
- /*
- * If attempting eviction during a checkpoint, we may successfully reconcile but then find that
- * there are updates on the page too new to evict. Give up evicting in that case: checkpoint
- * will include the reconciled page when it visits the parent.
- */
- if (WT_SESSION_BTREE_SYNC(session) && !__wt_page_is_modified(page) &&
- !__wt_txn_visible_all(session, page->modify->rec_max_txn, page->modify->rec_max_timestamp))
- return (__wt_set_return(session, EBUSY));
+ WT_ASSERT(session, __wt_page_is_modified(page) ||
+ __wt_txn_visible_all(session, page->modify->rec_max_txn, page->modify->rec_max_timestamp));
/*
* If reconciliation fails but reports it might succeed if we use the lookaside table, try again
@@ -704,11 +688,10 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
/*
* Give up on eviction during a checkpoint if the page splits.
*
- * We get here if checkpoint reads a page with lookaside entries: if
- * more of those entries are visible now than when the original
- * eviction happened, the page could split. In most workloads, this is
- * very unlikely. However, since checkpoint is partway through
- * reconciling the parent page, a split can corrupt the checkpoint.
+ * We get here if checkpoint reads a page with lookaside entries: if more of those entries are
+ * visible now than when the original eviction happened, the page could split. In most
+ * workloads, this is very unlikely. However, since checkpoint is partway through reconciling
+ * the parent page, a split can corrupt the checkpoint.
*/
if (WT_SESSION_BTREE_SYNC(session) && page->modify->rec_result == WT_PM_REC_MULTIBLOCK)
return (__wt_set_return(session, EBUSY));
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index 817ccbae553..36cefa8dc68 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -27,11 +27,9 @@
#define WT_SINGLE_THREAD_CHECK_STOP(s) \
if (--(s)->api_enter_refcnt == 0) \
WT_PUBLISH((s)->api_tid, 0);
-#define WT_TRACK_TIME(s) __wt_seconds32((s), &(s)->op_start)
#else
#define WT_SINGLE_THREAD_CHECK_START(s)
#define WT_SINGLE_THREAD_CHECK_STOP(s)
-#define WT_TRACK_TIME(s) (s)->op_start = 0
#endif
/* Standard entry points to the API: declares/initializes local variables. */
@@ -46,8 +44,8 @@
* correct. \
*/ \
WT_TRACK_OP_INIT(s); \
- (s)->op_start = 0; \
WT_SINGLE_THREAD_CHECK_START(s); \
+ __wt_op_timer_start(s); \
WT_ERR(WT_SESSION_CHECK_PANIC(s)); \
/* Reset wait time if this isn't an API reentry. */ \
if (__oldname == NULL) \
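The net effect of this hunk is that the operation start time is now stamped by a helper instead of being zeroed inline; inferred from the call sites elsewhere in this diff, the helper plausibly has this shape (a sketch, not the actual implementation):

    static inline void
    __wt_op_timer_start(WT_SESSION_IMPL *session)
    {
    #ifdef HAVE_DIAGNOSTIC
        /* Stamp the operation start, replacing the removed WT_TRACK_TIME macro. */
        __wt_seconds32(session, &session->op_5043_seconds);
    #else
        WT_UNUSED(session);
    #endif
    }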
diff --git a/src/third_party/wiredtiger/src/include/async.h b/src/third_party/wiredtiger/src/include/async.h
index 9a32ce6e0d2..16862a5a4e9 100644
--- a/src/third_party/wiredtiger/src/include/async.h
+++ b/src/third_party/wiredtiger/src/include/async.h
@@ -63,16 +63,15 @@ struct __wt_async {
WT_ASYNC_OP_IMPL **async_queue; /* Async ops work queue */
uint32_t async_qsize; /* Async work queue size */
/*
- * We need to have two head and tail values. All but one is
+ * We need to have two head and tail values. All but one is
* maintained as an ever increasing value to ease wrap around.
*
- * alloc_head: the next one to allocate for producers.
- * head: the current head visible to consumers.
- * head is always <= alloc_head.
- * alloc_tail: the next slot for consumers to dequeue.
- * alloc_tail is always <= head.
- * tail_slot: the last slot consumed.
- * A producer may need wait for tail_slot to advance.
+ * alloc_head: the next slot to allocate for producers.
+ * head: the current head visible to consumers;
+ *     head is always <= alloc_head.
+ * alloc_tail: the next slot for consumers to dequeue;
+ *     alloc_tail is always <= head.
+ * tail_slot: the last slot consumed.
+ *     A producer may need to wait for tail_slot to advance.
*/
uint64_t alloc_head; /* Next slot to enqueue */
uint64_t head; /* Head visible to worker */
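The orderings stated in the comment can be read directly as invariants (a sketch, as assertions):

    WT_ASSERT(session, async->head <= async->alloc_head);
    WT_ASSERT(session, async->alloc_tail <= async->head);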
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 68d6f53c0f3..2997fb064a8 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -166,14 +166,13 @@ struct __wt_ovfl_reuse {
uint8_t addr_size; /* Overflow addr size */
/*
- * On each page reconciliation, we clear the entry's in-use flag, and
- * reset it as the overflow record is re-used. After reconciliation
- * completes, unused skiplist entries are discarded, along with their
- * underlying blocks.
+ * On each page reconciliation, we clear the entry's in-use flag, and reset it as the overflow
+ * record is re-used. After reconciliation completes, unused skiplist entries are discarded, along
+ * with their underlying blocks.
*
- * On each page reconciliation, set the just-added flag for each new
- * skiplist entry; if reconciliation fails for any reason, discard the
- * newly added skiplist entries, along with their underlying blocks.
+ * On each page reconciliation, set the just-added flag for each new skiplist entry; if
+ * reconciliation fails for any reason, discard the newly added skiplist entries, along with their
+ * underlying blocks.
*/
/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_OVFL_REUSE_INUSE 0x1u
@@ -230,21 +229,31 @@ struct __wt_ovfl_reuse {
/*
* WT_PAGE_LOOKASIDE --
- * Related information for on-disk pages with lookaside entries.
+ * Information for on-disk pages with lookaside entries.
+ *
+ * This information is used to decide whether history evicted to lookaside is
+ * needed for a read, and when it is no longer needed at all. We track the
+ * newest update written to the disk image in `max_ondisk_ts`, and the oldest
+ * update skipped to choose the on-disk version in `min_skipped_ts`. If no
+ * updates were skipped, then the disk image contains the newest versions of
+ * all updates and `min_skipped_ts == WT_TS_MAX`.
+ *
+ * For reads without a timestamp, we check that there are no skipped updates
+ * and that the reader's snapshot can see everything on disk.
+ *
+ * For readers with a timestamp, it is safe to ignore lookaside if either
+ * (a) there are no skipped updates and everything on disk is visible, or
+ * (b) everything on disk is visible, and the minimum skipped update is in
+ * the future of the reader.
*/
struct __wt_page_lookaside {
- uint64_t las_pageid; /* Page ID in lookaside */
- uint64_t max_txn; /* Maximum transaction ID */
- uint64_t unstable_txn; /* First transaction ID not on page */
- wt_timestamp_t max_timestamp; /* Maximum timestamp */
- wt_timestamp_t unstable_timestamp; /* First timestamp not on page */
- wt_timestamp_t unstable_durable_timestamp;
- /* First durable timestamp not on
- * page */
- bool eviction_to_lookaside; /* Revert to lookaside on eviction */
- bool has_prepares; /* One or more updates are prepared */
- bool resolved; /* History has been read into cache */
- bool skew_newest; /* Page image has newest versions */
+ uint64_t las_pageid; /* Page ID in lookaside */
+ uint64_t max_txn; /* Maximum transaction ID */
+ wt_timestamp_t max_ondisk_ts; /* Maximum timestamp on disk */
+ wt_timestamp_t min_skipped_ts; /* Skipped in favor of disk version */
+ bool eviction_to_lookaside; /* Revert to lookaside on eviction */
+ bool has_prepares; /* One or more updates are prepared */
+ bool resolved; /* History has been read into cache */
};
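Putting rules (a) and (b) together, the decision reduces to a small predicate; a sketch, where visible_all() stands in for the real snapshot-visibility check:

    static bool
    las_ignorable(WT_PAGE_LOOKASIDE *las, wt_timestamp_t read_ts)
    {
        /* In either case, everything on disk must be visible. */
        if (!visible_all(las->max_txn, las->max_ondisk_ts))
            return (false);
        /* (a) No updates were skipped when writing the disk image. */
        if (las->min_skipped_ts == WT_TS_MAX)
            return (true);
        /* (b) Every skipped update is in the reader's future. */
        return (read_ts < las->min_skipped_ts);
    }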
/*
@@ -283,20 +292,17 @@ struct __wt_page_modify {
size_t bytes_dirty;
/*
- * When pages are reconciled, the result is one or more replacement
- * blocks. A replacement block can be in one of two states: it was
- * written to disk, and so we have a block address, or it contained
- * unresolved modifications and we have a disk image for it with a
- * list of those unresolved modifications. The former is the common
- * case: we only build lists of unresolved modifications when we're
- * evicting a page, and we only expect to see unresolved modifications
- * on a page being evicted in the case of a hot page that's too large
- * to keep in memory as it is. In other words, checkpoints will skip
- * unresolved modifications, and will write the blocks rather than
- * build lists of unresolved modifications.
+ * When pages are reconciled, the result is one or more replacement blocks. A replacement block
+ * can be in one of two states: it was written to disk, and so we have a block address, or it
+ * contained unresolved modifications and we have a disk image for it with a list of those
+ * unresolved modifications. The former is the common case: we only build lists of unresolved
+ * modifications when we're evicting a page, and we only expect to see unresolved modifications
+ * on a page being evicted in the case of a hot page that's too large to keep in memory as it
+ * is. In other words, checkpoints will skip unresolved modifications, and will write the blocks
+ * rather than build lists of unresolved modifications.
*
- * Ugly union/struct layout to conserve memory, we never have both
- * a replace address and multiple replacement blocks.
+ * Ugly union/struct layout to conserve memory, we never have both a replace address and
+ * multiple replacement blocks.
*/
union {
struct { /* Single, written replacement block */
@@ -336,13 +342,12 @@ struct __wt_page_modify {
void *disk_image;
/*
- * List of unresolved updates. Updates are either a row-store
- * insert or update list, or column-store insert list. When
- * creating lookaside records, there is an additional value,
- * the committed item's transaction information.
+ * List of unresolved updates. Updates are either a row-store insert or update list,
+ * or column-store insert list. When creating lookaside records, there is an
+ * additional value, the committed item's transaction information.
*
- * If there are unresolved updates, the block wasn't written and
- * there will always be a disk image.
+ * If there are unresolved updates, the block wasn't written and there will always
+ * be a disk image.
*/
struct __wt_save_upd {
WT_INSERT *ins; /* Insert list reference */
@@ -372,12 +377,11 @@ struct __wt_page_modify {
} u1;
/*
- * Internal pages need to be able to chain root-page splits and have a
- * special transactional eviction requirement. Column-store leaf pages
- * need update and append lists.
+ * Internal pages need to be able to chain root-page splits and have a special transactional
+ * eviction requirement. Column-store leaf pages need update and append lists.
*
- * Ugly union/struct layout to conserve memory, a page is either a leaf
- * page or an internal page.
+ * Ugly union/struct layout to conserve memory, a page is either a leaf page or an internal
+ * page.
*/
union {
struct {
@@ -554,12 +558,12 @@ struct __wt_page {
#define pg_intl_split_gen u.intl.split_gen
/*
- * Macros to copy/set the index because the name is obscured to ensure
- * the field isn't read multiple times.
+ * Macros to copy/set the index because the name is obscured to ensure the field isn't read multiple
+ * times.
*
- * There are two versions of WT_INTL_INDEX_GET because the session split
- * generation is usually set, but it's not always required: for example,
- * if a page is locked for splitting, or being created or destroyed.
+ * There are two versions of WT_INTL_INDEX_GET because the session split generation is usually set,
+ * but it's not always required: for example, if a page is locked for splitting, or being created or
+ * destroyed.
*/
#define WT_INTL_INDEX_GET_SAFE(page) ((page)->u.intl.__index)
#define WT_INTL_INDEX_GET(session, page, pindex) \
@@ -614,15 +618,12 @@ struct __wt_page {
WT_COL *col_var; /* Values */
/*
- * Variable-length column-store pages have an array
- * of page entries with RLE counts greater than 1 when
- * reading the page, so it's not necessary to walk the
- * page counting records to find a specific entry. We
- * can do a binary search in this array, then an offset
- * calculation to find the cell.
+ * Variable-length column-store pages have an array of page entries with RLE counts
+ * greater than 1 when reading the page, so it's not necessary to walk the page counting
+ * records to find a specific entry. We can do a binary search in this array, then an
+ * offset calculation to find the cell.
*
- * It's a separate structure to keep the page structure
- * as small as possible.
+ * It's a separate structure to keep the page structure as small as possible.
*/
struct __wt_col_var_repeat {
uint32_t nrepeats; /* repeat slots */
@@ -639,11 +640,11 @@ struct __wt_page {
} u;
/*
- * Page entries, type and flags are positioned at the end of the WT_PAGE
- * union to reduce cache misses in the row-store search function.
+ * Page entries, type and flags are positioned at the end of the WT_PAGE union to reduce cache
+ * misses in the row-store search function.
*
- * The entries field only applies to leaf pages, internal pages use the
- * page-index entries instead.
+ * The entries field only applies to leaf pages, internal pages use the page-index entries
+ * instead.
*/
uint32_t entries; /* Leaf page entries */
@@ -909,7 +910,7 @@ struct __wt_ref {
WT_SESSION_IMPL *session;
const char *name;
const char *func;
- uint32_t time_sec; /* DEBUGGING field for rare hang. */
+ uint32_t time_sec;
uint16_t line;
uint16_t state;
} hist[WT_REF_SAVE_STATE_MAX];
@@ -1000,14 +1001,13 @@ struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */
*/
struct __wt_col {
/*
- * Variable-length column-store data references are page offsets, not
- * pointers (we boldly re-invent short pointers). The trade-off is 4B
- * per K/V pair on a 64-bit machine vs. a single cycle for the addition
- * of a base pointer. The on-page data is a WT_CELL (same as row-store
+ * Variable-length column-store data references are page offsets, not pointers (we boldly
+ * re-invent short pointers). The trade-off is 4B per K/V pair on a 64-bit machine vs. a single
+ * cycle for the addition of a base pointer. The on-page data is a WT_CELL (same as row-store
* pages).
*
- * Obscure the field name, code shouldn't use WT_COL->__col_value, the
- * public interface is WT_COL_PTR and WT_COL_PTR_SET.
+ * Obscure the field name, code shouldn't use WT_COL->__col_value, the public interface is
+ * WT_COL_PTR and WT_COL_PTR_SET.
*/
uint32_t __col_value;
};
@@ -1133,33 +1133,28 @@ struct __wt_update {
/*
* WT_INSERT --
*
- * Row-store leaf pages support inserts of new K/V pairs. When the first K/V
- * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for
- * every existing element in the page, plus one additional slot. A slot points
- * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW
- * element that references it and before the subsequent WT_ROW element; the
- * skiplist structure has a randomly chosen depth of next pointers in each
- * inserted node.
- *
- * The additional slot is because it's possible to insert items smaller than any
- * existing key on the page: for that reason, the first slot of the insert array
- * holds keys smaller than any other key on the page.
+ * Row-store leaf pages support inserts of new K/V pairs. When the first K/V pair is inserted, the
+ * WT_INSERT_HEAD array is allocated, with one slot for every existing element in the page, plus one
+ * additional slot. A slot points to a WT_INSERT_HEAD structure for the items which sort after the
+ * WT_ROW element that references it and before the subsequent WT_ROW element; the skiplist
+ * structure has a randomly chosen depth of next pointers in each inserted node.
*
- * In column-store variable-length run-length encoded pages, a single indx
- * entry may reference a large number of records, because there's a single
- * on-page entry representing many identical records. (We don't expand those
- * entries when the page comes into memory, as that would require resources as
- * pages are moved to/from the cache, including read-only files.) Instead, a
- * single indx entry represents all of the identical records originally found
+ * The additional slot is because it's possible to insert items smaller than any existing key on the
+ * page: for that reason, the first slot of the insert array holds keys smaller than any other key
* on the page.
*
- * Modifying (or deleting) run-length encoded column-store records is hard
- * because the page's entry no longer references a set of identical items. We
- * handle this by "inserting" a new entry into the insert array, with its own
- * record number. (This is the only case where it's possible to insert into a
- * column-store: only appends are allowed, as insert requires re-numbering
- * subsequent records. Berkeley DB did support mutable records, but it won't
- * scale and it isn't useful enough to re-implement, IMNSHO.)
+ * In column-store variable-length run-length encoded pages, a single indx entry may reference a
+ * large number of records, because there's a single on-page entry representing many identical
+ * records. (We don't expand those entries when the page comes into memory, as that would require
+ * resources as pages are moved to/from the cache, including read-only files.) Instead, a single
+ * indx entry represents all of the identical records originally found on the page.
+ *
+ * Modifying (or deleting) run-length encoded column-store records is hard because the page's entry
+ * no longer references a set of identical items. We handle this by "inserting" a new entry into the
+ * insert array, with its own record number. (This is the only case where it's possible to insert
+ * into a column-store: only appends are allowed, as insert requires re-numbering subsequent
+ * records. Berkeley DB did support mutable records, but it won't scale and it isn't useful enough
+ * to re-implement, IMNSHO.)
*/
struct __wt_insert {
WT_UPDATE *upd; /* value */
@@ -1272,17 +1267,15 @@ struct __wt_insert_head {
++(i), (v) = __bit_getv(WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt))
/*
- * Manage split generation numbers. Splits walk the list of sessions to check
- * when it is safe to free structures that have been replaced. We also check
- * that list periodically (e.g., when wrapping up a transaction) to free any
- * memory we can.
+ * Manage split generation numbers. Splits walk the list of sessions to check when it is safe to
+ * free structures that have been replaced. We also check that list periodically (e.g., when
+ * wrapping up a transaction) to free any memory we can.
*
- * Before a thread enters code that will examine page indexes (which are
- * swapped out by splits), it publishes a copy of the current split generation
- * into its session. Don't assume that threads never re-enter this code: if we
- * already have a split generation, leave it alone. If our caller is examining
- * an index, we don't want the oldest split generation to move forward and
- * potentially free it.
+ * Before a thread enters code that will examine page indexes (which are swapped out by splits), it
+ * publishes a copy of the current split generation into its session. Don't assume that threads
+ * never re-enter this code: if we already have a split generation, leave it alone. If our caller is
+ * examining an index, we don't want the oldest split generation to move forward and potentially
+ * free it.
*/
#define WT_ENTER_PAGE_INDEX(session) \
do { \
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index 248297e6f26..e5d091112a4 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -26,22 +26,19 @@
#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
/*
- * The length of variable-length column-store values and row-store keys/values
- * are stored in a 4B type, so the largest theoretical key/value item is 4GB.
- * However, in the WT_UPDATE structure we use the UINT32_MAX size as a "deleted"
- * flag, and second, the size of an overflow object is constrained by what an
- * underlying block manager can actually write. (For example, in the default
- * block manager, writing an overflow item includes the underlying block's page
- * header and block manager specific structure, aligned to an allocation-sized
- * unit). The btree engine limits the size of a single object to (4GB - 1KB);
- * that gives us additional bytes if we ever want to store a structure length
- * plus the object size in 4B, or if we need additional flag values. Attempts
- * to store large key/value items in the tree trigger an immediate check to the
- * block manager, to make sure it can write the item. Storing 4GB objects in a
- * btree borders on clinical insanity, anyway.
+ * The lengths of variable-length column-store values and row-store keys/values are stored in a 4B
+ * type, so the largest theoretical key/value item is 4GB. The real limit is smaller: first, in the
+ * WT_UPDATE structure we use the UINT32_MAX size as a "deleted" flag; second, the size of an
+ * overflow object is constrained by what an underlying block manager can actually write. (For
+ * example, in the default block manager, writing an overflow item includes the underlying block's
+ * page header and block manager specific structure, aligned to an allocation-sized unit). The
+ * btree engine limits the
+ * size of a single object to (4GB - 1KB); that gives us additional bytes if we ever want to store a
+ * structure length plus the object size in 4B, or if we need additional flag values. Attempts to
+ * store large key/value items in the tree trigger an immediate check to the block manager, to make
+ * sure it can write the item. Storing 4GB objects in a btree borders on clinical insanity, anyway.
*
- * Record numbers are stored in 64-bit unsigned integers, meaning the largest
- * record number is "really, really big".
+ * Record numbers are stored in 64-bit unsigned integers, meaning the largest record number is
+ * "really, really big".
*/
#define WT_BTREE_MAX_OBJECT_SIZE ((uint32_t)(UINT32_MAX - 1024))
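Concretely, that is 4294967295 - 1024 == 4294966271 bytes, just over 1KB short of the 4GB ceiling a 32-bit length can express.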
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 3f80ee5cda7..69bb0dec90a 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -476,34 +476,29 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
last_running = S2C(session)->txn_global.last_running;
/*
- * We depend on the atomic operation being a write barrier, that is, a
- * barrier to ensure all changes to the page are flushed before updating
- * the page state and/or marking the tree dirty, otherwise checkpoints
- * and/or page reconciliation might be looking at a clean page/tree.
+ * We depend on the atomic operation being a write barrier, that is, a barrier to ensure all
+ * changes to the page are flushed before updating the page state and/or marking the tree dirty,
+ * otherwise checkpoints and/or page reconciliation might be looking at a clean page/tree.
*
- * Every time the page transitions from clean to dirty, update the cache
- * and transactional information.
+ * Every time the page transitions from clean to dirty, update the cache and transactional
+ * information.
*
- * The page state can only ever be incremented above dirty by the number
- * of concurrently running threads, so the counter will never approach
- * the point where it would wrap.
+ * The page state can only ever be incremented above dirty by the number of concurrently running
+ * threads, so the counter will never approach the point where it would wrap.
*/
if (page->modify->page_state < WT_PAGE_DIRTY &&
__wt_atomic_add32(&page->modify->page_state, 1) == WT_PAGE_DIRTY_FIRST) {
__wt_cache_dirty_incr(session, page);
/*
- * We won the race to dirty the page, but another thread could
- * have committed in the meantime, and the last_running field
- * been updated past it. That is all very unlikely, but not
- * impossible, so we take care to read the global state before
- * the atomic increment.
+ * We won the race to dirty the page, but another thread could have committed in the meantime,
+ * and the last_running field may have been updated past it. That is all very unlikely, but
+ * not impossible, so we take care to read the global state before the atomic increment.
*
- * If the page was dirty on entry, then last_running == 0. The
- * page could have become clean since then, if reconciliation
- * completed. In that case, we leave the previous value for
- * first_dirty_txn rather than potentially racing to update it,
- * at worst, we'll unnecessarily write a page in a checkpoint.
+ * If the page was dirty on entry, then last_running == 0. The page could have become clean
+ * since then, if reconciliation completed. In that case, we leave the previous value for
+ * first_dirty_txn rather than potentially racing to update it; at worst, we'll
+ * unnecessarily write a page in a checkpoint.
*/
if (last_running != 0)
page->modify->first_dirty_txn = last_running;
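A simplified sketch of the clean-to-dirty transition described above, using C11 atomics instead of
WiredTiger's __wt_atomic_add32 wrapper; the state values and names are illustrative:

    #include <stdatomic.h>
    #include <stdbool.h>

    enum { PAGE_CLEAN = 0, PAGE_DIRTY_FIRST = 1, PAGE_DIRTY = 2 };

    struct page_modify {
        atomic_uint page_state;
    };

    /* Returns true if this thread won the race to dirty the page; the atomic
     * increment doubles as the write barrier, so all prior changes to the
     * page are visible before the state says "dirty". */
    static bool
    page_dirty_set(struct page_modify *mod)
    {
        return (atomic_load(&mod->page_state) < PAGE_DIRTY &&
            atomic_fetch_add(&mod->page_state, 1) + 1 == PAGE_DIRTY_FIRST);
    }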
@@ -524,10 +519,9 @@ __wt_tree_modify_set(WT_SESSION_IMPL *session)
/*
* Test before setting the dirty flag, it's a hot cache line.
*
- * The tree's modified flag is cleared by the checkpoint thread: set it
- * and insert a barrier before dirtying the page. (I don't think it's
- * a problem if the tree is marked dirty with all the pages clean, it
- * might result in an extra checkpoint that doesn't do any work but it
+ * The tree's modified flag is cleared by the checkpoint thread: set it and insert a barrier
+ * before dirtying the page. (I don't think it's a problem if the tree is marked dirty with all
+ * the pages clean; it might result in an extra checkpoint that doesn't do any work, but it
* shouldn't cause problems; regardless, let's play it safe.)
*/
if (!S2BT(session)->modified) {
@@ -554,21 +548,19 @@ static inline void
__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
{
/*
- * The page must be held exclusive when this call is made, this call
- * can only be used when the page is owned by a single thread.
+ * The page must be held exclusive when this call is made; this call can only be used when the
+ * page is owned by a single thread.
*
* Allow the call to be made on clean pages.
*/
if (__wt_page_is_modified(page)) {
/*
- * The only part where ordering matters is during
- * reconciliation where updates on other threads are performing
- * writes to the page state that need to be visible to the
+ * The only part where ordering matters is during reconciliation where updates on other
+ * threads are performing writes to the page state that need to be visible to the
* reconciliation thread.
*
- * Since clearing of the page state is not going to be happening
- * during reconciliation on a separate thread, there's no write
- * barrier needed here.
+ * Since clearing of the page state does not happen during reconciliation on a separate
+ * thread, there's no write barrier needed here.
*/
page->modify->page_state = WT_PAGE_CLEAN;
__wt_cache_dirty_decr(session, page);
@@ -1067,9 +1059,8 @@ __wt_ref_info(
page = ref->home;
/*
- * If NULL, there is no location.
- * If off-page, the pointer references a WT_ADDR structure.
- * If on-page, the pointer references a cell.
+ * If NULL, there is no location. If off-page, the pointer references a WT_ADDR structure. If
+ * on-page, the pointer references a cell.
*
* The type is of a limited set: internal, leaf or no-overflow leaf.
*/
@@ -1160,12 +1151,10 @@ __wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref)
return (false);
if (page_las->resolved)
return (false);
- if (!page_las->skew_newest || page_las->has_prepares)
+ if (page_las->min_skipped_ts != WT_TS_MAX || page_las->has_prepares)
return (true);
- if (__wt_txn_visible_all(session, page_las->max_txn, page_las->max_timestamp))
- return (false);
- return (true);
+ return (!__wt_txn_visible_all(session, page_las->max_txn, page_las->max_ondisk_ts));
}
/*
@@ -1216,16 +1205,14 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
return (false);
/*
- * Check for pages with append-only workloads. A common application
- * pattern is to have multiple threads frantically appending to the
- * tree. We want to reconcile and evict this page, but we'd like to
- * do it without making the appending threads wait. See if it's worth
- * doing a split to let the threads continue before doing eviction.
+ * Check for pages with append-only workloads. A common application pattern is to have multiple
+ * threads frantically appending to the tree. We want to reconcile and evict this page, but we'd
+ * like to do it without making the appending threads wait. See if it's worth doing a split to
+ * let the threads continue before doing eviction.
*
- * Ignore anything other than large, dirty leaf pages. We depend on the
- * page being dirty for correctness (the page must be reconciled again
- * before being evicted after the split, information from a previous
- * reconciliation will be wrong, so we can't evict immediately).
+ * Ignore anything other than large, dirty leaf pages. We depend on the page being dirty for
+ * correctness (the page must be reconciled again before being evicted after the split;
+ * information from a previous reconciliation will be wrong, so we can't evict immediately).
*/
if (page->memory_footprint < btree->splitmempage)
return (false);
@@ -1386,15 +1373,13 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
}
/*
- * If a split created new internal pages, those newly created internal
- * pages cannot be evicted until all threads are known to have exited
- * the original parent page's index, because evicting an internal page
- * discards its WT_REF array, and a thread traversing the original
- * parent page index might see a freed WT_REF.
+ * If a split created new internal pages, those newly created internal pages cannot be evicted
+ * until all threads are known to have exited the original parent page's index, because evicting
+ * an internal page discards its WT_REF array, and a thread traversing the original parent page
+ * index might see a freed WT_REF.
*
- * One special case where we know this is safe is if the handle is
- * locked exclusive (e.g., when the whole tree is being evicted). In
- * that case, no readers can be looking at an old index.
+ * One special case where we know this is safe is if the handle is locked exclusive (e.g., when
+ * the whole tree is being evicted). In that case, no readers can be looking at an old index.
*/
if (WT_PAGE_IS_INTERNAL(page) && !F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE) &&
__wt_gen_active(session, WT_GEN_SPLIT, page->pg_intl_split_gen))
@@ -1436,20 +1421,18 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
return (0);
/*
- * Attempt to evict pages with the special "oldest" read generation.
- * This is set for pages that grow larger than the configured
- * memory_page_max setting, when we see many deleted items, and when we
- * are attempting to scan without trashing the cache.
+ * Attempt to evict pages with the special "oldest" read generation. This is set for pages that
+ * grow larger than the configured memory_page_max setting, when we see many deleted items, and
+ * when we are attempting to scan without trashing the cache.
*
- * Checkpoint should not queue pages for urgent eviction if they require
- * dirty eviction: there is a special exemption that allows checkpoint
- * to evict dirty pages in a tree that is being checkpointed, and no
- * other thread can help with that. Checkpoints don't rely on this code
- * for dirty eviction: that is handled explicitly in __wt_sync_file.
+ * Checkpoint should not queue pages for urgent eviction if they require dirty eviction: there
+ * is a special exemption that allows checkpoint to evict dirty pages in a tree that is being
+ * checkpointed, and no other thread can help with that. Checkpoints don't rely on this code for
+ * dirty eviction: that is handled explicitly in __wt_sync_file.
*
- * If the operation has disabled eviction or splitting, or the session
- * is preventing from reconciling, then just queue the page for urgent
- * eviction. Otherwise, attempt to release and evict it.
+ * If the operation has disabled eviction or splitting, or the session is prevented from
+ * reconciling, then just queue the page for urgent eviction. Otherwise, attempt to release and
+ * evict it.
*/
page = ref->page;
if (WT_READGEN_EVICT_SOON(page->read_gen) && btree->evict_disabled == 0 &&
diff --git a/src/third_party/wiredtiger/src/include/btree_cmp.i b/src/third_party/wiredtiger/src/include/btree_cmp.i
index 2f0596bed13..7a77c74db9e 100644
--- a/src/third_party/wiredtiger/src/include/btree_cmp.i
+++ b/src/third_party/wiredtiger/src/include/btree_cmp.i
@@ -225,12 +225,10 @@ __wt_lex_compare_short(const WT_ITEM *user_item, const WT_ITEM *tree_item)
treep = tree_item->data;
/*
- * The maximum packed uint64_t is 9B, catch row-store objects using
- * packed record numbers as keys.
+ * The maximum packed uint64_t is 9B; catch row-store objects using packed record numbers as keys.
*
- * Don't use a #define to compress this case statement: gcc7 complains
- * about implicit fallthrough and doesn't support explicit fallthrough
- * comments in macros.
+ * Don't use a #define to compress this case statement: gcc7 complains about implicit fallthrough
+ * and doesn't support explicit fallthrough comments in macros.
*/
#define WT_COMPARE_SHORT_MAXLEN 9
switch (len) {
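To make the fallthrough point concrete, here is a toy version of the unrolled comparison with the
explicit comments gcc7 expects, written out rather than generated by a macro, per the note above:

    #include <stddef.h>
    #include <stdint.h>

    /* Compare up to 3 bytes with an unrolled switch; each case falls through. */
    static int
    cmp_short(const uint8_t *a, const uint8_t *b, size_t len)
    {
        switch (len) {
        case 3:
            if (*a != *b)
                return (*a < *b ? -1 : 1);
            ++a, ++b;
        /* FALLTHROUGH */
        case 2:
            if (*a != *b)
                return (*a < *b ? -1 : 1);
            ++a, ++b;
        /* FALLTHROUGH */
        case 1:
            if (*a != *b)
                return (*a < *b ? -1 : 1);
        }
        return (0);
    }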
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
index a4a762eae7f..fa770d49c4e 100644
--- a/src/third_party/wiredtiger/src/include/cache.i
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -343,10 +343,9 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, bool busy, bool readonly, double
/*
* Only check the dirty trigger when the session is not busy.
*
- * In other words, once we are pinning resources, try to finish the
- * operation as quickly as possible without exceeding the cache size.
- * The next transaction in this session will not be able to start until
- * the cache is under the limit.
+ * In other words, once we are pinning resources, try to finish the operation as quickly as
+ * possible without exceeding the cache size. The next transaction in this session will not be
+ * able to start until the cache is under the limit.
*/
return (clean_needed || (!busy && dirty_needed));
}
diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h
index ca9e8e50e91..a93fc3dabd0 100644
--- a/src/third_party/wiredtiger/src/include/cell.h
+++ b/src/third_party/wiredtiger/src/include/cell.h
@@ -84,17 +84,15 @@
#define WT_CELL_TXN_STOP 0x10 /* Newest-stop txn ID */
/*
- * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
- * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the
- * page has no overflow items. (The goal is to speed up truncation as we don't
- * have to read pages without overflow items in order to delete them. Note,
- * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without
- * overflow items, the only guarantee is that if set, the page has no overflow
- * items.)
- *
- * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting
- * value dictionaries: if the two values are the same, we only store them once
- * and have any second and subsequent uses reference the original.
+ * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf block location, and
+ * WT_CELL_ADDR_LEAF_NO is a leaf block location where the page has no overflow items. (The goal is
+ * to speed up truncation as we don't have to read pages without overflow items in order to delete
+ * them. Note, WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without overflow
+ * items; the only guarantee is that if it is set, the page has no overflow items.)
+ *
+ * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting value dictionaries:
+ * if the two values are the same, we only store them once and have any second and subsequent uses
+ * reference the original.
*/
#define WT_CELL_ADDR_DEL (0) /* Address: deleted */
#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */
diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i
index d039386245c..138a185fa42 100644
--- a/src/third_party/wiredtiger/src/include/column.i
+++ b/src/third_party/wiredtiger/src/include/column.i
@@ -46,17 +46,14 @@ __col_insert_search_gt(WT_INSERT_HEAD *ins_head, uint64_t recno)
}
/*
- * If we didn't find any records greater than or equal to the target,
- * we never set the return value, set it to the first record in the
- * list.
+ * If we didn't find any records greater than or equal to the target, we never set the return
+ * value; set it to the first record in the list.
*
- * Otherwise, it references a record less-than-or-equal to the target,
- * move to a later record, that is, a subsequent record greater than
- * the target. Because inserts happen concurrently, additional records
- * might be inserted after the searched-for record that are still
- * smaller than the target, continue to move forward until reaching a
- * record larger than the target. There isn't any safety testing
- * because we confirmed such a record exists before searching.
+ * Otherwise, it references a record less-than-or-equal to the target; move to a later record,
+ * that is, a subsequent record greater than the target. Because inserts happen concurrently,
+ * additional records might be inserted after the searched-for record that are still smaller
+ * than the target, so continue to move forward until reaching a record larger than the target.
+ * There isn't any safety testing because we confirmed such a record exists before searching.
*/
if ((ins = ret_ins) == NULL)
ins = WT_SKIP_FIRST(ins_head);
@@ -282,11 +279,10 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop)
/*
* Find the matching slot.
*
- * This is done in two stages: first, we do a binary search among any
- * repeating records to find largest repeating less than the search key.
- * Once there, we can do a simple offset calculation to find the correct
- * slot for this record number, because we know any intervening records
- * have repeat counts of 1.
+ * This is done in two stages: first, we do a binary search among any repeating records to find
+ * the largest repeating record less than the search key. Once there, we can do a simple offset
+ * calculation to find the correct slot for this record number, because we know any intervening
+ * records have repeat counts of 1.
*/
for (base = 0, limit = WT_COL_VAR_REPEAT_SET(page) ? page->pg_var_nrepeats : 0; limit != 0;
limit >>= 1) {
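A toy model of the two-stage lookup, under the assumption of a sorted array of repeat entries
(none of these names are WiredTiger's):

    #include <stdint.h>

    struct repeat_entry {
        uint64_t start; /* first record number covered by the repeat */
        uint64_t rle;   /* repeat count */
        uint32_t slot;  /* on-page slot of the repeating cell */
    };

    /* Stage one: binary search for the largest repeat entry at or before recno.
     * Stage two: offset arithmetic, since intervening records repeat once. */
    static uint32_t
    var_slot(const struct repeat_entry *reps, uint32_t nreps, uint64_t page_start, uint64_t recno)
    {
        const struct repeat_entry *re = NULL;
        uint32_t base, ix, limit;

        for (base = 0, limit = nreps; limit != 0; limit >>= 1) {
            ix = base + (limit >> 1);
            if (reps[ix].start <= recno) {
                re = &reps[ix];
                base = ix + 1;
                --limit;
            }
        }
        if (re == NULL) /* before any repeat: every earlier record is a slot */
            return ((uint32_t)(recno - page_start));
        if (recno < re->start + re->rle) /* inside the repeat */
            return (re->slot);
        /* past the repeat: later records occupy one slot each */
        return (re->slot + 1 + (uint32_t)(recno - (re->start + re->rle)));
    }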
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 174263c3949..db4b2e9b41e 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -204,6 +204,8 @@ struct __wt_connection_impl {
/* Configuration */
const WT_CONFIG_ENTRY **config_entries;
+ uint64_t operation_timeout_us; /* Maximum operation period before rollback */
+
const char *optrack_path; /* Directory for operation logs */
WT_FH *optrack_map_fh; /* Name to id translation file. */
WT_SPINLOCK optrack_map_spinlock; /* Translation file spinlock. */
@@ -248,15 +250,13 @@ struct __wt_connection_impl {
uint32_t open_cursor_count; /* Atomic: open cursor handle count */
/*
- * WiredTiger allocates space for 50 simultaneous sessions (threads of
- * control) by default. Growing the number of threads dynamically is
- * possible, but tricky since server threads are walking the array
- * without locking it.
+ * WiredTiger allocates space for 50 simultaneous sessions (threads of control) by default.
+ * Growing the number of threads dynamically is possible, but tricky since server threads are
+ * walking the array without locking it.
*
- * There's an array of WT_SESSION_IMPL pointers that reference the
- * allocated array; we do it that way because we want an easy way for
- * the server thread code to avoid walking the entire array when only a
- * few threads are running.
+ * There's an array of WT_SESSION_IMPL pointers that reference the allocated array; we do it
+ * that way because we want an easy way for the server thread code to avoid walking the entire
+ * array when only a few threads are running.
*/
WT_SESSION_IMPL *sessions; /* Session reference */
uint32_t session_size; /* Session array size */
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index b52bd2c86ca..b3d32ad8417 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -265,26 +265,22 @@ struct __wt_cursor_index {
};
/*
- * A join iterator structure is used to generate candidate primary keys. It
- * is the responsibility of the caller of the iterator to filter these
- * primary key against the other conditions of the join before returning
- * them the caller of WT_CURSOR::next.
+ * A join iterator structure is used to generate candidate primary keys. It is the responsibility
+ * of the caller of the iterator to filter these primary keys against the other conditions of the
+ * join before returning them to the caller of WT_CURSOR::next.
*
- * For a conjunction join (the default), entry_count will be 1, meaning that
- * the iterator only consumes the first entry (WT_CURSOR_JOIN_ENTRY). That
- * is, it successively returns primary keys from a cursor for the first
- * index that was joined. When the values returned by that cursor are
- * exhausted, the iterator has completed. For a disjunction join,
- * exhausting a cursor just means that the iterator advances to the next
- * entry. If the next entry represents an index, a new cursor is opened and
- * primary keys from that index are then successively returned.
+ * For a conjunction join (the default), entry_count will be 1, meaning that the iterator only
+ * consumes the first entry (WT_CURSOR_JOIN_ENTRY). That is, it successively returns primary keys
+ * from a cursor for the first index that was joined. When the values returned by that cursor are
+ * exhausted, the iterator has completed. For a disjunction join, exhausting a cursor just means
+ * that the iterator advances to the next entry. If the next entry represents an index, a new cursor
+ * is opened and primary keys from that index are then successively returned.
*
- * When positioned on an entry that represents a nested join, a new child
- * iterator is created that will be bound to the nested WT_CURSOR_JOIN.
- * That iterator is then used to generate candidate primary keys. When its
- * iteration is completed, that iterator is destroyed and the parent
- * iterator advances to the next entry. Thus, depending on how deeply joins
- * are nested, a similarly deep stack of iterators is created.
+ * When positioned on an entry that represents a nested join, a new child iterator is created that
+ * will be bound to the nested WT_CURSOR_JOIN. That iterator is then used to generate candidate
+ * primary keys. When its iteration is completed, that iterator is destroyed and the parent iterator
+ * advances to the next entry. Thus, depending on how deeply joins are nested, a similarly deep
+ * stack of iterators is created.
*/
struct __wt_cursor_join_iter {
WT_SESSION_IMPL *session;
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
index 730d69cbdc7..18c5d146a9e 100644
--- a/src/third_party/wiredtiger/src/include/cursor.i
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -384,10 +384,9 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
copy = WT_ROW_KEY_COPY(rip);
/*
- * Get a key: we could just call __wt_row_leaf_key, but as a cursor
- * is running through the tree, we may have additional information
- * here (we may have the fully-built key that's immediately before
- * the prefix-compressed key we want, so it's a faster construction).
+ * Get a key: we could just call __wt_row_leaf_key, but as a cursor is running through the tree,
+ * we may have additional information here (we may have the fully-built key that's immediately
+ * before the prefix-compressed key we want, so it's a faster construction).
*
* First, check for an immediately available key.
*/
@@ -399,14 +398,12 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
goto slow;
/*
- * Unpack the cell and deal with overflow and prefix-compressed keys.
- * Inline building simple prefix-compressed keys from a previous key,
- * otherwise build from scratch.
+ * Unpack the cell and deal with overflow and prefix-compressed keys. Inline building simple
+ * prefix-compressed keys from a previous key, otherwise build from scratch.
*
- * Clear the key cell structure. It shouldn't be necessary (as far as I
- * can tell, and we don't do it in lots of other places), but disabling
- * shared builds (--disable-shared) results in the compiler complaining
- * about uninitialized field use.
+ * Clear the key cell structure. It shouldn't be necessary (as far as I can tell, and we don't
+ * do it in lots of other places), but disabling shared builds (--disable-shared) results in the
+ * compiler complaining about uninitialized field use.
*/
kpack = &_kpack;
memset(kpack, 0, sizeof(*kpack));
@@ -415,12 +412,11 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
WT_ASSERT(session, cbt->row_key->size >= kpack->prefix);
/*
- * Grow the buffer as necessary as well as ensure data has been
- * copied into local buffer space, then append the suffix to the
- * prefix already in the buffer.
+ * Grow the buffer as necessary as well as ensure data has been copied into local buffer
+ * space, then append the suffix to the prefix already in the buffer.
*
- * Don't grow the buffer unnecessarily or copy data we don't
- * need, truncate the item's data length to the prefix bytes.
+ * Don't grow the buffer unnecessarily or copy data we don't need; truncate the item's data
+ * length to the prefix bytes.
*/
cbt->row_key->size = kpack->prefix;
WT_RET(__wt_buf_grow(session, cbt->row_key, cbt->row_key->size + kpack->size));
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index d02b4dca326..4844a88380c 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1421,8 +1421,14 @@ extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config, const char **cfg)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_is_blocking_old(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_is_blocking_pin(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
@@ -1532,8 +1538,6 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uint32_t __wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern uint64_t __wt_clock_to_nsec(uint64_t end, uint64_t begin)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uint64_t __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api)
@@ -1607,8 +1611,6 @@ extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...);
extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...);
extern void __wt_encrypt_size(
WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep);
-extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
- WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_err_func(
WT_SESSION_IMPL *session, int error, const char *func, int line, const char *fmt, ...)
WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format(printf, 5, 6)))
@@ -1687,9 +1689,6 @@ extern void __wt_root_ref_init(
extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP **colgroupp);
extern void __wt_scr_discard(WT_SESSION_IMPL *session);
-extern void __wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp)
- WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
-extern void __wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp);
extern void __wt_session_close_cache(WT_SESSION_IMPL *session);
extern void __wt_session_gen_enter(WT_SESSION_IMPL *session, int which);
extern void __wt_session_gen_leave(WT_SESSION_IMPL *session, int which);
@@ -1722,7 +1721,6 @@ extern void __wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session);
extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session);
-extern void __wt_txn_global_shutdown(WT_SESSION_IMPL *session);
extern void __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session);
extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op);
extern void __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session);
@@ -1770,6 +1768,8 @@ static inline bool __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *p
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_off_page(WT_PAGE *page, const void *p)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline bool __wt_op_timer_fired(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
@@ -1797,10 +1797,6 @@ static inline bool __wt_session_can_wait(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_split_descent_race(WT_SESSION_IMPL *session, WT_REF *ref,
WT_PAGE_INDEX *saved_pindex) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-static inline bool __wt_txn_am_oldest(WT_SESSION_IMPL *session)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-static inline bool __wt_txn_upd_durable(WT_SESSION_IMPL *session, WT_UPDATE *upd)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd)
@@ -2073,6 +2069,8 @@ static inline uint64_t __wt_cell_rle(WT_CELL_UNPACK *unpack)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline uint64_t __wt_clock(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline uint64_t __wt_clock_to_nsec(uint64_t end, uint64_t begin)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline uint64_t __wt_rdtsc(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline uint64_t __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -2110,6 +2108,8 @@ static inline void __wt_cond_wait(
WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *));
static inline void __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session);
static inline void __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session);
+static inline void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
+static inline void __wt_op_timer_start(WT_SESSION_IMPL *session);
static inline void __wt_page_evict_soon(WT_SESSION_IMPL *session, WT_REF *ref);
static inline void __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
static inline void __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page);
@@ -2137,6 +2137,8 @@ static inline void __wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *p
WT_CELL_UNPACK *kpack, WT_CELL_UNPACK *vpack);
static inline void __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack);
static inline void __wt_scr_free(WT_SESSION_IMPL *session, WT_ITEM **bufp);
+static inline void __wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp);
+static inline void __wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp);
static inline void __wt_spin_backoff(uint64_t *yield_count, uint64_t *sleep_usecs);
static inline void __wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
static inline void __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h
index 189bc948714..a2280aefa4f 100644
--- a/src/third_party/wiredtiger/src/include/extern_posix.h
+++ b/src/third_party/wiredtiger/src/include/extern_posix.h
@@ -51,7 +51,8 @@ extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs,
bool (*run_func)(WT_SESSION_IMPL *), bool *signalled);
-extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp);
+extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds)
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_stream_set_line_buffer(FILE *fp)
diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h
index 052fb35d3a7..184d08c01d6 100644
--- a/src/third_party/wiredtiger/src/include/gcc.h
+++ b/src/third_party/wiredtiger/src/include/gcc.h
@@ -180,6 +180,21 @@ WT_ATOMIC_FUNC(size, size_t, size_t *vp, size_t v)
#define WT_READ_BARRIER() WT_FULL_BARRIER()
#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+#elif defined(__mips64el__) || defined(__mips__) || defined(__mips64__) || defined(__mips64)
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
+#define WT_FULL_BARRIER() \
+ do { \
+ __asm__ volatile("sync; ld $0, %0" ::"m"(*(long *)0xffffffff80000000) : "memory"); \
+ } while (0)
+#define WT_READ_BARRIER() \
+ do { \
+ __asm__ volatile("sync; ld $0, %0" ::"m"(*(long *)0xffffffff80000000) : "memory"); \
+ } while (0)
+#define WT_WRITE_BARRIER() \
+ do { \
+ __asm__ volatile("sync; ld $0, %0" ::"m"(*(long *)0xffffffff80000000) : "memory"); \
+ } while (0)
+
#elif defined(__PPC64__) || defined(PPC64)
/* ori 0,0,0 is the PPC64 noop instruction */
#define WT_PAUSE() __asm__ volatile("ori 0,0,0" ::: "memory")
diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h
index 447d082393e..9947de8b26a 100644
--- a/src/third_party/wiredtiger/src/include/hardware.h
+++ b/src/third_party/wiredtiger/src/include/hardware.h
@@ -60,15 +60,14 @@
/*
* Pad a structure so an array of structures get separate cache lines.
*
- * Note that we avoid compiler structure alignment because that requires
- * allocating aligned blocks of memory, and alignment pollutes any other type
- * that contains an aligned field. It is possible that a hot field positioned
- * before this one will be on the same cache line, but not if it is also
+ * Note that we avoid compiler structure alignment because that requires allocating aligned blocks
+ * of memory, and alignment pollutes any other type that contains an aligned field. It is possible
+ * that a hot field positioned before this one will be on the same cache line, but not if it is also
* padded.
*
- * This alignment has a small impact on portability as well, as we are using an
- * anonymous union here which is supported under C11, earlier versions of
- * the GNU standard, and MSVC versions as early as 2003.
+ * This alignment has a small impact on portability as well, as we are using an anonymous union here
+ * which is supported under C11, earlier versions of the GNU standard, and MSVC versions as early as
+ * 2003.
*/
#define WT_CACHE_LINE_PAD_BEGIN \
union { \
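A minimal illustration of the padding idea (simplified to a single field; the real macros bracket
an arbitrary group of fields, and the hunk above truncates the macro body):

    #include <stdint.h>

    #define CACHE_LINE_SIZE 64

    /* The anonymous union (C11) overlays the hot field with a line-sized pad,
     * so an array of these never shares cache lines, assuming the allocation
     * itself is line-aligned. */
    struct padded_counter {
        union {
            uint64_t value;
            char pad[CACHE_LINE_SIZE];
        };
    };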
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
index 0518d8dd0f9..f5f6bca7cc0 100644
--- a/src/third_party/wiredtiger/src/include/log.h
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -110,16 +110,17 @@ union __wt_lsn {
* Possible values for the consolidation array slot states:
*
* WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins.
+ *
* WT_LOG_SLOT_FREE - slot is available for allocation.
+ *
* WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker.
*
- * The slot state must be volatile: threads loop checking the state and can't
- * cache the first value they see.
+ * The slot state must be volatile: threads loop checking the state and can't cache the first value
+ * they see.
*
- * The slot state is divided into two 32 bit sizes. One half is the
- * amount joined and the other is the amount released. Since we use
- * a few special states, reserve the top few bits for state. That makes
- * the maximum size less than 32 bits for both joined and released.
+ * The slot state is divided into two 32-bit halves: one half is the amount joined and the other
+ * is the amount released. Since we use a few special states, reserve the top few bits for state.
+ * That makes the maximum size less than 32 bits for both joined and released.
*/
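Sketching the packing just described with illustrative macros (the real slot state is a signed
64-bit field, per the XXX note below):

    #include <stdint.h>

    /* Joined amount in the high half, released amount in the low half; the
     * top few bits of the joined half are reserved for the special states,
     * which is why the usable maximum is less than a full 32 bits. */
    #define SLOT_STATE(joined, released) \
        (((int64_t)(joined) << 32) | (int64_t)(released))
    #define SLOT_JOINED(state) ((uint32_t)((uint64_t)(state) >> 32))
    #define SLOT_RELEASED(state) ((uint32_t)((uint64_t)(state) & 0xffffffffu))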
/*
* XXX The log slot bits are signed and should be rewritten as unsigned. For now, give the logging
@@ -279,13 +280,11 @@ struct __wt_log {
WT_CONDVAR *log_write_cond;
/*
- * Consolidation array information
- * Our testing shows that the more consolidation we generate the
- * better the performance we see which equates to an active slot
- * slot count of one.
+ * Consolidation array information. Our testing shows that the more consolidation we generate,
+ * the better the performance we see, which equates to an active slot count of one.
*
- * Note: this can't be an array, we impose cache-line alignment and
- * gcc doesn't support that for arrays.
+ * Note: this can't be an array, we impose cache-line alignment and gcc doesn't support that for
+ * arrays.
*/
#define WT_SLOT_POOL 128
WT_LOGSLOT *active_slot; /* Active slot */
@@ -309,12 +308,10 @@ struct __wt_log_record {
uint32_t checksum; /* 04-07: Checksum of the record */
/*
- * No automatic generation: flag values cannot change, they're written
- * to disk.
+ * No automatic generation: flag values cannot change, they're written to disk.
*
- * Unused bits in the flags, as well as the 'unused' padding,
- * are expected to be zeroed; we check that to help detect file
- * corruption.
+ * Unused bits in the flags, as well as the 'unused' padding, are expected to be zeroed; we check
+ * that to help detect file corruption.
*/
#define WT_LOG_RECORD_COMPRESSED 0x01u /* Compressed except hdr */
#define WT_LOG_RECORD_ENCRYPTED 0x02u /* Encrypted except hdr */
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index 046d724d1f7..c303edc9488 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -147,13 +147,13 @@
/*
* Flag set, clear and test.
*
- * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure
- * referenced by its argument), LF_XXX (handles a local variable named "flags"),
- * and FLD_XXX (handles any variable, anywhere).
+ * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure referenced by its
+ * argument), LF_XXX (handles a local variable named "flags"), and FLD_XXX (handles any variable,
+ * anywhere).
*
- * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the
- * hex constant might be a negative integer), and to ensure the hex constant is
- * the correct size before applying the bitwise not operator.
+ * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the hex constant might be
+ * a negative integer), and to ensure the hex constant is the correct size before applying the
+ * bitwise not operator.
*/
#define FLD_CLR(field, mask) ((void)((field) &= ~(mask)))
#define FLD_MASK(field, mask) ((field) & (mask))
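A usage sketch for the FLD_ flavor; FLD_SET and FLD_ISSET here are assumed companions, written
out so the example is self-contained:

    #include <stdint.h>

    #define FLD_CLR(field, mask) ((void)((field) &= ~(mask)))
    #define FLD_ISSET(field, mask) (((field) & (mask)) != 0) /* assumed companion */
    #define FLD_SET(field, mask) ((void)((field) |= (mask))) /* assumed companion */

    /* Casting the constants keeps them unsigned 32-bit before ~ is applied. */
    #define MY_FLAG_A ((uint32_t)0x01u)
    #define MY_FLAG_B ((uint32_t)0x02u)

    static void
    flags_example(void)
    {
        uint32_t flags = 0;

        FLD_SET(flags, MY_FLAG_A | MY_FLAG_B);
        if (FLD_ISSET(flags, MY_FLAG_A))
            FLD_CLR(flags, MY_FLAG_B);
    }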
@@ -173,8 +173,8 @@
/*
* Insertion sort, for sorting small sets of values.
*
- * The "compare_lt" argument is a function or macro that returns true when
- * its first argument is less than its second argument.
+ * The "compare_lt" argument is a function or macro that returns true when its first argument is
+ * less than its second argument.
*/
#define WT_INSERTION_SORT(arrayp, n, value_type, compare_lt) \
do { \
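The hunk cuts the macro body off, so as a sketch only: an insertion sort with a caller-supplied
less-than predicate might expand along these lines (not the actual WT_INSERTION_SORT body):

    #define INSERTION_SORT(arrayp, n, value_type, compare_lt)                             \
        do {                                                                              \
            value_type __v;                                                               \
            size_t __i, __j;                                                              \
            for (__i = 1; __i < (size_t)(n); ++__i) {                                     \
                __v = (arrayp)[__i];                                                      \
                for (__j = __i; __j > 0 && compare_lt(__v, (arrayp)[__j - 1]); --__j)     \
                    (arrayp)[__j] = (arrayp)[__j - 1]; /* shift larger items right */     \
                (arrayp)[__j] = __v;                                                      \
            }                                                                             \
        } while (0)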
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
index 7b908ac3871..d739e78cf28 100644
--- a/src/third_party/wiredtiger/src/include/misc.i
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -30,49 +30,6 @@ __wt_hex(int c)
}
/*
- * __wt_rdtsc --
- * Get a timestamp from CPU registers.
- */
-static inline uint64_t
-__wt_rdtsc(void)
-{
-#if defined(__i386)
- {
- uint64_t x;
-
- __asm__ volatile("rdtsc" : "=A"(x));
- return (x);
- }
-#elif defined(__amd64)
- {
- uint64_t a, d;
-
- __asm__ volatile("rdtsc" : "=a"(a), "=d"(d));
- return ((d << 32) | a);
- }
-#else
- return (0);
-#endif
-}
-
-/*
- * __wt_clock --
- * Obtain a timestamp via either a CPU register or via a system call on platforms where
- * obtaining it directly from the hardware register is not supported.
- */
-static inline uint64_t
-__wt_clock(WT_SESSION_IMPL *session)
-{
- struct timespec tsp;
-
- if (__wt_process.use_epochtime) {
- __wt_epoch(session, &tsp);
- return ((uint64_t)(tsp.tv_sec * WT_BILLION + tsp.tv_nsec));
- }
- return (__wt_rdtsc());
-}
-
-/*
* __wt_strdup --
* ANSI strdup function.
*/
@@ -281,12 +238,12 @@ __wt_timing_stress(WT_SESSION_IMPL *session, u_int flag)
}
/*
- * The hardware-accelerated checksum code that originally shipped on Windows
- * did not correctly handle memory that wasn't 8B aligned and a multiple of 8B.
- * It's likely that calculations were always 8B aligned, but there's some risk.
+ * The hardware-accelerated checksum code that originally shipped on Windows did not correctly
+ * handle memory that wasn't 8B aligned and a multiple of 8B. It's likely that calculations were
+ * always 8B aligned, but there's some risk.
*
- * What we do is always write the correct checksum, and if a checksum test
- * fails, check it against the alternate version have before failing.
+ * What we do is always write the correct checksum, and if a checksum test fails, check it against
+ * the alternate version we may have written before failing.
*/
#if defined(_M_AMD64) && !defined(HAVE_NO_CRC32_HARDWARE)
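The fallback described above amounts to accepting either checksum on read; a hedged sketch with
invented names:

    #include <stdbool.h>
    #include <stdint.h>

    /* Always write the correct checksum; on verification, also accept the
     * value the old alignment-buggy implementation would have produced. */
    static bool
    checksum_matches(uint32_t stored, uint32_t correct, uint32_t legacy)
    {
        return (stored == correct || stored == legacy);
    }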
diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h
index 63283c92633..d65eea97b68 100644
--- a/src/third_party/wiredtiger/src/include/mutex.h
+++ b/src/third_party/wiredtiger/src/include/mutex.h
@@ -9,8 +9,8 @@
/*
* Condition variables:
*
- * WiredTiger uses condition variables to signal between threads, and for
- * locking operations that are expected to block.
+ * WiredTiger uses condition variables to signal between threads, and for locking operations that
+ * are expected to block.
*/
struct __wt_condvar {
const char *name; /* Mutex name for debugging */
@@ -88,9 +88,8 @@ struct __wt_rwlock { /* Read/write lock */
/*
* Spin locks:
*
- * WiredTiger uses spinlocks for fast mutual exclusion (where operations done
- * while holding the spin lock are expected to complete in a small number of
- * instructions).
+ * WiredTiger uses spinlocks for fast mutual exclusion (where operations done while holding the spin
+ * lock are expected to complete in a small number of instructions).
*/
#define SPINLOCK_GCC 0
#define SPINLOCK_MSVC 1
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i
index d9a93902fcd..45a0b3ab0f0 100644
--- a/src/third_party/wiredtiger/src/include/mutex.i
+++ b/src/third_party/wiredtiger/src/include/mutex.i
@@ -9,9 +9,8 @@
/*
* Spin locks:
*
- * These used for cases where fast mutual exclusion is needed (where operations
- * done while holding the spin lock are expected to complete in a small number
- * of instructions.
+ * These are used for cases where fast mutual exclusion is needed (where operations done while
+ * holding the spin lock are expected to complete in a small number of instructions).
*/
/*
diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i
index 1335334f142..a251322fcbb 100644
--- a/src/third_party/wiredtiger/src/include/packing.i
+++ b/src/third_party/wiredtiger/src/include/packing.i
@@ -9,9 +9,9 @@
/*
* Throughout this code we have to be aware of default argument conversion.
*
- * Refer to Chapter 8 of "Expert C Programming" by Peter van der Linden for the
- * gory details. The short version is that we have less cases to deal with
- * because the compiler promotes shorter types to int or unsigned int.
+ * Refer to Chapter 8 of "Expert C Programming" by Peter van der Linden for the gory details. The
+ * short version is that we have fewer cases to deal with because the compiler promotes shorter
+ * types to int or unsigned int.
*/
typedef struct {
union {
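The classic illustration of the default argument promotion this comment leans on: char and short
variadic arguments arrive as int, so only the promoted types need handling:

    #include <stdarg.h>

    /* Read a variadic argument back: even if the caller passed a char or a
     * short, it was promoted, so va_arg must ask for int. */
    static int
    first_vararg(int count, ...)
    {
        va_list ap;
        int v;

        va_start(ap, count);
        v = va_arg(ap, int);
        va_end(ap);
        return (v);
    }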
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index 22f63ae4ff4..1c02f8353c6 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -33,12 +33,9 @@ struct __wt_reconcile {
/* Track the page's min/maximum transactions. */
uint64_t max_txn;
- wt_timestamp_t max_timestamp;
-
- /* Lookaside boundary tracking. */
- uint64_t unstable_txn;
- wt_timestamp_t unstable_durable_timestamp;
- wt_timestamp_t unstable_timestamp;
+ wt_timestamp_t max_ts;
+ wt_timestamp_t max_ondisk_ts;
+ wt_timestamp_t min_skipped_ts;
u_int updates_seen; /* Count of updates seen. */
u_int updates_unstable; /* Count of updates not visible_all. */
@@ -56,43 +53,36 @@ struct __wt_reconcile {
bool leave_dirty;
/*
- * Track if reconciliation has seen any overflow items. If a leaf page
- * with no overflow items is written, the parent page's address cell is
- * set to the leaf-no-overflow type. This means we can delete the leaf
- * page without reading it because we don't have to discard any overflow
+ * Track if reconciliation has seen any overflow items. If a leaf page with no overflow items is
+ * written, the parent page's address cell is set to the leaf-no-overflow type. This means we
+ * can delete the leaf page without reading it because we don't have to discard any overflow
* items it might reference.
*
- * The test test is per-page reconciliation, that is, once we see an
- * overflow item on the page, all subsequent leaf pages written for the
- * page will not be leaf-no-overflow type, regardless of whether or not
- * they contain overflow items. In other words, leaf-no-overflow is not
- * guaranteed to be set on every page that doesn't contain an overflow
- * item, only that if it is set, the page contains no overflow items.
- * XXX
- * This was originally done because raw compression couldn't do better,
- * now that raw compression has been removed, we should do better.
+ * The test is per-page reconciliation, that is, once we see an overflow item on the page,
+ * all subsequent leaf pages written for the page will not be leaf-no-overflow type, regardless
+ * of whether or not they contain overflow items. In other words, leaf-no-overflow is not
+ * guaranteed to be set on every page that doesn't contain an overflow item, only that if it is
+ * set, the page contains no overflow items. XXX This was originally done because raw
+ * compression couldn't do better; now that raw compression has been removed, we should do
+ * better.
*/
bool ovfl_items;
/*
- * Track if reconciliation of a row-store leaf page has seen empty (zero
- * length) values. We don't write out anything for empty values, so if
- * there are empty values on a page, we have to make two passes over the
- * page when it's read to figure out how many keys it has, expensive in
- * the common case of no empty values and (entries / 2) keys. Likewise,
- * a page with only empty values is another common data set, and keys on
- * that page will be equal to the number of entries. In both cases, set
- * a flag in the page's on-disk header.
+ * Track if reconciliation of a row-store leaf page has seen empty (zero length) values. We
+ * don't write out anything for empty values, so if there are empty values on a page, we have to
+ * make two passes over the page when it's read to figure out how many keys it has, expensive in
+ * the common case of no empty values and (entries / 2) keys. Likewise, a page with only empty
+ * values is another common data set, and keys on that page will be equal to the number of
+ * entries. In both cases, set a flag in the page's on-disk header.
*
- * The test is per-page reconciliation as described above for the
- * overflow-item test.
+ * The test is per-page reconciliation as described above for the overflow-item test.
*/
bool all_empty_value, any_empty_value;
/*
- * Reconciliation gets tricky if we have to split a page, which happens
- * when the disk image we create exceeds the page type's maximum disk
- * image size.
+ * Reconciliation gets tricky if we have to split a page, which happens when the disk image we
+ * create exceeds the page type's maximum disk image size.
*
* First, the target size of the page we're building.
*/
@@ -106,31 +96,26 @@ struct __wt_reconcile {
uint32_t min_split_size; /* Minimum split page size */
/*
- * We maintain two split chunks in the memory during reconciliation to
- * be written out as pages. As we get to the end of the data, if the
- * last one turns out to be smaller than the minimum split size, we go
- * back into the penultimate chunk and split at this minimum split size
- * boundary. This moves some data from the penultimate chunk to the last
- * chunk, hence increasing the size of the last page written without
- * decreasing the penultimate page size beyond the minimum split size.
- * For this reason, we maintain an expected split percentage boundary
- * and a minimum split percentage boundary.
+ * We maintain two split chunks in memory during reconciliation to be written out as pages.
+ * As we get to the end of the data, if the last one turns out to be smaller than the minimum
+ * split size, we go back into the penultimate chunk and split at this minimum split size
+ * boundary. This moves some data from the penultimate chunk to the last chunk, hence increasing
+ * the size of the last page written without decreasing the penultimate page size beyond the
+ * minimum split size. For this reason, we maintain an expected split percentage boundary and a
+ * minimum split percentage boundary.
*
- * Chunks are referenced by current and previous pointers. In case of a
- * split, previous references the first chunk and current switches to
- * the second chunk. If reconciliation generates more split chunks, the
- * the previous chunk is written to the disk and current and previous
- * swap.
+ * Chunks are referenced by current and previous pointers. In case of a split, previous
+ * references the first chunk and current switches to the second chunk. If reconciliation
+ * generates more split chunks, the previous chunk is written to the disk and current and
+ * previous swap.
*/
struct __wt_rec_chunk {
/*
- * The recno and entries fields are the starting record number
- * of the split chunk (for column-store splits), and the number
- * of entries in the split chunk.
+ * The recno and entries fields are the starting record number of the split chunk (for
+ * column-store splits), and the number of entries in the split chunk.
*
- * The key for a row-store page; no column-store key is needed
- * because the page's recno, stored in the recno field, is the
- * column-store key.
+ * The key for a row-store page; no column-store key is needed because the page's recno,
+ * stored in the recno field, is the column-store key.
*/
uint32_t entries;
uint64_t recno;
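As a worked example of the two boundaries (numbers illustrative): with a 32KB maximum page, a 90%
split percentage gives ~28.8KB chunks and a 50% minimum gives a 16KB floor; if the final chunk
would fall below the floor, the penultimate chunk is re-split at the floor instead. The
percentage arithmetic itself is just:

    #include <stdint.h>

    /* Target chunk size for a given split percentage of the max page size. */
    static uint32_t
    split_size(uint32_t maxpagesize, int split_pct)
    {
        return ((uint32_t)(((uint64_t)maxpagesize * (uint64_t)split_pct) / 100));
    }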
diff --git a/src/third_party/wiredtiger/src/include/reconcile.i b/src/third_party/wiredtiger/src/include/reconcile.i
index eabf9e58c4f..adad096da49 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.i
+++ b/src/third_party/wiredtiger/src/include/reconcile.i
@@ -20,19 +20,16 @@ static inline bool
__wt_rec_need_split(WT_RECONCILE *r, size_t len)
{
/*
- * In the case of a row-store leaf page, trigger a split if a threshold
- * number of saved updates is reached. This allows pages to split for
- * update/restore and lookaside eviction when there is no visible data
- * causing the disk image to grow.
+ * In the case of a row-store leaf page, trigger a split if a threshold number of saved updates
+ * is reached. This allows pages to split for update/restore and lookaside eviction when there
+ * is no visible data causing the disk image to grow.
*
- * In the case of small pages or large keys, we might try to split when
- * a page has no updates or entries, which isn't possible. To consider
- * update/restore or lookaside information, require either page entries
- * or updates that will be attached to the image. The limit is one of
- * either, but it doesn't make sense to create pages or images with few
- * entries or updates, even where page sizes are small (especially as
- * updates that will eventually become overflow items can throw off our
- * calculations). Bound the combination at something reasonable.
+ * In the case of small pages or large keys, we might try to split when a page has no updates or
+ * entries, which isn't possible. To consider update/restore or lookaside information, require
+ * either page entries or updates that will be attached to the image. The limit can be met by
+ * either one, but it doesn't make sense to create pages or images with few entries or updates, even
+ * where page sizes are small (especially as updates that will eventually become overflow items
+ * can throw off our calculations). Bound the combination at something reasonable.
*/
if (r->page->type == WT_PAGE_ROW_LEAF && r->entries + r->supd_next > 10)
len += r->supd_memsize;
@@ -128,12 +125,11 @@ __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv)
uint8_t *p, *t;
/*
- * If there's only one chunk of data to copy (because the cell and data
- * are being copied from the original disk page), the cell length won't
- * be set, the WT_ITEM data/length will reference the data to be copied.
+ * If there's only one chunk of data to copy (because the cell and data are being copied from
+ * the original disk page), the cell length won't be set; the WT_ITEM data/length will reference
+ * the data to be copied.
*
- * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do
- * the copy in-line.
+ * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy; do the copy in-line.
*/
for (p = r->first_free, t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len)
*p++ = *t++;
@@ -257,18 +253,15 @@ __wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t
uint64_t offset;
/*
- * We optionally create a dictionary of values and only write a unique
- * value once per page, using a special "copy" cell for all subsequent
- * copies of the value. We have to do the cell build and resolution at
- * this low level because we need physical cell offsets for the page.
+ * We optionally create a dictionary of values and only write a unique value once per page,
+ * using a special "copy" cell for all subsequent copies of the value. We have to do the cell
+ * build and resolution at this low level because we need physical cell offsets for the page.
*
- * Sanity check: short-data cells can be smaller than dictionary-copy
- * cells. If the data is already small, don't bother doing the work.
- * This isn't just work avoidance: on-page cells can't grow as a result
- * of writing a dictionary-copy cell, the reconciliation functions do a
- * split-boundary test based on the size required by the value's cell;
- * if we grow the cell after that test we'll potentially write off the
- * end of the buffer's memory.
+ * Sanity check: short-data cells can be smaller than dictionary-copy cells. If the data is
+ * already small, don't bother doing the work. This isn't just work avoidance: on-page cells
+ * can't grow as a result of writing a dictionary-copy cell; the reconciliation functions do a
+ * split-boundary test based on the size required by the value's cell; if we grow the cell after
+ * that test we'll potentially write off the end of the buffer's memory.
*/
if (val->buf.size <= WT_INTPACK32_MAXSIZE)
return (0);
@@ -277,11 +270,10 @@ __wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t
return (0);
/*
- * If the dictionary offset isn't set, we're creating a new entry in the
- * dictionary, set its location.
+ * If the dictionary offset isn't set, we're creating a new entry in the dictionary; set its
+ * location.
*
- * If the dictionary offset is set, we have a matching value. Create a
- * copy cell instead.
+ * If the dictionary offset is set, we have a matching value. Create a copy cell instead.
*/
if (dp->offset == 0)
dp->offset = WT_PTRDIFF32(r->first_free, r->cur_ptr->image.mem);
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index 4f8d6ac6611..66fb066153a 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -19,16 +19,14 @@ __insert_simple_func(
WT_UNUSED(session);
/*
- * Update the skiplist elements referencing the new WT_INSERT item.
- * If we fail connecting one of the upper levels in the skiplist,
- * return success: the levels we updated are correct and sufficient.
- * Even though we don't get the benefit of the memory we allocated,
- * we can't roll back.
+ * Update the skiplist elements referencing the new WT_INSERT item. If we fail connecting one of
+ * the upper levels in the skiplist, return success: the levels we updated are correct and
+ * sufficient. Even though we don't get the benefit of the memory we allocated, we can't roll
+ * back.
*
- * All structure setup must be flushed before the structure is entered
- * into the list. We need a write barrier here, our callers depend on
- * it. Don't pass complex arguments to the macro, some implementations
- * read the old value multiple times.
+ * All structure setup must be flushed before the structure is entered into the list. We need a
+ * write barrier here, our callers depend on it. Don't pass complex arguments to the macro, some
+ * implementations read the old value multiple times.
*/
for (i = 0; i < skipdepth; i++) {
WT_INSERT *old_ins = *ins_stack[i];
@@ -55,16 +53,14 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_INSE
/*
* Update the skiplist elements referencing the new WT_INSERT item.
*
- * Confirm we are still in the expected position, and no item has been
- * added where our insert belongs. If we fail connecting one of the
- * upper levels in the skiplist, return success: the levels we updated
- * are correct and sufficient. Even though we don't get the benefit of
- * the memory we allocated, we can't roll back.
+ * Confirm we are still in the expected position, and no item has been added where our insert
+ * belongs. If we fail connecting one of the upper levels in the skiplist, return success: the
+ * levels we updated are correct and sufficient. Even though we don't get the benefit of the
+ * memory we allocated, we can't roll back.
*
- * All structure setup must be flushed before the structure is entered
- * into the list. We need a write barrier here, our callers depend on
- * it. Don't pass complex arguments to the macro, some implementations
- * read the old value multiple times.
+ * All structure setup must be flushed before the structure is entered into the list. We need a
+ * write barrier here, our callers depend on it. Don't pass complex arguments to the macro, some
+ * implementations read the old value multiple times.
*/
for (i = 0; i < skipdepth; i++) {
WT_INSERT *old_ins = *ins_stack[i];
@@ -234,12 +230,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE **srch_upd
*updp = NULL;
/*
- * All structure setup must be flushed before the structure is entered
- * into the list. We need a write barrier here, our callers depend on
- * it.
+ * All structure setup must be flushed before the structure is entered into the list. We need a
+ * write barrier here, our callers depend on it.
*
- * Swap the update into place. If that fails, a new update was added
- * after our search, we raced. Check if our update is still permitted.
+ * Swap the update into place. If that fails, a new update was added after our search, we raced.
+ * Check if our update is still permitted.
*/
while (!__wt_atomic_cas_ptr(srch_upd, upd->next, upd)) {
if ((ret = __wt_txn_update_check(session, upd->next = *srch_upd)) != 0) {
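For readers following the barrier discussion in these serial functions, here is a minimal, self-contained sketch of the publish-then-CAS pattern they rely on, written with generic C11 atomics rather than WiredTiger's macros; all names are illustrative only:

#include <stdatomic.h>
#include <stdlib.h>

struct node {
    int value;
    struct node *next;
};

/*
 * Insert at the head of a list: fully initialize the node first, then
 * publish it with a release CAS so readers never observe a half-built
 * structure. On CAS failure another thread won the race; `old` has been
 * reloaded, so simply retry.
 */
static void
list_insert(_Atomic(struct node *) *headp, int value)
{
    struct node *n = malloc(sizeof(*n));

    if (n == NULL)
        return;
    n->value = value;                 /* All setup happens before publish. */
    struct node *old = atomic_load_explicit(headp, memory_order_relaxed);
    do {
        n->next = old;                /* Simple argument: read once per retry. */
    } while (!atomic_compare_exchange_weak_explicit(
        headp, &old, n, memory_order_release, memory_order_relaxed));
}

The release ordering on the successful exchange plays the role of the write barrier the comments above insist on.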
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index b1da78f4668..f4b82b8f5e9 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -61,7 +61,12 @@ struct __wt_session_impl {
const char *name; /* Name */
const char *lastop; /* Last operation */
uint32_t id; /* UID, offset in session array */
- uint32_t op_start; /* DEBUGGING: Operation start time (seconds) */
+
+ uint64_t operation_start_us; /* Operation start */
+ uint64_t operation_timeout_us; /* Maximum operation period before rollback */
+#ifdef HAVE_DIAGNOSTIC
+ uint32_t op_5043_seconds; /* Temporary debugging to catch WT-5043, discard after 01/2020. */
+#endif
WT_EVENT_HANDLER *event_handler; /* Application's event handlers */
@@ -241,11 +246,11 @@ struct __wt_session_impl {
/*
* Hazard pointers.
*
- * Hazard information persists past session close because it's accessed
- * by threads of control other than the thread owning the session.
+ * Hazard information persists past session close because it's accessed by threads of control other
+ * than the thread owning the session.
*
- * Use the non-NULL state of the hazard field to know if the session has
- * previously been initialized.
+ * Use the non-NULL state of the hazard field to know if the session has previously been
+ * initialized.
*/
#define WT_SESSION_FIRST_USE(s) ((s)->hazard == NULL)
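As background for the hazard-pointer comment above, a simplified sketch of how a reader claims a shared pointer so other threads can see the claim; this is generic C11, not WiredTiger's implementation, and both function and parameter names are hypothetical:

#include <stdatomic.h>

/* One hazard slot per thread; readers publish the pointer they are using. */
static void *
hazard_acquire(_Atomic(void *) *slot, _Atomic(void *) *shared)
{
    void *p = atomic_load(shared);

    for (;;) {
        atomic_store(slot, p);        /* Publish the claim (seq_cst store orders it). */
        void *again = atomic_load(shared);
        if (again == p)
            return (p);               /* Still live: safe to use until the slot is cleared. */
        p = again;                    /* The pointer changed under us: retry. */
    }
}

Because other threads scan these slots before freeing memory, the slots must outlive the owning session, which is exactly why the hazard array persists past session close.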
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 53d3f2126ae..14665c4df75 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -9,74 +9,67 @@
/*
* Statistics counters:
*
- * We use an array of statistics structures; threads write different structures
- * to avoid writing the same cache line and incurring cache coherency overheads,
- * which can dramatically slow fast and otherwise read-mostly workloads.
+ * We use an array of statistics structures; threads write different structures to avoid writing the
+ * same cache line and incurring cache coherency overheads, which can dramatically slow fast and
+ * otherwise read-mostly workloads.
*
- * With an 8B statistics value and 64B cache-line alignment, 8 values share the
- * same cache line. There are collisions when different threads choose the same
- * statistics structure and update values that live on the cache line. There is
- * likely some locality however: a thread updating the cursor search statistic
- * is likely to update other cursor statistics with a chance of hitting already
- * cached values.
+ * With an 8B statistics value and 64B cache-line alignment, 8 values share the same cache line.
+ * There are collisions when different threads choose the same statistics structure and update
+ * values that live on the cache line. There is likely some locality however: a thread updating the
+ * cursor search statistic is likely to update other cursor statistics with a chance of hitting
+ * already cached values.
*
- * The actual statistic value must be signed, because one thread might increment
- * the value in its structure, and then another thread might decrement the same
- * value in another structure (where the value was initially zero), so the value
- * in the second thread's slot will go negative.
+ * The actual statistic value must be signed, because one thread might increment the value in its
+ * structure, and then another thread might decrement the same value in another structure (where the
+ * value was initially zero), so the value in the second thread's slot will go negative.
*
- * When reading a statistics value, the array values are summed and returned to
- * the caller. The summation is performed without locking, so the value read
- * may be inconsistent (and might be negative, if increments/decrements race
- * with the reader).
+ * When reading a statistics value, the array values are summed and returned to the caller. The
+ * summation is performed without locking, so the value read may be inconsistent (and might be
+ * negative, if increments/decrements race with the reader).
*
- * Choosing how many structures isn't easy: obviously, a smaller number creates
- * more conflicts while a larger number uses more memory.
+ * Choosing how many structures isn't easy: obviously, a smaller number creates more conflicts while
+ * a larger number uses more memory.
*
- * Ideally, if the application running on the system is CPU-intensive, and using
- * all CPUs on the system, we want to use the same number of slots as there are
- * CPUs (because their L1 caches are the units of coherency). However, in
- * practice we cannot easily determine how many CPUs are actually available to
- * the application.
+ * Ideally, if the application running on the system is CPU-intensive, and using all CPUs on the
+ * system, we want to use the same number of slots as there are CPUs (because their L1 caches are
+ * the units of coherency). However, in practice we cannot easily determine how many CPUs are
+ * actually available to the application.
*
- * Our next best option is to use the number of threads in the application as a
- * heuristic for the number of CPUs (presumably, the application architect has
- * figured out how many CPUs are available). However, inside WiredTiger we don't
- * know when the application creates its threads.
+ * Our next best option is to use the number of threads in the application as a heuristic for the
+ * number of CPUs (presumably, the application architect has figured out how many CPUs are
+ * available). However, inside WiredTiger we don't know when the application creates its threads.
*
- * For now, we use a fixed number of slots. Ideally, we would approximate the
- * largest number of cores we expect on any machine where WiredTiger might be
- * run, however, we don't want to waste that much memory on smaller machines.
- * As of 2015, machines with more than 24 CPUs are relatively rare.
+ * For now, we use a fixed number of slots. Ideally, we would approximate the largest number of
+ * cores we expect on any machine where WiredTiger might be run, however, we don't want to waste
+ * that much memory on smaller machines. As of 2015, machines with more than 24 CPUs are relatively
+ * rare.
*
- * Default hash table size; use a prime number of buckets rather than assuming
- * a good hash (Reference Sedgewick, Algorithms in C, "Hash Functions").
+ * Default hash table size; use a prime number of buckets rather than assuming a good hash
+ * (Reference Sedgewick, Algorithms in C, "Hash Functions").
*/
#define WT_COUNTER_SLOTS 23
/*
* WT_STATS_SLOT_ID is the thread's slot ID for the array of structures.
*
- * Ideally, we want a slot per CPU, and we want each thread to index the slot
- * corresponding to the CPU it runs on. Unfortunately, getting the ID of the
- * current CPU is difficult: some operating systems provide a system call to
- * acquire a CPU ID, but not all (regardless, making a system call to increment
- * a statistics value is far too expensive).
+ * Ideally, we want a slot per CPU, and we want each thread to index the slot corresponding to the
+ * CPU it runs on. Unfortunately, getting the ID of the current CPU is difficult: some operating
+ * systems provide a system call to acquire a CPU ID, but not all (regardless, making a system call
+ * to increment a statistics value is far too expensive).
*
- * Our second-best option is to use the thread ID. Unfortunately, there is no
- * portable way to obtain a unique thread ID that's a small-enough number to
- * be used as an array index (portable thread IDs are usually a pointer or an
- * opaque chunk, not a simple integer).
+ * Our second-best option is to use the thread ID. Unfortunately, there is no portable way to obtain
+ * a unique thread ID that's a small-enough number to be used as an array index (portable thread IDs
+ * are usually a pointer or an opaque chunk, not a simple integer).
*
- * Our solution is to use the session ID; there is normally a session per thread
- * and the session ID is a small, monotonically increasing number.
+ * Our solution is to use the session ID; there is normally a session per thread and the session ID
+ * is a small, monotonically increasing number.
*/
#define WT_STATS_SLOT_ID(session) (((session)->id) % WT_COUNTER_SLOTS)
/*
- * Statistic structures are arrays of int64_t's. We have functions to read/write
- * those structures regardless of the specific statistic structure we're working
- * with, by translating statistics structure field names to structure offsets.
+ * Statistic structures are arrays of int64_t's. We have functions to read/write those structures
+ * regardless of the specific statistic structure we're working with, by translating statistics
+ * structure field names to structure offsets.
*
* Translate a statistic's value name to an offset in the array.
*/
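A minimal illustration of the striping scheme described above: writers pick a slot from their session ID and update without locking, and readers sum all slots and clamp negative results. This is a sketch with illustrative names; WiredTiger's real statistics structures are generated:

#include <stdint.h>

#define COUNTER_SLOTS 23              /* Prime bucket count, as above. */

struct stats_slot {
    int64_t v[8];                     /* Eight 8B counters share one 64B cache line. */
};

static struct stats_slot stats[COUNTER_SLOTS];

/* Writer: pick a slot from the session ID, increment without locking. */
static inline void
stat_incr(uint32_t session_id, int field)
{
    stats[session_id % COUNTER_SLOTS].v[field] += 1;   /* Racy by design. */
}

/* Reader: sum every slot; the result may be stale or transiently negative
 * when increments and decrements race the summation, so clamp it. */
static inline uint64_t
stat_read(int field)
{
    int64_t sum = 0;

    for (int i = 0; i < COUNTER_SLOTS; i++)
        sum += stats[i].v[field];
    return (sum < 0 ? 0 : (uint64_t)sum);
}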
@@ -109,20 +102,17 @@ __wt_stats_aggregate(void *stats_arg, int slot)
aggr_v += stats[i][slot];
/*
- * This can race. However, any implementation with a single value can
- * race as well, different threads could set the same counter value
- * simultaneously. While we are making races more likely, we are not
- * fundamentally weakening the isolation semantics found in updating a
- * single value.
+ * This can race. However, any implementation with a single value can race as well, different
+ * threads could set the same counter value simultaneously. While we are making races more
+ * likely, we are not fundamentally weakening the isolation semantics found in updating a single
+ * value.
*
- * Additionally, the aggregation can go negative (imagine a thread
- * incrementing a value after aggregation has passed its slot and a
- * second thread decrementing a value before aggregation has reached
- * its slot).
+ * Additionally, the aggregation can go negative (imagine a thread incrementing a value after
+ * aggregation has passed its slot and a second thread decrementing a value before aggregation
+ * has reached its slot).
*
- * For historic API compatibility, the external type is a uint64_t;
- * limit our return to positive values, negative numbers would just
- * look really, really large.
+ * For historic API compatibility, the external type is a uint64_t; limit our return to positive
+ * values, negative numbers would just look really, really large.
*/
if (aggr_v < 0)
aggr_v = 0;
@@ -223,12 +213,11 @@ __wt_stats_clear(void *stats_arg, int slot)
#define WT_STAT_CONN_SET(session, fld, value) WT_STAT_SET(session, S2C(session)->stats, fld, value)
/*
- * Update data-source handle statistics if statistics gathering is enabled
- * and the data-source handle is set.
+ * Update data-source handle statistics if statistics gathering is enabled and the data-source
+ * handle is set.
*
- * XXX
- * We shouldn't have to check if the data-source handle is NULL, but it's
- * necessary until everything is converted to using data-source handles.
+ * XXX We shouldn't have to check if the data-source handle is NULL, but it's necessary until
+ * everything is converted to using data-source handles.
*/
#define WT_STAT_DATA_DECRV(session, fld, value) \
do { \
diff --git a/src/third_party/wiredtiger/src/include/time.i b/src/third_party/wiredtiger/src/include/time.i
new file mode 100644
index 00000000000..0dd6781216e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/time.i
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rdtsc --
+ * Get a timestamp from CPU registers.
+ */
+static inline uint64_t
+__wt_rdtsc(void)
+{
+#if defined(__i386)
+ {
+ uint64_t x;
+
+ __asm__ volatile("rdtsc" : "=A"(x));
+ return (x);
+ }
+#elif defined(__amd64)
+ {
+ uint64_t a, d;
+
+ __asm__ volatile("rdtsc" : "=a"(a), "=d"(d));
+ return ((d << 32) | a);
+ }
+#else
+ return (0);
+#endif
+}
+
+/*
+ * __time_check_monotonic --
+ *     Check and prevent time running backward. If we detect that it has, we set the time structure
+ *     to the previous values, making time stand still until we see a time later than the highest
+ *     value seen so far.
+ */
+static inline void
+__time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+ /*
+ * Detect time going backward. If so, use the last saved timestamp.
+ */
+ if (session == NULL)
+ return;
+
+ if (tsp->tv_sec < session->last_epoch.tv_sec ||
+ (tsp->tv_sec == session->last_epoch.tv_sec && tsp->tv_nsec < session->last_epoch.tv_nsec)) {
+ WT_STAT_CONN_INCR(session, time_travel);
+ *tsp = session->last_epoch;
+ } else
+ session->last_epoch = *tsp;
+}
+
+/*
+ * __wt_epoch --
+ * Return the time since the Epoch.
+ */
+static inline void
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+ struct timespec tmp;
+
+ /*
+ * Read into a local variable, then check for monotonically increasing time, ensuring single
+ * threads never see time move backward. We don't prevent multiple threads from seeing time move
+ * backwards (even when reading time serially, the saved last-read time is per thread, not per
+ * timer, so multiple threads can race the time). Nor do we prevent multiple threads
+ * simultaneously reading the time from seeing random time or time moving backwards (assigning
+ * the time structure to the returned memory location implies multicycle writes to memory).
+ */
+ __wt_epoch_raw(session, &tmp);
+ __time_check_monotonic(session, &tmp);
+ *tsp = tmp;
+}
+
+/*
+ * __wt_clock --
+ * Obtain a timestamp via either a CPU register or via a system call on platforms where
+ * obtaining it directly from the hardware register is not supported.
+ */
+static inline uint64_t
+__wt_clock(WT_SESSION_IMPL *session)
+{
+ struct timespec tsp;
+
+ /*
+     * In one case we return nanoseconds, in the other we return clock ticks. That looks wrong, but
+     * it's not. When simply comparing before and after values, which one is returned doesn't
+     * matter. When trying to calculate wall-clock time (that is, comparing a starting time with an
+     * ending time), we'll subtract the two values and then call a function to convert the result
+     * of the subtraction into nanoseconds. In the case where we already have nanoseconds, that
+     * function has a conversion constant of 1 and we'll skip the conversion; in the case where we
+     * have clock ticks, the conversion constant will be real. The reason is that doing it this way
+     * avoids a floating-point operation per wall-clock time calculation.
+ */
+ if (__wt_process.use_epochtime) {
+ __wt_epoch(session, &tsp);
+ return ((uint64_t)(tsp.tv_sec * WT_BILLION + tsp.tv_nsec));
+ }
+ return (__wt_rdtsc());
+}
+
+/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ */
+static inline void
+__wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp)
+{
+ struct timespec t;
+
+ __wt_epoch(session, &t);
+
+ *secondsp = (uint64_t)(t.tv_sec + t.tv_nsec / WT_BILLION);
+}
+
+/*
+ * __wt_seconds32 --
+ * Return the seconds since the Epoch in 32 bits.
+ */
+static inline void
+__wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp)
+{
+ uint64_t seconds;
+
+ /* This won't work in 2038. But for now allow it. */
+ __wt_seconds(session, &seconds);
+ *secondsp = (uint32_t)seconds;
+}
+
+/*
+ * __wt_clock_to_nsec --
+ * Convert from clock ticks to nanoseconds.
+ */
+static inline uint64_t
+__wt_clock_to_nsec(uint64_t end, uint64_t begin)
+{
+ double clock_diff;
+
+ /*
+ * If the ticks were reset, consider it an invalid check and just return zero as the time
+ * difference because we cannot compute anything meaningful.
+ */
+ if (end < begin)
+ return (0);
+ clock_diff = (double)(end - begin);
+ return ((uint64_t)(clock_diff / __wt_process.tsc_nsec_ratio));
+}
+
+/*
+ * __wt_op_timer_start --
+ * Start the operations timer.
+ */
+static inline void
+__wt_op_timer_start(WT_SESSION_IMPL *session)
+{
+ session->operation_start_us = session->operation_timeout_us == 0 ? 0 : __wt_clock(session);
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * This is called at the beginning of each API call. We need to clear out any old values from
+ * this debugging field so that we don't leave a stale value in there that may then give a false
+ * positive.
+ */
+ session->op_5043_seconds = 0;
+#endif
+}
+
+/*
+ * __wt_op_timer_fired --
+ * Check the operations timers.
+ */
+static inline bool
+__wt_op_timer_fired(WT_SESSION_IMPL *session)
+{
+ uint64_t diff, now;
+
+ /* Check for both a timeout and a start time to avoid any future configuration races. */
+ if (session->operation_timeout_us == 0 || session->operation_start_us == 0)
+ return (false);
+
+ now = __wt_clock(session);
+ diff = WT_CLOCKDIFF_US(now, session->operation_start_us);
+ return (diff > session->operation_timeout_us);
+}
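Putting the new time.i pieces together, a sketch of the intended call pattern; `timed_operation` is a hypothetical helper, not a verbatim WiredTiger call site, and assumes a WT_SESSION_IMPL whose operation_timeout_us has already been configured:

static int
timed_operation(WT_SESSION_IMPL *session)
{
    uint64_t start, end, elapsed_ns;

    __wt_op_timer_start(session);      /* Arms the timer iff a timeout is configured. */
    start = __wt_clock(session);       /* Ticks or nanoseconds, whichever is cheap here. */

    /* ... perform the work, checking the timer at convenient points ... */
    if (__wt_op_timer_fired(session))
        return (WT_ROLLBACK);          /* Callers map an expired timer to rollback. */

    end = __wt_clock(session);
    elapsed_ns = __wt_clock_to_nsec(end, start); /* One divide converts either unit to ns. */
    (void)elapsed_ns;
    return (0);
}

Note that start and end come from the same source (register or system call), so the before/after comparison is valid either way, as the __wt_clock comment explains.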
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index e67f680b076..7636cf42dd9 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -39,9 +39,8 @@ typedef enum {
/*
* Transaction ID comparison dealing with edge cases.
*
- * WT_TXN_ABORTED is the largest possible ID (never visible to a running
- * transaction), WT_TXN_NONE is smaller than any possible ID (visible to all
- * running transactions).
+ * WT_TXN_ABORTED is the largest possible ID (never visible to a running transaction), WT_TXN_NONE
+ * is smaller than any possible ID (visible to all running transactions).
*/
#define WT_TXNID_LE(t1, t2) ((t1) <= (t2))
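For illustration, the edge values make plain unsigned comparison sufficient, assuming (as in this header) that WT_TXN_NONE is zero and WT_TXN_ABORTED is the maximum 64-bit value; the names below are stand-ins:

#include <stdint.h>

#define TXN_NONE    0u                /* Smaller than any valid ID: visible to everyone. */
#define TXN_ABORTED UINT64_MAX        /* Larger than any valid ID: visible to no one. */

#define TXNID_LT(t1, t2) ((t1) < (t2))
#define TXNID_LE(t1, t2) ((t1) <= (t2))

/* Every valid ID t satisfies TXN_NONE < t < TXN_ABORTED, so the unsigned
 * comparisons above handle both edge cases with no special branches. */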
@@ -158,15 +157,13 @@ struct __wt_txn_global {
uint32_t read_timestampq_len;
/*
- * Track information about the running checkpoint. The transaction
- * snapshot used when checkpointing are special. Checkpoints can run
- * for a long time so we keep them out of regular visibility checks.
- * Eviction and checkpoint operations know when they need to be aware
- * of checkpoint transactions.
+     * Track information about the running checkpoint. The transaction snapshot used when
+     * checkpointing is special. Checkpoints can run for a long time so we keep them out of regular
+     * visibility checks. Eviction and checkpoint operations know when they need to be aware of
+     * checkpoint transactions.
*
- * We rely on the fact that (a) the only table a checkpoint updates is
- * the metadata; and (b) once checkpoint has finished reading a table,
- * it won't revisit it.
+ * We rely on the fact that (a) the only table a checkpoint updates is the metadata; and (b)
+ * once checkpoint has finished reading a table, it won't revisit it.
*/
volatile bool checkpoint_running; /* Checkpoint running */
volatile uint32_t checkpoint_id; /* Checkpoint's session ID */
@@ -277,8 +274,7 @@ struct __wt_txn {
/*
* Timestamp copied into updates created by this transaction.
*
- * In some use cases, this can be updated while the transaction is
- * running.
+ * In some use cases, this can be updated while the transaction is running.
*/
wt_timestamp_t commit_timestamp;
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 3e5d2bfd850..5359e296fa0 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -136,13 +136,11 @@ __txn_resolve_prepared_update(WT_SESSION_IMPL *session, WT_UPDATE *upd)
txn = &session->txn;
/*
- * In case of a prepared transaction, the order of modification of the
- * prepare timestamp to commit timestamp in the update chain will not
- * affect the data visibility, a reader will encounter a prepared
- * update resulting in prepare conflict.
+     * In the case of a prepared transaction, the order of modification of the prepare timestamp to
+     * commit timestamp in the update chain will not affect data visibility: a reader will
+     * encounter a prepared update, resulting in a prepare conflict.
*
- * As updating timestamp might not be an atomic operation, we will
- * manage using state.
+     * As updating the timestamp might not be an atomic operation, we manage visibility using the
+     * prepare state.
*/
upd->prepare_state = WT_PREPARE_LOCKED;
WT_WRITE_BARRIER();
@@ -552,14 +550,12 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
WT_READ_BARRIER();
/*
- * Checkpoint transactions often fall behind ordinary application
- * threads. Take special effort to not keep changes pinned in cache
- * if they are only required for the checkpoint and it has already
- * seen them.
+ * Checkpoint transactions often fall behind ordinary application threads. Take special effort
+ * to not keep changes pinned in cache if they are only required for the checkpoint and it has
+ * already seen them.
*
- * If there is no active checkpoint or this handle is up to date with
- * the active checkpoint then it's safe to ignore the checkpoint ID in
- * the visibility check.
+ * If there is no active checkpoint or this handle is up to date with the active checkpoint then
+ * it's safe to ignore the checkpoint ID in the visibility check.
*/
checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
if (checkpoint_pinned == WT_TXN_NONE || WT_TXNID_LT(oldest_id, checkpoint_pinned))
@@ -586,14 +582,12 @@ __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
*pinned_tsp = pinned_ts = txn_global->pinned_timestamp;
/*
- * Checkpoint transactions often fall behind ordinary application
- * threads. Take special effort to not keep changes pinned in cache if
- * they are only required for the checkpoint and it has already seen
- * them.
+ * Checkpoint transactions often fall behind ordinary application threads. Take special effort
+ * to not keep changes pinned in cache if they are only required for the checkpoint and it has
+ * already seen them.
*
- * If there is no active checkpoint or this handle is up to date with
- * the active checkpoint then it's safe to ignore the checkpoint ID in
- * the visibility check.
+ * If there is no active checkpoint or this handle is up to date with the active checkpoint then
+ * it's safe to ignore the checkpoint ID in the visibility check.
*/
include_checkpoint_txn =
btree == NULL || (!F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
@@ -709,13 +703,11 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
return (true);
/*
- * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is
- * not the result of a concurrent transaction, that is, if was
- * committed before the snapshot was taken.
+ * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a
+     * concurrent transaction, that is, if it was committed before the snapshot was taken.
*
- * The order here is important: anything newer than the maximum ID we
- * saw when taking the snapshot should be invisible, even if the
- * snapshot is empty.
+ * The order here is important: anything newer than the maximum ID we saw when taking the
+ * snapshot should be invisible, even if the snapshot is empty.
*/
if (WT_TXNID_LE(txn->snap_max, id))
return (false);
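A compact sketch of the overall snapshot check the comment describes, including why the snap_max test must come first; the helper is hypothetical and uses a linear scan where the real code keeps a sorted snapshot:

#include <stdbool.h>
#include <stdint.h>

static bool
snapshot_visible(uint64_t id, uint64_t snap_min, uint64_t snap_max,
    const uint64_t *snapshot, uint32_t snapshot_count)
{
    uint32_t i;

    if (id >= snap_max)
        return (false);           /* Newer than the snapshot: invisible even if the list is empty. */
    if (id < snap_min)
        return (true);            /* Committed before the snapshot was taken. */
    for (i = 0; i < snapshot_count; i++)
        if (snapshot[i] == id)
            return (false);       /* Ran concurrently with the snapshot. */
    return (true);
}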
@@ -793,19 +785,6 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
}
/*
- * __wt_txn_upd_durable --
- * Can the current transaction make the given update durable.
- */
-static inline bool
-__wt_txn_upd_durable(WT_SESSION_IMPL *session, WT_UPDATE *upd)
-{
- /* If update is visible then check if it is durable. */
- if (__wt_txn_upd_visible_type(session, upd) != WT_VISIBLE_TRUE)
- return (false);
- return (__wt_txn_visible(session, upd->txnid, upd->durable_ts));
-}
-
-/*
* __wt_txn_upd_visible --
* Can the current transaction see the given update.
*/
@@ -871,8 +850,12 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
if (session->ncursors > 0)
WT_RET(__wt_session_copy_values(session));
- /* Stall here if the cache is completely full. */
- WT_RET(__wt_cache_eviction_check(session, false, true, NULL));
+ /*
+ * Stall here if the cache is completely full. We have allocated a transaction ID which
+ * makes it possible for eviction to decide we're contributing to the problem and return
+ * WT_ROLLBACK. The WT_SESSION.begin_transaction API can't return rollback, continue on.
+ */
+ WT_RET_ERROR_OK(__wt_cache_eviction_check(session, false, true, NULL), WT_ROLLBACK);
__wt_txn_get_snapshot(session);
}
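For readers unfamiliar with the macro used above, WT_RET_ERROR_OK returns early on any error except the named one, so a WT_ROLLBACK from the eviction check is swallowed and the transaction still begins. Roughly (a sketch, not the verbatim definition from error.h):

#define WT_RET_ERROR_OK(a, e)                       \
    do {                                            \
        int __ret = (a);                            \
        if (__ret != 0 && __ret != (e))             \
            return (__ret);                         \
    } while (0)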
@@ -945,26 +928,21 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
/*
* Allocating transaction IDs involves several steps.
*
- * Firstly, publish that this transaction is allocating its ID, then
- * publish the transaction ID as the current global ID. Note that this
- * transaction ID might not be unique among threads and hence not valid
- * at this moment. The flag will notify other transactions that are
- * attempting to get their own snapshot for this transaction ID to
- * retry.
+ * Firstly, publish that this transaction is allocating its ID, then publish the transaction ID
+ * as the current global ID. Note that this transaction ID might not be unique among threads and
+ * hence not valid at this moment. The flag will notify other transactions that are attempting
+ * to get their own snapshot for this transaction ID to retry.
*
- * Then we do an atomic increment to allocate a unique ID. This will
- * give the valid ID to this transaction that we publish to the global
- * transaction table.
+ * Then we do an atomic increment to allocate a unique ID. This will give the valid ID to this
+ * transaction that we publish to the global transaction table.
*
- * We want the global value to lead the allocated values, so that any
- * allocated transaction ID eventually becomes globally visible. When
- * there are no transactions running, the oldest_id will reach the
- * global current ID, so we want post-increment semantics. Our atomic
- * add primitive does pre-increment, so adjust the result here.
+ * We want the global value to lead the allocated values, so that any allocated transaction ID
+ * eventually becomes globally visible. When there are no transactions running, the oldest_id
+ * will reach the global current ID, so we want post-increment semantics. Our atomic add
+ * primitive does pre-increment, so adjust the result here.
*
- * We rely on atomic reads of the current ID to create snapshots, so
- * for unlocked reads to be well defined, we must use an atomic
- * increment here.
+ * We rely on atomic reads of the current ID to create snapshots, so for unlocked reads to be
+ * well defined, we must use an atomic increment here.
*/
if (publish) {
WT_PUBLISH(txn_state->is_allocating, true);
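A condensed sketch of the allocation order the comment lays out, using generic C11 atomics and illustrative names; note that atomic_fetch_add returns the pre-increment value, which directly provides the post-increment semantics described above:

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t global_current = 1;   /* Leads every ID ever handed out. */

static uint64_t
txn_id_alloc(_Atomic uint64_t *published_id)
{
    uint64_t id;

    /* Publish a provisional (possibly duplicate) ID first, so concurrent
     * snapshotters notice this transaction and retry rather than missing it. */
    atomic_store(published_id, atomic_load(&global_current));

    /* The atomic add hands out the unique ID while keeping the global
     * counter strictly ahead of every allocated ID. */
    id = atomic_fetch_add(&global_current, 1);

    atomic_store(published_id, id);           /* Now publish the real, unique ID. */
    return (id);
}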
@@ -1095,8 +1073,8 @@ __wt_txn_read_last(WT_SESSION_IMPL *session)
/*
* Release the snap_min ID we put in the global table.
*
- * If the isolation has been temporarily forced, don't touch the
- * snapshot here: it will be restored by WT_WITH_TXN_ISOLATION.
+ * If the isolation has been temporarily forced, don't touch the snapshot here: it will be
+ * restored by WT_WITH_TXN_ISOLATION.
*/
if ((!F_ISSET(txn, WT_TXN_RUNNING) || txn->isolation != WT_ISO_SNAPSHOT) &&
txn->forced_iso == 0)
@@ -1145,40 +1123,6 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
}
/*
- * __wt_txn_am_oldest --
- * Am I the oldest transaction in the system?
- */
-static inline bool
-__wt_txn_am_oldest(WT_SESSION_IMPL *session)
-{
- WT_CONNECTION_IMPL *conn;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s;
- uint64_t id;
- uint32_t i, session_cnt;
-
- conn = S2C(session);
- txn = &session->txn;
- txn_global = &conn->txn_global;
-
- if (txn->id == WT_TXN_NONE || F_ISSET(txn, WT_TXN_PREPARE))
- return (false);
-
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++)
- /*
- * We are checking if the transaction is oldest one in the system. It is safe to ignore any
- * sessions that are allocating transaction IDs, since we already have an ID, they are
- * guaranteed to be newer.
- */
- if (!s->is_allocating && (id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, txn->id))
- return (false);
-
- return (true);
-}
-
-/*
* __wt_txn_activity_check --
* Check whether there are any running transactions.
*/
diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h
index a72289cc03f..d402d2d73c7 100644
--- a/src/third_party/wiredtiger/src/include/verify_build.h
+++ b/src/third_party/wiredtiger/src/include/verify_build.h
@@ -73,11 +73,10 @@ __wt_verify_build(void)
WT_STATIC_ASSERT(sizeof(size_t) >= 8);
/*
- * We require a wt_off_t fit into an 8B chunk because 8B is the largest
- * integral value we can encode into an address cookie.
+ * We require a wt_off_t fit into an 8B chunk because 8B is the largest integral value we can
+ * encode into an address cookie.
*
- * WiredTiger has never been tested on a system with 4B file offsets,
- * disallow them for now.
+ * WiredTiger has never been tested on a system with 4B file offsets, disallow them for now.
*/
WT_STATIC_ASSERT(sizeof(wt_off_t) == 8);
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index b9fed57f9ad..892d78b89a4 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -1723,6 +1723,12 @@ struct __wt_session {
* \c "read-committed"\, \c "snapshot"; default empty.}
* @config{name, name of the transaction for tracing and debugging., a string; default
* empty.}
+ * @config{operation_timeout_ms, when non-zero\, a requested limit on the number of elapsed
+ * real time milliseconds taken to complete database operations in this transaction. Time
+ * is measured from the start of each WiredTiger API call. There is no guarantee any
+ * operation will not take longer than this amount of time. If WiredTiger notices the limit
+ * has been exceeded\, an operation may return a WT_ROLLBACK error. Default is to have no
+ * limit., an integer greater than or equal to 1; default \c 0.}
* @config{priority, priority of the transaction for resolving conflicts. Transactions with
* higher values are less likely to abort., an integer between -100 and 100; default \c 0.}
* @config{read_timestamp, read using the specified timestamp. The supplied value must not
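A brief usage sketch of the new per-transaction knob through the public API; the 5000ms budget is arbitrary:

#include <wiredtiger.h>

/* Operations in this transaction that run past ~5 seconds may return
 * WT_ROLLBACK; `session` is an open WT_SESSION. */
static int
begin_with_timeout(WT_SESSION *session)
{
    return (session->begin_transaction(session, "operation_timeout_ms=5000"));
}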
@@ -1906,16 +1912,17 @@ struct __wt_session {
* "to=<checkpoint>" to drop all checkpoints before and including the named checkpoint.
* Checkpoints cannot be dropped while a hot backup is in progress or if open in a cursor.,
* a list of strings; default empty.}
- * @config{force, by default\, checkpoints may be skipped if the underlying object has not
- * been modified\, this option forces the checkpoint., a boolean flag; default \c false.}
+     * @config{force, if false (the default)\, checkpoints may be skipped if the underlying
+     * object has not been modified; if true\, this option forces the checkpoint., a boolean
+     * flag; default \c false.}
* @config{name, if set\, specify a name for the checkpoint (note that checkpoints including
* LSM trees may not be named)., a string; default empty.}
* @config{target, if non-empty\, checkpoint the list of objects., a list of strings;
* default empty.}
- * @config{use_timestamp, by default\, create the checkpoint as of the last stable timestamp
- * if timestamps are in use\, or all current updates if there is no stable timestamp set.
- * If false\, this option generates a checkpoint with all updates including those later than
- * the timestamp., a boolean flag; default \c true.}
+ * @config{use_timestamp, if true (the default)\, create the checkpoint as of the last
+ * stable timestamp if timestamps are in use\, or all current updates if there is no stable
+ * timestamp set. If false\, this option generates a checkpoint with all updates including
+ * those later than the timestamp., a boolean flag; default \c true.}
* @configend
* @errors
*/
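For illustration, the reworded checkpoint options in use; both settings here are chosen only to show the syntax:

#include <wiredtiger.h>

/* Checkpoint even if nothing changed, and include updates newer than the
 * stable timestamp. */
static int
force_full_checkpoint(WT_SESSION *session)
{
    return (session->checkpoint(session, "force=true,use_timestamp=false"));
}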
@@ -2259,6 +2266,12 @@ struct __wt_connection {
* database. Each worker thread uses a session handle from the configured session_max., an
* integer between 3 and 20; default \c 4.}
* @config{ ),,}
+ * @config{operation_timeout_ms, when non-zero\, a requested limit on the number of elapsed
+ * real time milliseconds application threads will take to complete database operations.
+ * Time is measured from the start of each WiredTiger API call. There is no guarantee any
+ * operation will not take longer than this amount of time. If WiredTiger notices the limit
+ * has been exceeded\, an operation may return a WT_ROLLBACK error. Default is to have no
+ * limit., an integer greater than or equal to 1; default \c 0.}
* @config{operation_tracking = (, enable tracking of performance-critical functions. See
* @ref operation_tracking for more information., a set of related configuration options
* defined below.}
@@ -2493,18 +2506,16 @@ struct __wt_connection {
/*!
* Rollback in-memory non-logged state to an earlier point in time.
*
- * This method uses a timestamp to define the rollback point, and thus
- * requires that the application uses timestamps and that the
- * stable_timestamp must have been set via a call to
- * WT_CONNECTION::set_timestamp. Any updates to checkpoint durable
- * tables that are more recent than the stable timestamp are removed.
+     * This method uses a timestamp to define the rollback point, and requires that the
+     * application use timestamps, that the stable_timestamp has been set via a call to
+     * WT_CONNECTION::set_timestamp, and that a checkpoint operating on the last stable
+     * timestamp has completed. Any updates to checkpoint durable tables that are more recent
+     * than the stable timestamp are removed.
*
- * This method requires that there are no active operations for the
- * duration of the call.
+ * This method requires that there are no active operations for the duration of the call.
*
- * Any updates made to logged tables will not be rolled back. Any
- * updates made without an associated timestamp will not be rolled
- * back. See @ref transaction_timestamps.
+ * Any updates made to logged tables will not be rolled back. Any updates made without an
+ * associated timestamp will not be rolled back. See @ref transaction_timestamps.
*
* @snippet ex_all.c rollback to stable
*
@@ -2912,6 +2923,12 @@ struct __wt_connection {
* @config{multiprocess, permit sharing between processes (will automatically start an RPC server
* for primary processes and use RPC for secondary processes). <b>Not yet supported in
* WiredTiger</b>., a boolean flag; default \c false.}
+ * @config{operation_timeout_ms, when non-zero\, a requested limit on the number of elapsed real
+ * time milliseconds application threads will take to complete database operations. Time is
+ * measured from the start of each WiredTiger API call. There is no guarantee any operation will
+ * not take longer than this amount of time. If WiredTiger notices the limit has been exceeded\, an
+ * operation may return a WT_ROLLBACK error. Default is to have no limit., an integer greater than
+ * or equal to 1; default \c 0.}
* @config{operation_tracking = (, enable tracking of performance-critical functions. See @ref
* operation_tracking for more information., a set of related configuration options defined below.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable operation tracking subsystem., a boolean flag;
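And the connection-level variant of the same timeout, set at open time; the 2000ms value is arbitrary and error handling is elided:

#include <stddef.h>
#include <wiredtiger.h>

/* Create the database if needed and request a ~2s budget per API call;
 * operations exceeding it may return WT_ROLLBACK. */
static int
open_with_timeout(const char *home, WT_CONNECTION **connp)
{
    return (wiredtiger_open(home, NULL, "create,operation_timeout_ms=2000", connp));
}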
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 3bc4f02c258..2b281443f21 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -430,6 +430,7 @@ typedef uint64_t wt_timestamp_t;
#include "packing.i"
#include "reconcile.i"
#include "serial.i"
+#include "time.i"
#if defined(__cplusplus)
}
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index d6f18f82bb9..aeda4608082 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -201,14 +201,12 @@ __log_fs_write(
WT_DECL_RET;
/*
- * If we're writing into a new log file and we're running in
- * compatibility mode to an older release, we have to wait for all
- * writes to the previous log file to complete otherwise there could
- * be a hole at the end of the previous log file that we cannot detect.
+ * If we're writing into a new log file and we're running in compatibility mode to an older
+ * release, we have to wait for all writes to the previous log file to complete otherwise there
+ * could be a hole at the end of the previous log file that we cannot detect.
*
- * NOTE: Check for a version less than the one writing the system
- * record since we've had a log version change without any actual
- * file format changes.
+ * NOTE: Check for a version less than the one writing the system record since we've had a log
+ * version change without any actual file format changes.
*/
if (S2C(session)->log->log_version < WT_LOG_VERSION_SYSTEM &&
slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) {
@@ -784,9 +782,9 @@ __log_file_header(WT_SESSION_IMPL *session, WT_FH *fh, WT_LSN *end_lsn, bool pre
/*
* Now that the record is set up, initialize the record header.
*
- * Checksum a little-endian version of the header, and write everything
- * in little-endian format. The checksum is (potentially) returned in a
- * big-endian format, swap it into place in a separate step.
+ * Checksum a little-endian version of the header, and write everything in little-endian format.
+ * The checksum is (potentially) returned in a big-endian format, swap it into place in a
+ * separate step.
*/
logrec->len = log->allocsize;
logrec->checksum = 0;
@@ -1790,16 +1788,14 @@ __log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size, wt_off_t
break;
}
/*
- * A presumed log record begins here where the buffer
- * becomes non-zero. If we have enough of a log record
- * present in the buffer, we either have a valid header
- * or corruption. Verify the header of this record to
- * determine whether it is just a hole or corruption.
+ * A presumed log record begins here where the buffer becomes non-zero. If we have
+ * enough of a log record present in the buffer, we either have a valid header or
+ * corruption. Verify the header of this record to determine whether it is just a hole
+ * or corruption.
*
- * We don't bother making this check for backup copies,
- * as records may have their beginning zeroed, hence
- * the part after a hole may in fact be the middle of
- * the record.
+ * We don't bother making this check for backup copies, as records may have their
+ * beginning zeroed, hence the part after a hole may in fact be the middle of the
+ * record.
*/
if (!F_ISSET(conn, WT_CONN_WAS_BACKUP)) {
logrec = (WT_LOG_RECORD *)p;
@@ -2348,13 +2344,12 @@ advance:
next_lsn.l.offset += rdup_len;
if (rd_lsn.l.offset != 0) {
/*
- * We need to manage the different buffers here.
- * Buf is the buffer this function uses to read from
- * the disk. The callback buffer may change based
- * on whether encryption and compression are used.
+ * We need to manage the different buffers here. Buf is the buffer this function uses to
+ * read from the disk. The callback buffer may change based on whether encryption and
+ * compression are used.
*
- * We want to free any buffers from compression and
- * encryption but keep the one we use for reading.
+ * We want to free any buffers from compression and encryption but keep the one we use
+ * for reading.
*/
cbbuf = buf;
if (F_ISSET(logrec, WT_LOG_RECORD_ENCRYPTED)) {
@@ -2589,12 +2584,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ui
myslot.slot = NULL;
memset(&myslot, 0, sizeof(myslot));
/*
- * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a
- * header at the beginning for us to fill in.
+ * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a header at the beginning
+ * for us to fill in.
*
- * If using direct_io, the caller should pass us an aligned record.
- * But we need to make sure it is big enough and zero-filled so
- * that we can write the full amount. Do this whether or not
+ * If using direct_io, the caller should pass us an aligned record. But we need to make sure it
+ * is big enough and zero-filled so that we can write the full amount. Do this whether or not
* direct_io is in use because it makes the reading code cleaner.
*/
WT_STAT_CONN_INCRV(session, log_bytes_payload, record->size);
@@ -2602,8 +2596,8 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ui
WT_ERR(__wt_buf_grow(session, record, rdup_len));
WT_ASSERT(session, record->data == record->mem);
/*
- * If the caller's record only partially fills the necessary
- * space, we need to zero-fill the remainder.
+ * If the caller's record only partially fills the necessary space, we need to zero-fill the
+ * remainder.
*
* The cast is safe, we've already checked to make sure it's in range.
*/
@@ -2611,28 +2605,23 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ui
if (fill_size != 0) {
memset((uint8_t *)record->mem + record->size, 0, fill_size);
/*
- * Set the last byte of the log record to a non-zero value,
- * that allows us, on the input side, to tell that a log
- * record was completely written; there couldn't have been
- * a partial write. That means that any checksum mismatch
- * in those conditions is a log corruption.
+ * Set the last byte of the log record to a non-zero value, that allows us, on the input
+ * side, to tell that a log record was completely written; there couldn't have been a
+ * partial write. That means that any checksum mismatch in those conditions is a log
+ * corruption.
*
- * Without this changed byte, when we see a zeroed last byte,
- * we must always treat a checksum error as a possible partial
- * write. Since partial writes can happen as a result of an
- * interrupted process (for example, a shutdown), we must
- * treat a checksum error as a normal occurrence, and merely
- * the place where the log must be truncated. So any real
+         * Without this changed byte, when we see a zeroed last byte, we must always treat a
+         * checksum error as a possible partial write. Since partial writes can happen as a result
+         * of an interrupted process (for example, a shutdown), we must treat a checksum error as a
+         * normal occurrence that merely marks the place where the log must be truncated. So any real
* corruption within log records is hard to detect as such.
*
- * However, we can only make this modification if there is
- * more than one byte being filled, as the first zero byte
- * past the actual record is needed to terminate the loop
- * in txn_commit_apply.
+ * However, we can only make this modification if there is more than one byte being filled,
+ * as the first zero byte past the actual record is needed to terminate the loop in
+ * txn_commit_apply.
*
- * This is not a log format change, as we only are changing a
- * byte in the padding portion of a record, and no logging code
- * has ever checked that it is any particular value up to now.
+ * This is not a log format change, as we only are changing a byte in the padding portion of
+ * a record, and no logging code has ever checked that it is any particular value up to now.
*/
if (fill_size > 1)
*((uint8_t *)record->mem + rdup_len - 1) = WT_DEBUG_BYTE;
@@ -2681,8 +2670,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ui
__wt_log_slot_free(session, myslot.slot);
} else if (force) {
/*
- * If we are going to wait for this slot to get written,
- * signal the wrlsn thread.
+ * If we are going to wait for this slot to get written, signal the wrlsn thread.
*
* XXX I've seen times when conditions are NULL.
*/
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index bc860952baf..42155e7df56 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -67,13 +67,12 @@ __wt_clsm_await_switch(WT_CURSOR_LSM *clsm)
session = (WT_SESSION_IMPL *)clsm->iface.session;
/*
- * If there is no primary chunk, or a chunk has overflowed the hard
- * limit, which either means a worker thread has fallen behind or there
- * has just been a user-level checkpoint, wait until the tree changes.
+ * If there is no primary chunk, or a chunk has overflowed the hard limit, which either means a
+ * worker thread has fallen behind or there has just been a user-level checkpoint, wait until
+ * the tree changes.
*
- * We used to switch chunks in the application thread here, but that is
- * problematic because there is a transaction in progress and it could
- * roll back, leaving the metadata inconsistent.
+ * We used to switch chunks in the application thread here, but that is problematic because
+ * there is a transaction in progress and it could roll back, leaving the metadata inconsistent.
*/
for (waited = 0; lsm_tree->nchunks == 0 || clsm->dsk_gen == lsm_tree->dsk_gen; ++waited) {
if (waited % WT_THOUSAND == 0)
@@ -112,15 +111,13 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
}
/*
- * In LSM there are multiple btrees active at one time. The tree
- * switch code needs to use btree API methods, and it wants to
- * operate on the btree for the primary chunk. Set that up now.
+ * In LSM there are multiple btrees active at one time. The tree switch code needs to use btree
+ * API methods, and it wants to operate on the btree for the primary chunk. Set that up now.
*
- * If the primary chunk has grown too large, set a flag so the worker
- * thread will switch when it gets a chance to avoid introducing high
- * latency into application threads. Don't do this indefinitely: if a
- * chunk grows twice as large as the configured size, block until it
- * can be switched.
+ * If the primary chunk has grown too large, set a flag so the worker thread will switch when it
+ * gets a chance to avoid introducing high latency into application threads. Don't do this
+ * indefinitely: if a chunk grows twice as large as the configured size, block until it can be
+ * switched.
*/
hard_limit = lsm_tree->need_switch;
@@ -200,19 +197,14 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
__wt_txn_cursor_op(session);
/*
- * Figure out how many updates are required for
- * snapshot isolation.
+ * Figure out how many updates are required for snapshot isolation.
*
- * This is not a normal visibility check on the maximum
- * transaction ID in each chunk: any transaction ID
- * that overlaps with our snapshot is a potential
- * conflict.
+ * This is not a normal visibility check on the maximum transaction ID in each chunk:
+ * any transaction ID that overlaps with our snapshot is a potential conflict.
*
- * Note that the pinned ID is correct here: it tracks
- * concurrent transactions excluding special
- * transactions such as checkpoint (which we can't
- * conflict with because checkpoint only writes the
- * metadata, which is not an LSM tree).
+ * Note that the pinned ID is correct here: it tracks concurrent transactions excluding
+ * special transactions such as checkpoint (which we can't conflict with because
+ * checkpoint only writes the metadata, which is not an LSM tree).
*/
clsm->nupdates = 1;
if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
@@ -557,9 +549,8 @@ retry:
/*
* Close any cursors we no longer need.
*
- * Drop the LSM tree lock while we do this: if the cache is
- * full, we may block while closing a cursor. Save the
- * generation number and retry if it has changed under us.
+ * Drop the LSM tree lock while we do this: if the cache is full, we may block while closing
+ * a cursor. Save the generation number and retry if it has changed under us.
*/
if (clsm->chunks != NULL && ngood < clsm->nchunks) {
close_range_start = ngood;
@@ -651,19 +642,16 @@ retry:
btree = ((WT_CURSOR_BTREE *)primary)->btree;
/*
- * If the primary is not yet set as the primary, do that now.
- * Note that eviction was configured off when the underlying
- * object was created, which is what we want, leave it alone.
+ * If the primary is not yet set as the primary, do that now. Note that eviction was
+ * configured off when the underlying object was created, which is what we want, leave it
+ * alone.
*
- * We don't have to worry about races here: every thread that
- * modifies the tree will have to come through here, at worse
- * we set the flag repeatedly. We don't use a WT_BTREE handle
- * flag, however, we could race doing the read-modify-write of
- * the flags field.
+         * We don't have to worry about races here: every thread that modifies the tree will have to
+         * come through here; at worst we set the flag repeatedly. We don't use a WT_BTREE handle
+         * flag, however, as we could race doing the read-modify-write of the flags field.
*
- * If something caused the chunk to be closed and reopened
- * since it was created, we can no longer use it as a primary
- * chunk and we need to force a switch. We detect the tree was
+ * If something caused the chunk to be closed and reopened since it was created, we can no
+ * longer use it as a primary chunk and we need to force a switch. We detect the tree was
* created when it was opened by checking the "original" flag.
*/
if (!btree->lsm_primary && btree->original)
@@ -837,12 +825,11 @@ __clsm_position_chunk(WT_CURSOR_LSM *clsm, WT_CURSOR *c, bool forward, int *cmpp
WT_RET(forward ? c->next(c) : c->prev(c));
/*
- * With higher isolation levels, where we have stable reads,
- * we're done: the cursor is now positioned as expected.
+ * With higher isolation levels, where we have stable reads, we're done: the cursor is now
+ * positioned as expected.
*
- * With read-uncommitted isolation, a new record could have
- * appeared in between the search and stepping forward / back.
- * In that case, keep going until we see a key in the expected
+ * With read-uncommitted isolation, a new record could have appeared in between the search
+ * and stepping forward / back. In that case, keep going until we see a key in the expected
* range.
*/
if (session->txn.isolation != WT_ISO_READ_UNCOMMITTED)
@@ -1270,14 +1257,13 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
/*
- * search_near is somewhat fiddly: we can't just use a nearby key from
- * the in-memory chunk because there could be a closer key on disk.
+ * search_near is somewhat fiddly: we can't just use a nearby key from the in-memory chunk
+ * because there could be a closer key on disk.
*
- * As we search down the chunks, we stop as soon as we find an exact
- * match. Otherwise, we maintain the smallest cursor larger than the
- * search key and the largest cursor smaller than the search key. At
- * the end, we prefer the larger cursor, but if no record is larger,
- * position on the last record in the tree.
+ * As we search down the chunks, we stop as soon as we find an exact match. Otherwise, we
+ * maintain the smallest cursor larger than the search key and the largest cursor smaller than
+ * the search key. At the end, we prefer the larger cursor, but if no record is larger, position
+ * on the last record in the tree.
*/
WT_FORALL_CURSORS(clsm, c, i)
{
@@ -1435,13 +1421,12 @@ __clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, const WT_ITEM *key, co
}
/*
- * Update the record count. It is in a shared structure, but it's only
- * approximate, so don't worry about protecting access.
+ * Update the record count. It is in a shared structure, but it's only approximate, so don't
+ * worry about protecting access.
*
- * Throttle if necessary. Every 100 update operations on each cursor,
- * check if throttling is required. Don't rely only on the shared
- * counter because it can race, and because for some workloads, there
- * may not be enough records per chunk to get effective throttling.
+ * Throttle if necessary. Every 100 update operations on each cursor, check if throttling is
+ * required. Don't rely only on the shared counter because it can race, and because for some
+ * workloads, there may not be enough records per chunk to get effective throttling.
*/
if ((++clsm->primary_chunk->count % 100 == 0 || ++clsm->update_count >= 100) &&
lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
index eaecb197b08..aa7a400d3c9 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -592,13 +592,11 @@ __wt_lsm_manager_push_entry(
}
/*
- * Don't allow any work units unless a tree is active, this avoids
- * races on shutdown between clearing out queues and pushing new
- * work units.
+ * Don't allow any work units unless a tree is active, this avoids races on shutdown between
+ * clearing out queues and pushing new work units.
*
- * Increment the queue reference before checking the flag since
- * on close, the flag is cleared and then the queue reference count
- * is checked.
+ * Increment the queue reference before checking the flag since on close, the flag is cleared
+ * and then the queue reference count is checked.
*/
(void)__wt_atomic_add32(&lsm_tree->queue_ref, 1);
if (!lsm_tree->active) {
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
index 58b44f9cf2a..7110a75cec0 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
@@ -199,18 +199,15 @@ __lsm_merge_span(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id, u_in
return (WT_NOTFOUND);
/*
- * Look for the most efficient merge we can do. We define efficiency
- * as collapsing as many levels as possible while processing the
- * smallest number of rows.
+ * Look for the most efficient merge we can do. We define efficiency as collapsing as many levels as
+ * possible while processing the smallest number of rows.
*
- * We make a distinction between "major" and "minor" merges. The
- * difference is whether the oldest chunk is involved: if it is, we can
- * discard tombstones, because there can be no older record to marked
- * deleted.
+ * We make a distinction between "major" and "minor" merges. The difference is whether the oldest
+ * chunk is involved: if it is, we can discard tombstones, because there can be no older record
+ * marked deleted.
*
- * Respect the configured limit on the number of chunks to merge: start
- * with the most recent set of chunks and work backwards until going
- * further becomes significantly less efficient.
+ * Respect the configured limit on the number of chunks to merge: start with the most recent set of
+ * chunks and work backwards until going further becomes significantly less efficient.
*/
retry_find:
oldest_gen = youngest_gen = lsm_tree->chunk[end_chunk]->generation;
@@ -539,12 +536,12 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
F_SET(chunk, WT_LSM_CHUNK_ONDISK);
/*
- * We have no current way of continuing if the metadata update fails,
- * so we will panic in that case. Put some effort into cleaning up
- * after ourselves here - so things have a chance of shutting down.
+ * We have no current way of continuing if the metadata update fails, so we will panic in that
+ * case. Put some effort into cleaning up after ourselves here - so things have a chance of
+ * shutting down.
*
- * Any errors that happened after the tree was locked are
- * fatal - we can't guarantee the state of the tree.
+ * Any errors that happened after the tree was locked are fatal - we can't guarantee the state
+ * of the tree.
*/
if ((ret = __wt_lsm_meta_write(session, lsm_tree, NULL)) != 0)
WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_meta.c b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
index c6f7a82968c..c39e4756bed 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_meta.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
@@ -290,9 +290,8 @@ __lsm_meta_read_v1(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *
/*
* Set up the config for each chunk.
*
- * Make the memory_page_max double the chunk size, so application
- * threads don't immediately try to force evict the chunk when the
- * worker thread clears the NO_EVICTION flag.
+ * Make the memory_page_max double the chunk size, so application threads don't immediately try
+ * to force evict the chunk when the worker thread clears the NO_EVICTION flag.
*/
file_cfg[1] = lsmconf;
WT_ERR(__wt_scr_alloc(session, 0, &buf));
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
index 9d34eca0589..c30d77d6c05 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -63,9 +63,8 @@ __curstat_lsm_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cs
/*
* Get the statistics for the chunk's underlying object.
*
- * XXX kludge: we may have an empty chunk where no checkpoint
- * was written. If so, try to open the ordinary handle on that
- * chunk instead.
+ * XXX kludge: we may have an empty chunk where no checkpoint was written. If so, try to
+ * open the ordinary handle on that chunk instead.
*/
WT_ERR(__wt_buf_fmt(session, uribuf, "statistics:%s", chunk->uri));
ret = __wt_curstat_open(session, uribuf->data, NULL,
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index 9b6933a61e2..40cf169566c 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -289,13 +289,12 @@ __wt_lsm_tree_setup_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LS
WT_RET(__wt_lsm_tree_chunk_name(session, lsm_tree, chunk->id, chunk->generation, &chunk->uri));
/*
- * If the underlying file exists, drop the chunk first - there may be
- * some content hanging over from an aborted merge or checkpoint.
+ * If the underlying file exists, drop the chunk first - there may be some content hanging over
+ * from an aborted merge or checkpoint.
*
- * Don't do this for the very first chunk: we are called during
- * WT_SESSION::create, and doing a drop inside there does interesting
- * things with handle locks and metadata tracking. It can never have
- * been the result of an interrupted merge, anyway.
+ * Don't do this for the very first chunk: we are called during WT_SESSION::create, and doing a
+ * drop inside there does interesting things with handle locks and metadata tracking. It can
+ * never have been the result of an interrupted merge, anyway.
*/
if (chunk->id > 1)
WT_RET(__lsm_tree_cleanup_old(session, chunk->uri));
@@ -660,8 +659,7 @@ __wt_lsm_tree_throttle(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool dec
/*
* Merge throttling, based on the number of on-disk, level 0 chunks.
*
- * Don't throttle if the tree has less than a single level's number
- * of chunks.
+ * Don't throttle if the tree has less than a single level's number of chunks.
*/
if (F_ISSET(lsm_tree, WT_LSM_TREE_MERGES)) {
if (lsm_tree->nchunks < lsm_tree->merge_max)
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index 8f815277e6b..3be7acf7379 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -101,11 +101,11 @@ __wt_lsm_get_chunk_to_flush(
}
/*
- * Don't be overly zealous about pushing old chunks from cache.
- * Attempting too many drops can interfere with checkpoints.
+ * Don't be overly zealous about pushing old chunks from cache. Attempting too many drops can
+ * interfere with checkpoints.
*
- * If retrying a discard push an additional work unit so there are
- * enough to trigger checkpoints.
+ * If retrying a discard push an additional work unit so there are enough to trigger
+ * checkpoints.
*/
if (evict_chunk != NULL && flush_chunk != NULL) {
chunk = (__wt_random(&session->rnd) & 1) ? evict_chunk : flush_chunk;
@@ -619,10 +619,9 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
const char *drop_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_drop), "remove_files=false", NULL};
/*
- * We need to grab the schema lock to drop the file, so first try to
- * make sure there is minimal work to freeing space in the cache. Only
- * bother trying to discard the checkpoint handle: the in-memory handle
- * should have been closed already.
+ * We need to grab the schema lock to drop the file, so first try to make sure there is minimal
+ * work to free space in the cache. Only bother trying to discard the checkpoint handle: the
+ * in-memory handle should have been closed already.
*
* This will fail with EBUSY if the file is still in use.
*/
@@ -663,14 +662,12 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
flush_metadata = false;
/*
- * Take a copy of the current state of the LSM tree and look for chunks
- * to drop. We do it this way to avoid holding the LSM tree lock while
- * doing I/O or waiting on the schema lock.
+ * Take a copy of the current state of the LSM tree and look for chunks to drop. We do it this
+ * way to avoid holding the LSM tree lock while doing I/O or waiting on the schema lock.
*
- * This is safe because only one thread will be in this function at a
- * time. Merges may complete concurrently, and the old_chunks array
- * may be extended, but we shuffle down the pointers each time we free
- * one to keep the non-NULL slots at the beginning of the array.
+ * This is safe because only one thread will be in this function at a time. Merges may complete
+ * concurrently, and the old_chunks array may be extended, but we shuffle down the pointers each
+ * time we free one to keep the non-NULL slots at the beginning of the array.
*/
WT_CLEAR(cookie);
WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, true));
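
A minimal sketch of the shuffle-down described above, assuming a plain array of chunk pointers
(names are hypothetical; the real code walks WT_LSM_CHUNK structures under the tree lock):

    #include <stdlib.h>
    #include <string.h>

    static void
    drop_old_chunk(void **old_chunks, unsigned int *nchunksp, unsigned int i)
    {
        free(old_chunks[i]);
        /* Shuffle the tail down so non-NULL slots stay at the front. */
        memmove(&old_chunks[i], &old_chunks[i + 1],
            (*nchunksp - i - 1) * sizeof(old_chunks[0]));
        old_chunks[--*nchunksp] = NULL;
    }
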
diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c
index e8f5ac33605..752ad8c09db 100644
--- a/src/third_party/wiredtiger/src/meta/meta_apply.c
+++ b/src/third_party/wiredtiger/src/meta/meta_apply.c
@@ -41,15 +41,13 @@ __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
continue;
/*
- * We need to pull the handle into the session handle cache
- * and make sure it's referenced to stop other internal code
- * dropping the handle (e.g in LSM when cleaning up obsolete
- * chunks). Holding the schema lock isn't enough.
+ * We need to pull the handle into the session handle cache and make sure it's referenced to
+ * stop other internal code dropping the handle (e.g. in LSM when cleaning up obsolete
+ * chunks). Holding the schema lock isn't enough.
*
- * Handles that are busy are skipped without the whole
- * operation failing. This deals among other cases with
- * checkpoint encountering handles that are locked (e.g., for
- * bulk loads or verify operations).
+ * Handles that are busy are skipped without the whole operation failing. This deals among
+ * other cases with checkpoint encountering handles that are locked (e.g., for bulk loads or
+ * verify operations).
*/
if ((t_ret = __wt_session_get_dhandle(session, uri, NULL, NULL, 0)) != 0) {
WT_TRET_BUSY_OK(t_ret);
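
The skip-busy pattern WT_TRET_BUSY_OK expresses can be sketched in isolation; next_uri,
get_handle and func below are hypothetical stand-ins for the metadata walk and handle
acquisition:

    #include <errno.h>

    static int
    for_each_handle(int (*next_uri)(const char **urip),
        int (*get_handle)(const char *uri), int (*func)(const char *uri))
    {
        const char *uri;
        int ret, t_ret;

        ret = 0;
        while (next_uri(&uri) == 0) {
            if ((t_ret = get_handle(uri)) != 0) {
                /* EBUSY means skip this handle, not fail the pass. */
                if (t_ret != EBUSY && ret == 0)
                    ret = t_ret;
                continue;
            }
            if ((t_ret = func(uri)) != 0 && ret == 0)
                ret = t_ret;
        }
        return (ret);
    }
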
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index 3b0749d9020..c8c4383f1a3 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -40,9 +40,8 @@ __wt_meta_checkpoint(
/*
* Retrieve the named checkpoint or the last checkpoint.
*
- * If we don't find a named checkpoint, we're done, they're read-only.
- * If we don't find a default checkpoint, it's creation, return "no
- * data" and let our caller handle it.
+ * If we don't find a named checkpoint, we're done, they're read-only. If we don't find a
+ * default checkpoint, the file is being created: return "no data" and let our caller handle it.
*/
if (checkpoint == NULL) {
if ((ret = __ckpt_last(session, config, ckpt)) == WT_NOTFOUND) {
@@ -358,13 +357,11 @@ __wt_meta_ckptlist_get(
if (update) {
/*
- * This isn't clean, but there's necessary cooperation between
- * the schema layer (that maintains the list of checkpoints),
- * the btree layer (that knows when the root page is written,
- * creating a new checkpoint), and the block manager (which
- * actually creates the checkpoint). All of that cooperation is
- * handled in the array of checkpoint structures referenced from
- * the WT_BTREE structure.
+ * This isn't clean, but there's necessary cooperation between the schema layer (that
+ * maintains the list of checkpoints), the btree layer (that knows when the root page is
+ * written, creating a new checkpoint), and the block manager (which actually creates the
+ * checkpoint). All of that cooperation is handled in the array of checkpoint structures
+ * referenced from the WT_BTREE structure.
*
* Allocate a slot for a new value, plus a slot to mark the end.
*/
@@ -498,21 +495,17 @@ __ckptlist_review_write_gen(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
uint64_t v;
/*
- * Every page written in a given wiredtiger_open() session needs to be
- * in a single "generation", it's how we know to ignore transactional
- * information found on pages written in previous generations. We make
- * this work by writing the maximum write generation we've ever seen
- * as the write-generation of the metadata file's checkpoint. When
- * wiredtiger_open() is called, we copy that write generation into the
- * connection's name space as the base write generation value. Then,
- * whenever we open a file, if the file's write generation is less than
- * the base value, we update the file's write generation so all writes
- * will appear after the base value, and we ignore transactions on pages
- * where the write generation is less than the base value.
+ * Every page written in a given wiredtiger_open() session needs to be in a single "generation",
+ * it's how we know to ignore transactional information found on pages written in previous
+ * generations. We make this work by writing the maximum write generation we've ever seen as the
+ * write-generation of the metadata file's checkpoint. When wiredtiger_open() is called, we copy
+ * that write generation into the connection's name space as the base write generation value.
+ * Then, whenever we open a file, if the file's write generation is less than the base value, we
+ * update the file's write generation so all writes will appear after the base value, and we
+ * ignore transactions on pages where the write generation is less than the base value.
*
- * At every checkpoint, if the file's checkpoint write generation is
- * larger than the connection's maximum write generation, update the
- * connection.
+ * At every checkpoint, if the file's checkpoint write generation is larger than the
+ * connection's maximum write generation, update the connection.
*/
do {
WT_ORDERED_READ(v, S2C(session)->max_write_gen);
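
The loop this line begins implements a lock-free "raise the maximum" update; a self-contained
sketch with C11 atomics standing in for WT_ORDERED_READ and WiredTiger's CAS macros (names
hypothetical):

    #include <stdatomic.h>
    #include <stdint.h>

    static void
    raise_max_write_gen(_Atomic uint64_t *maxp, uint64_t candidate)
    {
        uint64_t cur = atomic_load(maxp);

        /* Retry until the swap lands or another thread stored a larger value. */
        while (cur < candidate &&
            !atomic_compare_exchange_weak(maxp, &cur, candidate))
            ;
    }
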
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
index f92a64e7e3d..69e4ca2e056 100644
--- a/src/third_party/wiredtiger/src/meta/meta_table.c
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -74,12 +74,11 @@ __wt_metadata_cursor_open(WT_SESSION_IMPL *session, const char *config, WT_CURSO
btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree;
/*
- * Special settings for metadata: skew eviction so metadata almost
- * always stays in cache and make sure metadata is logged if possible.
+ * Special settings for metadata: skew eviction so metadata almost always stays in cache and make
+ * sure metadata is logged if possible.
*
- * Test before setting so updates can't race in subsequent opens (the
- * first update is safe because it's single-threaded from
- * wiredtiger_open).
+ * Test before setting so updates can't race in subsequent opens (the first update is safe because
+ * it's single-threaded from wiredtiger_open).
*/
#define WT_EVICT_META_SKEW 10000
if (btree->evict_priority == 0)
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
index e1289864c6c..80e409f380f 100644
--- a/src/third_party/wiredtiger/src/meta/meta_turtle.c
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -197,19 +197,16 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
WT_RET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET, false));
/*
- * If we found a corrupted turtle file, then delete it and create a new.
- * We could die after creating the turtle file and before creating the
- * metadata file, or worse, the metadata file might be in some random
- * state. Make sure that doesn't happen: if we don't find the turtle
- * file, first create the metadata file, load any hot backup, and then
- * create the turtle file. No matter what happens, if metadata file
- * creation doesn't fully complete, we won't have a turtle file and we
- * will repeat the process until we succeed.
+ * If we found a corrupted turtle file, then delete it and create a new one. We could die after
+ * creating the turtle file and before creating the metadata file, or worse, the metadata file
+ * might be in some random state. Make sure that doesn't happen: if we don't find the turtle
+ * file, first create the metadata file, load any hot backup, and then create the turtle file.
+ * No matter what happens, if metadata file creation doesn't fully complete, we won't have a
+ * turtle file and we will repeat the process until we succeed.
*
- * Incremental backups can occur only if recovery is run and it becomes
- * live. So, if there is a turtle file and an incremental backup file,
- * that is an error. Otherwise, if there's already a turtle file, we're
- * done.
+ * Incremental backups can occur only if recovery is run and it becomes live. So, if there is a
+ * turtle file and an incremental backup file, that is an error. Otherwise, if there's already a
+ * turtle file, we're done.
*/
WT_RET(__wt_fs_exist(session, WT_INCREMENTAL_BACKUP, &exist_incr));
WT_RET(__wt_fs_exist(session, WT_INCREMENTAL_SRC, &exist_isrc));
diff --git a/src/third_party/wiredtiger/src/os_common/os_alloc.c b/src/third_party/wiredtiger/src/os_common/os_alloc.c
index 7933e01dedb..7ad9cf4ddc8 100644
--- a/src/third_party/wiredtiger/src/os_common/os_alloc.c
+++ b/src/third_party/wiredtiger/src/os_common/os_alloc.c
@@ -217,11 +217,10 @@ __wt_realloc_aligned(
}
#endif
/*
- * If there is no posix_memalign function, or no alignment configured,
- * fall back to realloc.
+ * If there is no posix_memalign function, or no alignment configured, fall back to realloc.
*
- * Windows note: Visual C CRT memalign does not match POSIX behavior
- * and would also double each allocation so it is bad for memory use.
+ * Windows note: Visual C CRT memalign does not match POSIX behavior and would also double each
+ * allocation so it is bad for memory use.
*/
return (__realloc_func(session, bytes_allocated_ret, bytes_to_allocate, false, retp));
}
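
A sketch of the fallback the comment describes, assuming a HAVE_POSIX_MEMALIGN build-time probe
(a hypothetical macro; posix_memalign also requires the alignment be a power-of-two multiple of
sizeof(void *)):

    #include <stdlib.h>

    static int
    alloc_maybe_aligned(void **retp, size_t bytes, size_t align)
    {
    #ifdef HAVE_POSIX_MEMALIGN
        if (align != 0)
            return (posix_memalign(retp, align, bytes));
    #endif
        (void)align; /* no memalign support or no alignment configured */
        return ((*retp = malloc(bytes)) == NULL ? -1 : 0);
    }
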
diff --git a/src/third_party/wiredtiger/src/os_common/os_fhandle.c b/src/third_party/wiredtiger/src/os_common/os_fhandle.c
index bba63e2ae44..d3d12f76a11 100644
--- a/src/third_party/wiredtiger/src/os_common/os_fhandle.c
+++ b/src/third_party/wiredtiger/src/os_common/os_fhandle.c
@@ -235,8 +235,7 @@ __wt_open(WT_SESSION_IMPL *session, const char *name, WT_FS_OPEN_FILE_TYPE file_
fh->file_type = file_type;
/*
- * If this is a read-only connection, open all files read-only except
- * the lock file.
+ * If this is a read-only connection, open all files read-only except the lock file.
*
* The only file created in read-only mode is the lock file.
*/
@@ -331,8 +330,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
__wt_verbose(session, WT_VERB_FILEOPS, "%s: file-close", fh->name);
/*
- * If the reference count hasn't gone to 0, or if it's an in-memory
- * object, we're done.
+ * If the reference count hasn't gone to 0, or if it's an in-memory object, we're done.
*
* Assert the reference count is correct, but don't let it wrap.
*/
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dlopen.c b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
index 7ba37803a44..afdee29f4ed 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
@@ -65,9 +65,8 @@ __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh)
/*
* FreeBSD dies inside __cxa_finalize when closing handles.
*
- * For now, just skip the dlclose: this may leak some resources until
- * the process exits, but that is preferable to hard-to-debug crashes
- * during exit.
+ * For now, just skip the dlclose: this may leak some resources until the process exits, but that is
+ * preferable to hard-to-debug crashes during exit.
*/
#ifndef __FreeBSD__
if (dlclose(dlh->handle) != 0) {
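
The guard reads naturally as a tiny helper; a sketch (the helper name is hypothetical):

    #include <dlfcn.h>

    static int
    dl_close_guarded(void *handle)
    {
    #ifndef __FreeBSD__
        if (dlclose(handle) != 0)
            return (-1); /* dlerror() has the details */
    #else
        (void)handle; /* intentionally leaked until process exit */
    #endif
        return (0);
    }
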
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
index 06b65b2c921..341f4f85537 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
@@ -103,17 +103,14 @@ __wt_posix_file_extend(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_o
/*
* The first file extension call: figure out what this system has.
*
- * This function is configured as a locking call, so we know we're
- * single-threaded through here. Set the nolock function first, then
- * publish the NULL replacement to ensure the handle functions are
- * always correct.
+ * This function is configured as a locking call, so we know we're single-threaded through here.
+ * Set the nolock function first, then publish the NULL replacement to ensure the handle
+ * functions are always correct.
*
- * We've seen Linux systems where posix_fallocate has corrupted existing
- * file data (even though that is explicitly disallowed by POSIX).
- * FreeBSD and Solaris support posix_fallocate, and so far we've seen
- * no problems leaving it unlocked. Check for fallocate (and the system
- * call version of fallocate) first to avoid locking on Linux if at all
- * possible.
+ * We've seen Linux systems where posix_fallocate has corrupted existing file data (even though
+ * that is explicitly disallowed by POSIX). FreeBSD and Solaris support posix_fallocate, and so
+ * far we've seen no problems leaving it unlocked. Check for fallocate (and the system call
+ * version of fallocate) first to avoid locking on Linux if at all possible.
*/
if (__posix_std_fallocate(file_handle, wt_session, offset) == 0) {
file_handle->fh_extend_nolock = __posix_std_fallocate;
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c
index dfa075d1249..0e0794d6cfa 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c
@@ -372,13 +372,11 @@ __posix_file_lock(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, bool lock
pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
/*
- * WiredTiger requires this function be able to acquire locks past
- * the end of file.
+ * WiredTiger requires this function be able to acquire locks past the end of file.
*
- * Note we're using fcntl(2) locking: all fcntl locks associated with a
- * file for a given process are removed when any file descriptor for the
- * file is closed by the process, even if a lock was never requested for
- * that file descriptor.
+ * Note we're using fcntl(2) locking: all fcntl locks associated with a file for a given process
+ * are removed when any file descriptor for the file is closed by the process, even if a lock
+ * was never requested for that file descriptor.
*/
fl.l_start = 0;
fl.l_len = 1;
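
For reference, a self-contained sketch of a one-byte fcntl(2) lock at the start of the file,
mirroring the flock setup here (the helper name is hypothetical):

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    static int
    byte_lock(int fd, int lock)
    {
        struct flock fl;

        memset(&fl, 0, sizeof(fl));
        fl.l_type = lock ? F_WRLCK : F_UNLCK;
        fl.l_whence = SEEK_SET; /* l_start = 0, l_len = 1: first byte */
        fl.l_len = 1;
        return (fcntl(fd, F_SETLK, &fl));
    }
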
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
index bd68c7afdbd..a5a0854fa20 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -79,17 +79,14 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs
locked = true;
/*
- * It's possible to race with threads waking us up. That's not a problem
- * if there are multiple wakeups because the next wakeup will get us, or
- * if we're only pausing for a short period. It's a problem if there's
- * only a single wakeup, our waker is likely waiting for us to exit.
- * After acquiring the mutex (so we're guaranteed to be awakened by any
- * future wakeup call), optionally check if we're OK to keep running.
- * This won't ensure our caller won't just loop and call us again, but
- * at least it's not our fault.
+ * It's possible to race with threads waking us up. That's not a problem if there are multiple
+ * wakeups because the next wakeup will get us, or if we're only pausing for a short period.
+ * It's a problem if there's only a single wakeup, our waker is likely waiting for us to exit.
+ * After acquiring the mutex (so we're guaranteed to be awakened by any future wakeup call),
+ * optionally check if we're OK to keep running. This won't ensure our caller won't just loop
+ * and call us again, but at least it's not our fault.
*
- * Assert we're not waiting longer than a second if not checking the
- * run status.
+ * Assert we're not waiting longer than a second if not checking the run status.
*/
WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION);
if (run_func != NULL && !run_func(session))
@@ -97,17 +94,14 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs
if (usecs > 0) {
/*
- * Get the current time as the basis for calculating when the
- * wait should end. Prefer a monotonic clock source to avoid
- * unexpectedly long sleeps when the system clock is adjusted.
+ * Get the current time as the basis for calculating when the wait should end. Prefer a monotonic
+ * clock source to avoid unexpectedly long sleeps when the system clock is adjusted.
*
- * Failing that, query the time directly and don't attempt to
- * correct for the clock moving backwards, which would result
- * in a sleep that is too long by however much the clock is
- * updated. This isn't as good as a monotonic clock source but
- * makes the window of vulnerability smaller (i.e., the
- * calculated time is only incorrect if the system clock
- * changes in between us querying it and waiting).
+ * Failing that, query the time directly and don't attempt to correct for the clock moving
+ * backwards, which would result in a sleep that is too long by however much the clock is updated.
+ * This isn't as good as a monotonic clock source but makes the window of vulnerability smaller
+ * (i.e., the calculated time is only incorrect if the system clock changes in between us querying
+ * it and waiting).
*/
#ifdef HAVE_PTHREAD_COND_MONOTONIC
WT_SYSCALL_RETRY(clock_gettime(CLOCK_MONOTONIC, &ts), ret);
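
A standalone sketch of computing the absolute wakeup time with a monotonic-clock preference;
HAVE_MONOTONIC stands in for the HAVE_PTHREAD_COND_MONOTONIC probe and the helper name is
hypothetical:

    #include <errno.h>
    #include <stdint.h>
    #include <time.h>

    static int
    abstime_from_now(struct timespec *ts, uint64_t usecs)
    {
    #ifdef HAVE_MONOTONIC
        if (clock_gettime(CLOCK_MONOTONIC, ts) != 0)
            return (errno);
    #else
        /* Fallback: the wait is only wrong if the clock moves between
         * this query and the wait itself. */
        if (clock_gettime(CLOCK_REALTIME, ts) != 0)
            return (errno);
    #endif
        ts->tv_sec += (time_t)(usecs / 1000000);
        ts->tv_nsec += (long)((usecs % 1000000) * 1000);
        if (ts->tv_nsec >= 1000000000) {
            ++ts->tv_sec;
            ts->tv_nsec -= 1000000000;
        }
        return (0);
    }
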
diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c
index 9b4729994df..6009a532c8c 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_time.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_time.c
@@ -14,6 +14,7 @@
*/
void
__wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
WT_DECL_RET;
diff --git a/src/third_party/wiredtiger/src/os_win/os_fs.c b/src/third_party/wiredtiger/src/os_win/os_fs.c
index 2e67a0c8a61..c5015788613 100644
--- a/src/third_party/wiredtiger/src/os_win/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_win/os_fs.c
@@ -184,9 +184,9 @@ __win_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
/*
* Close the primary and secondary handles.
*
- * We don't open Windows system handles when opening directories for
- * flushing, as it's not necessary (or possible) to flush a directory
- * on Windows. Confirm the file handle is open before closing it.
+ * We don't open Windows system handles when opening directories for flushing, as it's not
+ * necessary (or possible) to flush a directory on Windows. Confirm the file handle is open
+ * before closing it.
*/
if (win_fh->filehandle != INVALID_HANDLE_VALUE && CloseHandle(win_fh->filehandle) == 0) {
windows_error = __wt_getlasterror();
@@ -486,12 +486,10 @@ __win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char
desired_access |= GENERIC_WRITE;
/*
- * Security:
- * The application may spawn a new process, and we don't want another
- * process to have access to our file handles.
+ * Security: The application may spawn a new process, and we don't want another process to have
+ * access to our file handles.
*
- * TODO: Set tighter file permissions but set bInheritHandle to false
- * to prevent inheritance
+ * TODO: Set tighter file permissions but set bInheritHandle to false to prevent inheritance
*/
f = FILE_ATTRIBUTE_NORMAL;
diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
index 75b0fe75478..af67fd6a264 100644
--- a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
+++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
@@ -59,17 +59,14 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs
locked = true;
/*
- * It's possible to race with threads waking us up. That's not a problem
- * if there are multiple wakeups because the next wakeup will get us, or
- * if we're only pausing for a short period. It's a problem if there's
- * only a single wakeup, our waker is likely waiting for us to exit.
- * After acquiring the mutex (so we're guaranteed to be awakened by any
- * future wakeup call), optionally check if we're OK to keep running.
- * This won't ensure our caller won't just loop and call us again, but
- * at least it's not our fault.
+ * It's possible to race with threads waking us up. That's not a problem if there are multiple
+ * wakeups because the next wakeup will get us, or if we're only pausing for a short period.
+ * It's a problem if there's only a single wakeup, our waker is likely waiting for us to exit.
+ * After acquiring the mutex (so we're guaranteed to be awakened by any future wakeup call),
+ * optionally check if we're OK to keep running. This won't ensure our caller won't just loop
+ * and call us again, but at least it's not our fault.
*
- * Assert we're not waiting longer than a second if not checking the
- * run status.
+ * Assert we're not waiting longer than a second if not checking the run status.
*/
WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION);
diff --git a/src/third_party/wiredtiger/src/os_win/os_setvbuf.c b/src/third_party/wiredtiger/src/os_win/os_setvbuf.c
index 8b26c379e0a..9b027f60100 100644
--- a/src/third_party/wiredtiger/src/os_win/os_setvbuf.c
+++ b/src/third_party/wiredtiger/src/os_win/os_setvbuf.c
@@ -16,13 +16,12 @@ void
__wt_stream_set_line_buffer(FILE *fp)
{
/*
- * This function exists because MSVC doesn't support buffer sizes of 0
- * to the setvbuf call. To avoid re-introducing the bug, we have helper
- * functions and disallow calling setvbuf directly in WiredTiger code.
+ * This function exists because MSVC doesn't support a buffer size of 0 in the setvbuf call. To
+ * avoid re-introducing the bug, we have helper functions and disallow calling setvbuf directly
+ * in WiredTiger code.
*
- * Additionally, MSVC doesn't support line buffering, the result is the
- * same as full-buffering. We assume our caller wants immediate output,
- * set no-buffering instead.
+ * Additionally, MSVC doesn't support line buffering, the result is the same as full-buffering.
+ * We assume our caller wants immediate output, set no-buffering instead.
*/
__wt_stream_set_no_buffer(fp);
}
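
The helper body is a single call; a self-contained sketch of the no-buffering variant:

    #include <stdio.h>

    static void
    stream_set_no_buffer(FILE *fp)
    {
        /* _IONBF: unbuffered, so output appears immediately. */
        (void)setvbuf(fp, NULL, _IONBF, 0);
    }
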
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c
index 99342d8ed94..390f183f651 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_child.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c
@@ -20,20 +20,18 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C
page_del = ref->page_del;
/*
- * Internal pages with child leaf pages in the WT_REF_DELETED state are
- * a special case during reconciliation. First, if the deletion was a
- * result of a session truncate call, the deletion may not be visible to
- * us. In that case, we proceed as with any change not visible during
- * reconciliation by ignoring the change for the purposes of writing the
- * internal page.
+ * Internal pages with child leaf pages in the WT_REF_DELETED state are a special case during
+ * reconciliation. First, if the deletion was a result of a session truncate call, the deletion
+ * may not be visible to us. In that case, we proceed as with any change not visible during
+ * reconciliation by ignoring the change for the purposes of writing the internal page.
*
- * In this case, there must be an associated page-deleted structure, and
- * it holds the transaction ID we care about.
+ * In this case, there must be an associated page-deleted structure, and it holds the
+ * transaction ID we care about.
*
* In some cases, there had better not be any updates we can't see.
*
- * A visible update to be in READY state (i.e. not in LOCKED or
- * PREPARED state), for truly visible to others.
+ * A visible update must be in the READY state (i.e., not in LOCKED or PREPARED state) to be
+ * truly visible to others.
*/
if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL &&
__wt_page_del_active(session, ref, false))
@@ -42,26 +40,22 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C
/*
* Deal with any underlying disk blocks.
*
- * First, check to see if there is an address associated with this leaf:
- * if there isn't, we're done, the underlying page is already gone. If
- * the page still exists, check for any transactions in the system that
- * might want to see the page's state before it's deleted.
+ * First, check to see if there is an address associated with this leaf: if there isn't, we're
+ * done, the underlying page is already gone. If the page still exists, check for any
+ * transactions in the system that might want to see the page's state before it's deleted.
*
- * If any such transactions exist, we cannot discard the underlying leaf
- * page to the block manager because the transaction may eventually read
- * it. However, this write might be part of a checkpoint, and should we
- * recover to that checkpoint, we'll need to delete the leaf page, else
- * we'd leak it. The solution is to write a proxy cell on the internal
- * page ensuring the leaf page is eventually discarded.
+ * If any such transactions exist, we cannot discard the underlying leaf page to the block
+ * manager because the transaction may eventually read it. However, this write might be part of
+ * a checkpoint, and should we recover to that checkpoint, we'll need to delete the leaf page,
+ * else we'd leak it. The solution is to write a proxy cell on the internal page ensuring the
+ * leaf page is eventually discarded.
*
- * If no such transactions exist, we can discard the leaf page to the
- * block manager and no cell needs to be written at all. We do this
- * outside of the underlying tracking routines because this action is
- * permanent and irrevocable. (Clearing the address means we've lost
- * track of the disk address in a permanent way. This is safe because
- * there's no path to reading the leaf page again: if there's ever a
- * read into this part of the name space again, the cache read function
- * instantiates an entirely new page.)
+ * If no such transactions exist, we can discard the leaf page to the block manager and no cell
+ * needs to be written at all. We do this outside of the underlying tracking routines because
+ * this action is permanent and irrevocable. (Clearing the address means we've lost track of the
+ * disk address in a permanent way. This is safe because there's no path to reading the leaf
+ * page again: if there's ever a read into this part of the name space again, the cache read
+ * function instantiates an entirely new page.)
*/
if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) {
/*
@@ -94,20 +88,16 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C
if (F_ISSET(r, WT_REC_EVICT))
return (__wt_set_return(session, EBUSY));
- /*
- * If there are deleted child pages we can't discard immediately, keep the page dirty so they
- * are eventually freed.
- */
+ /* The page cannot be marked clean: keep it dirty so deleted children are eventually freed. */
r->leave_dirty = true;
/*
- * If the original page cannot be freed, we need to keep a slot on the
- * page to reference it from the parent page.
+ * If the original page cannot be freed, we need to keep a slot on the page to reference it from
+ * the parent page.
*
- * If the delete is not visible in this checkpoint, write the original
- * address normally. Otherwise, we have to write a proxy record.
- * If the delete state is not ready, then delete is not visible as it
- * is in prepared state.
+ * If the delete is not visible in this checkpoint, write the original address normally.
+ * Otherwise, we have to write a proxy record. If the delete state is not ready, the delete is
+ * not visible, as it is in the prepared state.
*/
if (!__wt_page_del_active(session, ref, false))
*statep = WT_CHILD_PROXY;
@@ -133,16 +123,14 @@ __wt_rec_child_modify(
*statep = WT_CHILD_ORIGINAL;
/*
- * This function is called when walking an internal page to decide how
- * to handle child pages referenced by the internal page.
+ * This function is called when walking an internal page to decide how to handle child pages
+ * referenced by the internal page.
*
- * Internal pages are reconciled for two reasons: first, when evicting
- * an internal page, second by the checkpoint code when writing internal
- * pages. During eviction, all pages should be in the WT_REF_DISK or
- * WT_REF_DELETED state. During checkpoint, eviction that might affect
- * review of an internal page is prohibited, however, as the subtree is
- * not reserved for our exclusive use, there are other page states that
- * must be considered.
+ * Internal pages are reconciled for two reasons: first, when evicting an internal page, second
+ * by the checkpoint code when writing internal pages. During eviction, all pages should be in
+ * the WT_REF_DISK or WT_REF_DELETED state. During checkpoint, eviction that might affect review
+ * of an internal page is prohibited, however, as the subtree is not reserved for our exclusive
+ * use, there are other page states that must be considered.
*/
for (;; __wt_yield()) {
switch (r->tested_ref_state = ref->state) {
@@ -154,11 +142,9 @@ __wt_rec_child_modify(
/*
* The child is in a deleted state.
*
- * It's possible the state could change underneath us as
- * the page is read in, and we can race between checking
- * for a deleted state and looking at the transaction ID
- * to see if the delete is visible to us. Lock down the
- * structure.
+ * It's possible the state could change underneath us as the page is read in, and we can
+ * race between checking for a deleted state and looking at the transaction ID to see if
+ * the delete is visible to us. Lock down the structure.
*/
if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED))
break;
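
The WT_REF_CAS_STATE lock-down can be sketched with a C11 compare-and-swap; the enum and helper
below are hypothetical simplifications of the WT_REF state machine:

    #include <stdatomic.h>
    #include <stdbool.h>

    enum ref_state { REF_DISK, REF_DELETED, REF_LOCKED, REF_MEM };

    static bool
    lock_deleted_ref(_Atomic int *statep)
    {
        int expected = REF_DELETED;

        /* Succeeds only if the state is still DELETED, closing the race
         * between checking the state and reading the deletion's txn ID. */
        return (atomic_compare_exchange_strong(statep, &expected, REF_LOCKED));
    }
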
@@ -170,9 +156,8 @@ __wt_rec_child_modify(
/*
* Locked.
*
- * We should never be here during eviction, active child
- * pages in an evicted page's subtree fails the eviction
- * attempt.
+ * We should never be here during eviction: active child pages in an evicted page's
+ * subtree fail the eviction attempt.
*/
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
if (F_ISSET(r, WT_REC_EVICT))
@@ -194,9 +179,8 @@ __wt_rec_child_modify(
/*
* On disk or in cache with lookaside updates.
*
- * We should never be here during eviction: active
- * child pages in an evicted page's subtree fails the
- * eviction attempt.
+ * We should never be here during eviction: active child pages in an evicted page's
+ * subtree fail the eviction attempt.
*/
if (F_ISSET(r, WT_REC_EVICT) && __wt_page_las_active(session, ref)) {
WT_ASSERT(session, false);
@@ -217,25 +201,23 @@ __wt_rec_child_modify(
/*
* In memory.
*
- * We should never be here during eviction, active child
- * pages in an evicted page's subtree fails the eviction
- * attempt.
+ * We should never be here during eviction: active child pages in an evicted page's
+ * subtree fail the eviction attempt.
*/
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
if (F_ISSET(r, WT_REC_EVICT))
return (__wt_set_return(session, EBUSY));
/*
- * If called during checkpoint, acquire a hazard pointer
- * so the child isn't evicted, it's an in-memory case.
+ * If called during checkpoint, acquire a hazard pointer so the child isn't evicted,
+ * it's an in-memory case.
*
- * This call cannot return split/restart, we have a lock
- * on the parent which prevents a child page split.
+ * This call cannot return split/restart, we have a lock on the parent which prevents a
+ * child page split.
*
- * Set WT_READ_NO_WAIT because we're only interested in
- * the WT_REF's final state. Pages in transition might
- * change WT_REF state during our read, and then return
- * WT_NOTFOUND to us. In that case, loop and look again.
+ * Set WT_READ_NO_WAIT because we're only interested in the WT_REF's final state. Pages
+ * in transition might change WT_REF state during our read, and then return WT_NOTFOUND
+ * to us. In that case, loop and look again.
*/
ret = __wt_page_in(
session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT);
@@ -251,9 +233,8 @@ __wt_rec_child_modify(
/*
* Being read, not modified by definition.
*
- * We should never be here during eviction, active child
- * pages in an evicted page's subtree fails the eviction
- * attempt.
+ * We should never be here during eviction: active child pages in an evicted page's
+ * subtree fail the eviction attempt.
*/
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
if (F_ISSET(r, WT_REC_EVICT))
@@ -264,14 +245,12 @@ __wt_rec_child_modify(
/*
* The page was split out from under us.
*
- * We should never be here during eviction, active child
- * pages in an evicted page's subtree fails the eviction
- * attempt.
+ * We should never be here during eviction: active child pages in an evicted page's
+ * subtree fail the eviction attempt.
*
- * We should never be here during checkpoint, dirty page
- * eviction is shutout during checkpoint, all splits in
- * process will have completed before we walk any pages
- * for checkpoint.
+ * We should never be here during checkpoint: dirty page eviction is shut out during
+ * checkpoint, and all splits in progress will have completed before we walk any pages
+ * for checkpoint.
*/
WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT);
return (__wt_set_return(session, EBUSY));
@@ -284,25 +263,20 @@ __wt_rec_child_modify(
in_memory:
/*
- * In-memory states: the child is potentially modified if the page's
- * modify structure has been instantiated. If the modify structure
- * exists and the page has actually been modified, set that state.
- * If that's not the case, we would normally use the original cell's
- * disk address as our reference, however there are two special cases,
- * both flagged by a missing block address.
+ * In-memory states: the child is potentially modified if the page's modify structure has been
+ * instantiated. If the modify structure exists and the page has actually been modified, set
+ * that state. If that's not the case, we would normally use the original cell's disk address as
+ * our reference, however there are two special cases, both flagged by a missing block address.
*
- * First, if forced to instantiate a deleted child page and it's never
- * modified, we end up here with a page that has a modify structure, no
- * modifications, and no disk address. Ignore those pages, they're not
- * modified and there is no reason to write the cell.
+ * First, if forced to instantiate a deleted child page and it's never modified, we end up here
+ * with a page that has a modify structure, no modifications, and no disk address. Ignore those
+ * pages, they're not modified and there is no reason to write the cell.
*
- * Second, insert splits are permitted during checkpoint. When doing the
- * final checkpoint pass, we first walk the internal page's page-index
- * and write out any dirty pages we find, then we write out the internal
- * page in post-order traversal. If we found the split page in the first
- * step, it will have an address; if we didn't find the split page in
- * the first step, it won't have an address and we ignore it, it's not
- * part of the checkpoint.
+ * Second, insert splits are permitted during checkpoint. When doing the final checkpoint pass,
+ * we first walk the internal page's page-index and write out any dirty pages we find, then we
+ * write out the internal page in post-order traversal. If we found the split page in the first
+ * step, it will have an address; if we didn't find the split page in the first step, it won't
+ * have an address and we ignore it, it's not part of the checkpoint.
*/
mod = ref->page->modify;
if (mod != NULL && mod->rec_result != 0)
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index d9a974cc68a..c4241f840ae 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -26,14 +26,12 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
if (cbulk->entry == cbulk->nrecs) {
if (cbulk->entry != 0) {
/*
- * If everything didn't fit, update the counters and
- * split.
+ * If everything didn't fit, update the counters and split.
*
* Boundary: split or write the page.
*
- * No need to have a minimum split size boundary, all
- * pages are filled 100% except the last, allowing it to
- * grow in the future.
+ * No need to have a minimum split size boundary, all pages are filled 100% except the
+ * last, allowing it to grow in the future.
*/
__wt_rec_incr(
session, r, cbulk->entry, __bitstr_size((size_t)cbulk->entry * btree->bitcnt));
@@ -404,14 +402,12 @@ __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
}
/*
- * If everything didn't fit, update the counters and
- * split.
+ * If everything didn't fit, update the counters and split.
*
* Boundary: split or write the page.
*
- * No need to have a minimum split size boundary, all
- * pages are filled 100% except the last, allowing it to
- * grow in the future.
+ * No need to have a minimum split size boundary, all pages are filled 100% except the
+ * last, allowing it to grow in the future.
*/
__wt_rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt));
WT_RET(__wt_rec_split(session, r, 0));
@@ -708,21 +704,16 @@ __wt_rec_col_var(
goto record_loop;
/*
- * Overflow items are tricky: we don't know until we're
- * finished processing the set of values if we need the
- * overflow value or not. If we don't use the overflow
- * item at all, we have to discard it from the backing
- * file, otherwise we'll leak blocks on the checkpoint.
- * That's safe because if the backing overflow value is
- * still needed by any running transaction, we'll cache
- * a copy in the update list.
+ * Overflow items are tricky: we don't know until we're finished processing the set of
+ * values if we need the overflow value or not. If we don't use the overflow item at all, we
+ * have to discard it from the backing file, otherwise we'll leak blocks on the checkpoint.
+ * That's safe because if the backing overflow value is still needed by any running
+ * transaction, we'll cache a copy in the update list.
*
- * Regardless, we avoid copying in overflow records: if
- * there's a WT_INSERT entry that modifies a reference
- * counted overflow record, we may have to write copies
- * of the overflow record, and in that case we'll do the
- * comparisons, but we don't read overflow items just to
- * see if they match records on either side.
+ * Regardless, we avoid copying in overflow records: if there's a WT_INSERT entry that
+ * modifies a reference counted overflow record, we may have to write copies of the overflow
+ * record, and in that case we'll do the comparisons, but we don't read overflow items just
+ * to see if they match records on either side.
*/
if (vpack->ovfl) {
ovfl_state = OVFL_UNUSED;
@@ -799,15 +790,12 @@ __wt_rec_col_var(
}
} else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
/*
- * If doing an update save and restore, and the
- * underlying value is a removed overflow value,
- * we end up here.
+ * If doing an update save and restore, and the underlying value is a removed
+ * overflow value, we end up here.
*
- * If necessary, when the overflow value was
- * originally removed, reconciliation appended
- * a globally visible copy of the value to the
- * key's update list, meaning the on-page item
- * isn't accessed after page re-instantiation.
+ * If necessary, when the overflow value was originally removed, reconciliation
+ * appended a globally visible copy of the value to the key's update list, meaning
+ * the on-page item isn't accessed after page re-instantiation.
*
* Assert the case.
*/
@@ -844,8 +832,7 @@ __wt_rec_col_var(
/*
* An as-yet-unused overflow item.
*
- * We're going to copy the on-page cell,
- * write out any record we're tracking.
+ * We're going to copy the on-page cell, write out any record we're tracking.
*/
if (rle != 0) {
WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, durable_ts,
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
index 733f450070e..27de9d69e67 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_row.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -343,14 +343,12 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* For each entry in the in-memory page... */
WT_INTL_FOREACH_BEGIN (session, page, ref) {
/*
- * There are different paths if the key is an overflow item vs.
- * a straight-forward on-page value. If an overflow item, we
- * would have instantiated it, and we can use that fact to set
- * things up.
+ * There are different paths if the key is an overflow item vs. a straight-forward on-page
+ * value. If an overflow item, we would have instantiated it, and we can use that fact to
+ * set things up.
*
- * Note the cell reference and unpacked key cell are available
- * only in the case of an instantiated, off-page key, we don't
- * bother setting them if that's not possible.
+ * Note the cell reference and unpacked key cell are available only in the case of an
+ * instantiated, off-page key, we don't bother setting them if that's not possible.
*/
if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) {
cell = NULL;
@@ -372,11 +370,10 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/*
* Ignored child.
*
- * Overflow keys referencing pages we're not writing are
- * no longer useful, schedule them for discard. Don't
- * worry about instantiation, internal page keys are
- * always instantiated. Don't worry about reuse,
- * reusing this key in this reconciliation is unlikely.
+ * Overflow keys referencing pages we're not writing are no longer useful, schedule them
+ * for discard. Don't worry about instantiation, internal page keys are always
+ * instantiated. Don't worry about reuse, reusing this key in this reconciliation is
+ * unlikely.
*/
if (key_onpage_ovfl)
WT_ERR(__wt_ovfl_discard_add(session, page, kpack->cell));
@@ -758,13 +755,11 @@ __wt_rec_row_leaf(
dictionary = false;
if (upd == NULL) {
/*
- * When the page was read into memory, there may not
- * have been a value item.
+ * When the page was read into memory, there may not have been a value item.
*
- * If there was a value item, check if it's a dictionary
- * cell (a copy of another item on the page). If it's a
- * copy, we have to create a new value item as the old
- * item might have been discarded from the page.
+ * If there was a value item, check if it's a dictionary cell (a copy of another item on
+ * the page). If it's a copy, we have to create a new value item as the old item might
+ * have been discarded from the page.
*/
if (vpack->raw == WT_CELL_VALUE_COPY) {
/* If the item is Huffman encoded, decode it. */
@@ -782,36 +777,28 @@ __wt_rec_row_leaf(
dictionary = true;
} else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
/*
- * If doing an update save and restore, and the
- * underlying value is a removed overflow value,
- * we end up here.
+ * If doing an update save and restore, and the underlying value is a removed
+ * overflow value, we end up here.
*
- * If necessary, when the overflow value was
- * originally removed, reconciliation appended
- * a globally visible copy of the value to the
- * key's update list, meaning the on-page item
- * isn't accessed after page re-instantiation.
+ * If necessary, when the overflow value was originally removed, reconciliation
+ * appended a globally visible copy of the value to the key's update list, meaning
+ * the on-page item isn't accessed after page re-instantiation.
*
* Assert the case.
*/
WT_ASSERT(session, F_ISSET(r, WT_REC_UPDATE_RESTORE));
/*
- * If the key is also a removed overflow item,
- * don't write anything at all.
+ * If the key is also a removed overflow item, don't write anything at all.
*
- * We don't have to write anything because the
- * code re-instantiating the page gets the key
- * to match the saved list of updates from the
- * original page. By not putting the key on
- * the page, we'll move the key/value set from
- * a row-store leaf page slot to an insert list,
- * but that shouldn't matter.
+ * We don't have to write anything because the code re-instantiating the page gets
+ * the key to match the saved list of updates from the original page. By not putting
+ * the key on the page, we'll move the key/value set from a row-store leaf page slot
+ * to an insert list, but that shouldn't matter.
*
- * The reason we bother with the test is because
- * overflows are expensive to write. It's hard
- * to imagine a real workload where this test is
- * worth the effort, but it's a simple test.
+ * The reason we bother with the test is because overflows are expensive to write.
+ * It's hard to imagine a real workload where this test is worth the effort, but
+ * it's a simple test.
*/
if (kpack != NULL && kpack->raw == WT_CELL_KEY_OVFL_RM)
goto leaf_insert;
@@ -855,14 +842,11 @@ __wt_rec_row_leaf(
break;
case WT_UPDATE_TOMBSTONE:
/*
- * If this key/value pair was deleted, we're
- * done.
+ * If this key/value pair was deleted, we're done.
*
- * Overflow keys referencing discarded values
- * are no longer useful, discard the backing
- * blocks. Don't worry about reuse, reusing
- * keys from a row-store page reconciliation
- * seems unlikely enough to ignore.
+ * Overflow keys referencing discarded values are no longer useful, discard the
+ * backing blocks. Don't worry about reuse, reusing keys from a row-store page
+ * reconciliation seems unlikely enough to ignore.
*/
if (kpack != NULL && kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM) {
/*
@@ -892,8 +876,7 @@ __wt_rec_row_leaf(
/*
* Build key cell.
*
- * If the key is an overflow key that hasn't been removed, use
- * the original backing blocks.
+ * If the key is an overflow key that hasn't been removed, use the original backing blocks.
*/
key_onpage_ovfl = kpack != NULL && kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
if (key_onpage_ovfl) {
@@ -930,14 +913,11 @@ __wt_rec_row_leaf(
WT_ASSERT(session, tmpkey->size != 0);
/*
- * Grow the buffer as necessary, ensuring data
- * data has been copied into local buffer space,
- * then append the suffix to the prefix already
- * in the buffer.
+ * Grow the buffer as necessary, ensuring the data has been copied into local
+ * buffer space, then append the suffix to the prefix already in the buffer.
*
- * Don't grow the buffer unnecessarily or copy
- * data we don't need, truncate the item's data
- * length to the prefix bytes.
+ * Don't grow the buffer unnecessarily or copy data we don't need, truncate the
+ * item's data length to the prefix bytes.
*/
tmpkey->size = kpack->prefix;
WT_ERR(__wt_buf_grow(session, tmpkey, tmpkey->size + kpack->size));
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_track.c b/src/third_party/wiredtiger/src/reconcile/rec_track.c
index 0ecd3f6998b..ae7fd9b6d79 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_track.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_track.c
@@ -301,11 +301,9 @@ __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
head = page->modify->ovfl_track->ovfl_reuse;
/*
- * Discard any overflow records that aren't in-use, freeing underlying
- * blocks.
+ * Discard any overflow records that aren't in-use, freeing underlying blocks.
*
- * First, walk the overflow reuse lists (except for the lowest one),
- * fixing up skiplist links.
+ * First, walk the overflow reuse lists (except for the lowest one), fixing up skiplist links.
*/
for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
for (e = &head[i]; (reuse = *e) != NULL;) {
@@ -317,15 +315,13 @@ __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * Second, discard any overflow record without an in-use flag, clear
- * the flags for the next run.
+ * Second, discard any overflow record without an in-use flag, clear the flags for the next run.
*
- * As part of the pass through the lowest level, figure out how much
- * space we added/subtracted from the page, and update its footprint.
- * We don't get it exactly correct because we don't know the depth of
- * the skiplist here, but it's close enough, and figuring out the
- * memory footprint change in the reconciliation wrapup code means
- * fewer atomic updates and less code overall.
+ * As part of the pass through the lowest level, figure out how much space we added/subtracted
+ * from the page, and update its footprint. We don't get it exactly correct because we don't
+ * know the depth of the skiplist here, but it's close enough, and figuring out the memory
+ * footprint change in the reconciliation wrapup code means fewer atomic updates and less code
+ * overall.
*/
decr = 0;
for (e = &head[0]; (reuse = *e) != NULL;) {
@@ -368,11 +364,9 @@ __ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
head = page->modify->ovfl_track->ovfl_reuse;
/*
- * Discard any overflow records that were just added, freeing underlying
- * blocks.
+ * Discard any overflow records that were just added, freeing underlying blocks.
*
- * First, walk the overflow reuse lists (except for the lowest one),
- * fixing up skiplist links.
+ * First, walk the overflow reuse lists (except for the lowest one), fixing up skiplist links.
*/
for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
for (e = &head[i]; (reuse = *e) != NULL;) {
@@ -464,14 +458,12 @@ __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr
skipdepth = __wt_skip_choose_depth(session);
/*
- * Allocate the WT_OVFL_REUSE structure, next pointers for the skip
- * list, room for the address and value, then copy everything into
- * place.
+ * Allocate the WT_OVFL_REUSE structure, next pointers for the skip list, room for the address
+ * and value, then copy everything into place.
*
- * To minimize the WT_OVFL_REUSE structure size, the address offset
- * and size are single bytes: that's safe because the address follows
- * the structure (which can't be more than about 100B), and address
- * cookies are limited to 255B.
+ * To minimize the WT_OVFL_REUSE structure size, the address offset and size are single bytes:
+ * that's safe because the address follows the structure (which can't be more than about 100B),
+ * and address cookies are limited to 255B.
*/
size = sizeof(WT_OVFL_REUSE) + skipdepth * sizeof(WT_OVFL_REUSE *) + addr_size + value_size;
WT_RET(__wt_calloc(session, 1, size, &reuse));
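
A sketch of the single-allocation layout this comment describes (structure, skiplist next
pointers, address bytes, value bytes); names are hypothetical, and the one-byte offset only
works while the header plus pointers stay under 256B:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct reuse_like {
        uint8_t addr_offset; /* address follows the struct and pointers */
        uint8_t addr_size;   /* single byte: cookies are capped at 255B */
        /* skiplist next pointers are laid out immediately after */
    };

    static struct reuse_like *
    reuse_alloc(unsigned int skipdepth, const uint8_t *addr, uint8_t addr_size,
        const void *value, size_t value_size)
    {
        struct reuse_like *r;
        uint8_t *p;
        size_t size;

        size = sizeof(*r) + skipdepth * sizeof(void *) + addr_size + value_size;
        if ((r = calloc(1, size)) == NULL)
            return (NULL);
        p = (uint8_t *)r + sizeof(*r) + skipdepth * sizeof(void *);
        r->addr_offset = (uint8_t)(p - (uint8_t *)r);
        r->addr_size = addr_size;
        memcpy(p, addr, addr_size);
        memcpy(p + addr_size, value, value_size);
        return (r);
    }
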
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index 9f3150d362b..2150bf63559 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -9,6 +9,19 @@
#include "wt_internal.h"
/*
+ * __rec_update_durable --
+ * Return whether an update is suitable for writing to a disk image.
+ */
+static bool
+__rec_update_durable(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd)
+{
+ return (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ __wt_txn_upd_visible_all(session, upd) :
+ __wt_txn_upd_visible_type(session, upd) == WT_VISIBLE_TRUE &&
+ __wt_txn_visible(session, upd->txnid, upd->durable_ts));
+}
+
+/*
* __rec_update_save --
* Save a WT_UPDATE list for later restoration.
*/
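The new __rec_update_durable helper above folds two visibility modes into one predicate. A loose, self-contained sketch of the idea follows; the stub visibility checks and constants are invented for illustration and are much simpler than the real transaction-visibility rules.

#include <stdbool.h>
#include <stdint.h>

struct upd {
    uint64_t txnid;      /* Transaction that made the update. */
    uint64_t durable_ts; /* Timestamp at which it becomes durable. */
};

/* Toy visibility state: everything below these bounds is visible. */
static const uint64_t oldest_active_txn = 100; /* Visible-all mode. */
static const uint64_t snapshot_txn = 250;      /* Snapshot mode. */
static const uint64_t stable_ts = 40;

static bool
txn_visible_all(uint64_t txnid, uint64_t ts)
{
    return (txnid < oldest_active_txn && ts <= stable_ts);
}

static bool
txn_visible_snapshot(uint64_t txnid, uint64_t ts)
{
    return (txnid < snapshot_txn && ts <= stable_ts);
}

/*
 * Eviction (visible-all mode) may only write an update every running
 * transaction can see; other reconciliation needs the update to be
 * visible and durable for this snapshot.
 */
static bool
update_durable(bool visible_all_mode, const struct upd *u)
{
    return (visible_all_mode ? txn_visible_all(u->txnid, u->durable_ts) :
                               txn_visible_snapshot(u->txnid, u->durable_ts));
}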
@@ -55,13 +68,11 @@ __rec_append_orig_value(
}
/*
- * We need the original on-page value for some reader: get a copy and
- * append it to the end of the update list with a transaction ID that
- * guarantees its visibility.
+ * We need the original on-page value for some reader: get a copy and append it to the end of
+ * the update list with a transaction ID that guarantees its visibility.
*
- * If we don't have a value cell, it's an insert/append list key/value
- * pair which simply doesn't exist for some reader; place a deleted
- * record at the end of the update list.
+ * If we don't have a value cell, it's an insert/append list key/value pair which simply doesn't
+ * exist for some reader; place a deleted record at the end of the update list.
*/
append = NULL; /* -Wconditional-uninitialized */
size = 0; /* -Wconditional-uninitialized */
@@ -74,12 +85,11 @@ __rec_append_orig_value(
}
/*
- * If we're saving the original value for a birthmark, transfer over
- * the transaction ID and clear out the birthmark update.
+ * If we're saving the original value for a birthmark, transfer over the transaction ID and
+ * clear out the birthmark update.
*
- * Else, set the entry's transaction information to the lowest possible
- * value. Cleared memory matches the lowest possible transaction ID and
- * timestamp, do nothing.
+ * Else, set the entry's transaction information to the lowest possible value. Cleared memory
+ * matches the lowest possible transaction ID and timestamp, do nothing.
*/
if (upd->type == WT_UPDATE_BIRTHMARK) {
append->txnid = upd->txnid;
@@ -111,11 +121,11 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
WT_CELL_UNPACK *vpack, WT_UPDATE_SELECT *upd_select)
{
WT_PAGE *page;
- WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd;
- wt_timestamp_t timestamp, ts;
+ WT_UPDATE *first_txn_upd, *first_upd, *upd;
+ wt_timestamp_t max_ts;
size_t upd_memsize;
uint64_t max_txn, txnid;
- bool all_visible, list_prepared, list_uncommitted, skipped_birthmark;
+ bool all_stable, list_prepared, list_uncommitted, skipped_birthmark;
/*
* The "saved updates" return value is used independently of returning an update we can write,
@@ -125,8 +135,9 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
upd_select->upd_saved = false;
page = r->page;
- first_ts_upd = first_txn_upd = NULL;
+ first_txn_upd = NULL;
upd_memsize = 0;
+ max_ts = WT_TS_NONE;
max_txn = WT_TXN_NONE;
list_prepared = list_uncommitted = skipped_birthmark = false;
@@ -152,8 +163,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
*/
if (first_txn_upd == NULL)
first_txn_upd = upd;
-
- /* Track the largest transaction ID seen. */
if (WT_TXNID_LT(max_txn, txnid))
max_txn = txnid;
@@ -170,21 +179,23 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* prepared transaction IDs are globally visible, need to check the update state as well.
*/
if (F_ISSET(r, WT_REC_EVICT)) {
- if (upd->prepare_state == WT_PREPARE_LOCKED ||
- upd->prepare_state == WT_PREPARE_INPROGRESS) {
- list_prepared = true;
- continue;
- }
if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? WT_TXNID_LE(r->last_running, txnid) :
!__txn_visible_id(session, txnid)) {
r->update_uncommitted = list_uncommitted = true;
continue;
}
+ if (upd->prepare_state == WT_PREPARE_LOCKED ||
+ upd->prepare_state == WT_PREPARE_INPROGRESS) {
+ list_prepared = true;
+ if (upd->start_ts > max_ts)
+ max_ts = upd->start_ts;
+ continue;
+ }
}
/* Track the first update with non-zero timestamp. */
- if (first_ts_upd == NULL && upd->start_ts != WT_TS_NONE)
- first_ts_upd = upd;
+ if (upd->durable_ts > max_ts)
+ max_ts = upd->durable_ts;
/*
* Select the update to write to the disk image.
@@ -202,8 +213,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
if (upd_select->upd == NULL && r->las_skew_newest)
upd_select->upd = upd;
- if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? !__wt_txn_upd_visible_all(session, upd) :
- !__wt_txn_upd_durable(session, upd)) {
+ if (!__rec_update_durable(session, r, upd)) {
if (F_ISSET(r, WT_REC_EVICT))
++r->updates_unstable;
@@ -214,21 +224,28 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* discard an uncommitted update.
*/
if (F_ISSET(r, WT_REC_UPDATE_RESTORE) && upd_select->upd != NULL &&
- (list_prepared || list_uncommitted)) {
- r->leave_dirty = true;
+ (list_prepared || list_uncommitted))
return (__wt_set_return(session, EBUSY));
- }
if (upd->type == WT_UPDATE_BIRTHMARK)
skipped_birthmark = true;
+ /*
+ * Track the oldest update not on the page.
+ *
+ * This is used to decide whether reads can use the page image, hence using the start
+ * rather than the durable timestamp.
+ */
+ if (upd_select->upd == NULL && upd->start_ts < r->min_skipped_ts)
+ r->min_skipped_ts = upd->start_ts;
+
continue;
}
/*
* Lookaside without stable timestamp was taken care of above
- * (set to the first uncommitted transaction). Lookaside with
- * stable timestamp always takes the first stable update.
+ * (set to the first uncommitted transaction). All other
+ * reconciliation takes the first stable update.
*/
if (upd_select->upd == NULL)
upd_select->upd = upd;
@@ -262,6 +279,9 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
if (upd == first_txn_upd)
r->update_used = true;
+ if (upd != NULL && upd->durable_ts > r->max_ondisk_ts)
+ r->max_ondisk_ts = upd->durable_ts;
+
/*
* TIMESTAMP-FIXME The start timestamp is determined by the commit timestamp when the key is
* first inserted (or last updated). The end timestamp is set when a key/value pair becomes
@@ -308,8 +328,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
r->max_txn = max_txn;
/* Update the maximum timestamp. */
- if (first_ts_upd != NULL && r->max_timestamp < first_ts_upd->durable_ts)
- r->max_timestamp = first_ts_upd->durable_ts;
+ if (max_ts > r->max_ts)
+ r->max_ts = max_ts;
/*
* If the update we chose was a birthmark, or we are doing update-restore and we skipped a
@@ -327,19 +347,15 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
}
/*
- * Check if all updates on the page are visible. If not, it must stay
- * dirty unless we are saving updates to the lookaside table.
+ * Check if all updates on the page are visible; if not, it must stay dirty.
*
- * Updates can be out of transaction ID order (but not out of timestamp
- * order), so we track the maximum transaction ID and the newest update
- * with a timestamp (if any).
+ * Updates can be out of transaction ID order (but not out of timestamp order), so we track the
+ * maximum transaction ID and the newest update with a timestamp (if any).
*/
- timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->durable_ts;
- all_visible = upd == first_txn_upd && !list_prepared && !list_uncommitted &&
- (F_ISSET(r, WT_REC_VISIBLE_ALL) ? __wt_txn_visible_all(session, max_txn, timestamp) :
- __wt_txn_visible(session, max_txn, timestamp));
+ all_stable = upd == first_txn_upd && !list_prepared && !list_uncommitted &&
+ __wt_txn_visible_all(session, max_txn, max_ts);
- if (all_visible)
+ if (all_stable)
goto check_original_value;
r->leave_dirty = true;
@@ -347,9 +363,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
if (F_ISSET(r, WT_REC_VISIBILITY_ERR))
WT_PANIC_RET(session, EINVAL, "reconciliation error, update not visible");
- /*
- * If not trying to evict the page, we know what we'll write and we're done.
- */
+ /* If not trying to evict the page, we know what we'll write and we're done. */
if (!F_ISSET(r, WT_REC_EVICT))
goto check_original_value;
@@ -382,54 +396,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
WT_RET(__rec_update_save(session, r, ins, ripcip, upd_select->upd, upd_memsize));
upd_select->upd_saved = true;
- /*
- * Track the first off-page update when saving history in the lookaside table. When skewing
- * newest, we want the first (non-aborted) update after the one stored on the page. Otherwise,
- * we want the update before the on-page update.
- */
- if (F_ISSET(r, WT_REC_LOOKASIDE) && r->las_skew_newest) {
- if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid))
- r->unstable_txn = first_upd->txnid;
- if (first_ts_upd != NULL) {
- WT_ASSERT(session, first_ts_upd->prepare_state == WT_PREPARE_INPROGRESS ||
- first_ts_upd->start_ts <= first_ts_upd->durable_ts);
-
- if (r->unstable_timestamp < first_ts_upd->start_ts)
- r->unstable_timestamp = first_ts_upd->start_ts;
-
- if (r->unstable_durable_timestamp < first_ts_upd->durable_ts)
- r->unstable_durable_timestamp = first_ts_upd->durable_ts;
- }
- } else if (F_ISSET(r, WT_REC_LOOKASIDE)) {
- for (upd = first_upd; upd != upd_select->upd; upd = upd->next) {
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- if (upd->txnid != WT_TXN_NONE && WT_TXNID_LT(upd->txnid, r->unstable_txn))
- r->unstable_txn = upd->txnid;
-
- /*
- * The durable timestamp is always set by commit, and usually the same as the start
- * timestamp, which makes it OK to use the two independently and be confident both will
- * be set.
- */
- WT_ASSERT(session,
- upd->prepare_state == WT_PREPARE_INPROGRESS || upd->durable_ts >= upd->start_ts);
-
- if (r->unstable_timestamp > upd->start_ts)
- r->unstable_timestamp = upd->start_ts;
-
- /*
- * An in-progress prepared update will always have a zero durable timestamp. Checkpoints
- * can only skip reading lookaside history if all updates are in the future, including
- * the prepare, so including the prepare timestamp instead.
- */
- ts = upd->prepare_state == WT_PREPARE_INPROGRESS ? upd->start_ts : upd->durable_ts;
- if (r->unstable_durable_timestamp > ts)
- r->unstable_durable_timestamp = ts;
- }
- }
-
check_original_value:
/*
* Paranoia: check that we didn't choose an update that has since been rolled back.
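Taken together, the rec_visibility.c changes replace the first_ts_upd/unstable_* bookkeeping with two aggregates: the newest durable timestamp seen (max_ts) and the oldest start timestamp skipped over before an update was selected for the page (min_skipped_ts). A compressible sketch of that selection loop, with a hypothetical `stable` flag standing in for the real durability predicate:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define TS_NONE 0
#define TS_MAX UINT64_MAX

struct upd {
    uint64_t start_ts;   /* When the update became visible. */
    uint64_t durable_ts; /* When the update became durable. */
    bool stable;         /* Whether it may be written to the page. */
    struct upd *next;    /* Next-older update. */
};

struct track {
    uint64_t max_ts;         /* Newest durable timestamp seen. */
    uint64_t min_skipped_ts; /* Oldest start timestamp not on the page. */
    const struct upd *selected;
};

static void
select_update(const struct upd *newest, struct track *t)
{
    const struct upd *u;

    t->max_ts = TS_NONE;
    t->min_skipped_ts = TS_MAX;
    t->selected = NULL;

    for (u = newest; u != NULL; u = u->next) {
        if (u->durable_ts > t->max_ts)
            t->max_ts = u->durable_ts;
        if (t->selected == NULL && !u->stable) {
            /*
             * Readers may need this skipped version, so the page image
             * alone cannot serve reads before u->start_ts.
             */
            if (u->start_ts < t->min_skipped_ts)
                t->min_skipped_ts = u->start_ts;
            continue;
        }
        if (t->selected == NULL)
            t->selected = u; /* First stable update wins. */
    }
}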
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 6bd67f329e1..5746e20273b 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -315,13 +315,12 @@ __rec_write_check_complete(
WT_RET(tret);
/*
- * Check if this reconciliation attempt is making progress. If there's
- * any sign of progress, don't fall back to the lookaside table.
+ * Check if this reconciliation attempt is making progress. If there's any sign of progress,
+ * don't fall back to the lookaside table.
*
- * Check if the current reconciliation split, in which case we'll
- * likely get to write at least one of the blocks. If we've created a
- * page image for a page that previously didn't have one, or we had a
- * page image and it is now empty, that's also progress.
+ * Check if the current reconciliation split, in which case we'll likely get to write at least
+ * one of the blocks. If we've created a page image for a page that previously didn't have one,
+ * or we had a page image and it is now empty, that's also progress.
*/
if (r->multi_next > 1)
return (0);
@@ -337,11 +336,11 @@ __rec_write_check_complete(
return (0);
/*
- * Check if the current reconciliation applied some updates, in which
- * case evict/restore should gain us some space.
+ * Check if the current reconciliation applied some updates, in which case evict/restore should
+ * gain us some space.
*
- * Check if lookaside eviction is possible. If any of the updates we
- * saw were uncommitted, the lookaside table cannot be used.
+ * Check if lookaside eviction is possible. If any of the updates we saw were uncommitted, the
+ * lookaside table cannot be used.
*/
if (r->update_uncommitted || r->update_used)
return (0);
@@ -372,12 +371,10 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/*
* The page remains dirty.
*
- * Any checkpoint call cleared the tree's modified flag before
- * writing pages, so we must explicitly reset it. We insert a
- * barrier after the change for clarity (the requirement is the
- * flag be set before a subsequent checkpoint reads it, and
- * as the current checkpoint is waiting on this reconciliation
- * to complete, there's no risk of that happening).
+ * Any checkpoint call cleared the tree's modified flag before writing pages, so we must
+ * explicitly reset it. We insert a barrier after the change for clarity (the requirement is
+ * the flag be set before a subsequent checkpoint reads it, and as the current checkpoint is
+ * waiting on this reconciliation to complete, there's no risk of that happening).
*/
btree->modified = true;
WT_FULL_BARRIER();
@@ -404,7 +401,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* discard its history).
*/
mod->rec_max_txn = r->max_txn;
- mod->rec_max_timestamp = r->max_timestamp;
+ mod->rec_max_timestamp = r->max_ts;
/*
* Track the tree's maximum transaction ID (used to decide if it's safe to discard the
@@ -416,22 +413,20 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
if (!F_ISSET(r, WT_REC_EVICT)) {
if (WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
btree->rec_max_txn = r->max_txn;
- if (btree->rec_max_timestamp < r->max_timestamp)
- btree->rec_max_timestamp = r->max_timestamp;
+ if (btree->rec_max_timestamp < r->max_ts)
+ btree->rec_max_timestamp = r->max_ts;
}
/*
- * We set the page state to mark it as having been dirtied for
- * the first time prior to reconciliation. A failed atomic cas
- * indicates that an update has taken place during
+ * We set the page state to mark it as having been dirtied for the first time prior to
+ * reconciliation. A failed atomic cas indicates that an update has taken place during
* reconciliation.
*
- * The page only might be clean; if the page state is unchanged
- * since reconciliation started, it's clean.
+ * The page might only be clean; if the page state is unchanged since reconciliation
+ * started, it's clean.
*
- * If the page state changed, the page has been written since
- * reconciliation started and remains dirty (that can't happen
- * when evicting, the page is exclusively locked).
+ * If the page state changed, the page has been written since reconciliation started and
+ * remains dirty (that can't happen when evicting, the page is exclusively locked).
*/
if (__wt_atomic_cas32(&mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN))
__wt_cache_dirty_decr(session, page);
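The compare-and-swap race described in the comment above reduces to this small sketch (C11 atomics, hypothetical state names): stamp the page before reconciling, and only mark it clean if nothing moved the state in the meantime.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum { PAGE_CLEAN, PAGE_DIRTY_FIRST, PAGE_DIRTY };

/*
 * Reconciliation stamped the page DIRTY_FIRST before walking it. If the
 * state is unchanged, no update landed meanwhile and the page can be
 * marked clean; if the CAS fails, a concurrent update moved the state to
 * DIRTY and the page stays dirty.
 */
static bool
try_mark_clean(_Atomic uint32_t *page_state)
{
    uint32_t expected = PAGE_DIRTY_FIRST;

    return (atomic_compare_exchange_strong(page_state, &expected, PAGE_CLEAN));
}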
@@ -477,11 +472,11 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
session, WT_VERB_SPLIT, "root page split -> %" PRIu32 " pages", mod->mod_multi_entries);
/*
- * Create a new root page, initialize the array of child references,
- * mark it dirty, then write it.
+ * Create a new root page, initialize the array of child references, mark it dirty, then write
+ * it.
*
- * Don't count the eviction of this page as progress, checkpoint can
- * repeatedly create and discard these pages.
+ * Don't count the eviction of this page as progress, checkpoint can repeatedly create and
+ * discard these pages.
*/
WT_RET(__wt_page_alloc(session, page->type, mod->mod_multi_entries, false, &next));
F_SET_ATOMIC(next, WT_PAGE_EVICT_NO_PROGRESS);
@@ -576,12 +571,11 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT);
/*
- * Update the page state to indicate that all currently installed
- * updates will be included in this reconciliation if it would mark the
- * page clean.
+ * Update the page state to indicate that all currently installed updates will be included in
+ * this reconciliation if it would mark the page clean.
*
- * Add a write barrier to make it more likely that a thread adding an
- * update will see this state change.
+ * Add a write barrier to make it more likely that a thread adding an update will see this state
+ * change.
*/
page->modify->page_state = WT_PAGE_DIRTY_FIRST;
WT_FULL_BARRIER();
@@ -596,17 +590,14 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
WT_ORDERED_READ(r->last_running, txn_global->last_running);
/*
- * Decide whether to skew on-page values towards newer or older
- * versions. This is a heuristic attempting to minimize the number of
- * pages that need to be rewritten by future checkpoints.
+ * Decide whether to skew on-page values towards newer or older versions. This is a heuristic
+ * attempting to minimize the number of pages that need to be rewritten by future checkpoints.
*
- * We usually prefer to skew to newer versions, the logic being that by
- * the time the next checkpoint runs, it is likely that all the updates
- * we choose will be stable. However, if checkpointing with a
- * timestamp (indicated by a stable_timestamp being set), and there is
- * a checkpoint already running, or this page was read with lookaside
- * history, or the stable timestamp hasn't changed since last time this
- * page was successfully, skew oldest instead.
+ * We usually prefer to skew to newer versions, the logic being that by the time the next
+ * checkpoint runs, it is likely that all the updates we choose will be stable. However, if
+ * checkpointing with a timestamp (indicated by a stable_timestamp being set), and there is a
+ * checkpoint already running, or this page was read with lookaside history, or the stable
+ * timestamp hasn't changed since the last time this page was successfully reconciled, skew
+ * oldest instead.
*/
if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DEBUG_MODE) &&
__wt_random(&session->rnd) % 3 == 0)
@@ -651,22 +642,8 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
/* Track the page's min/maximum transaction */
r->max_txn = WT_TXN_NONE;
- r->max_timestamp = 0;
-
- /*
- * Track the first unstable transaction (when skewing newest this is the newest update,
- * otherwise the newest update not on the page). This is the boundary between the on-page
- * information and the history stored in the lookaside table.
- */
- if (r->las_skew_newest) {
- r->unstable_txn = WT_TXN_NONE;
- r->unstable_timestamp = WT_TS_NONE;
- r->unstable_durable_timestamp = WT_TS_NONE;
- } else {
- r->unstable_txn = WT_TXN_ABORTED;
- r->unstable_timestamp = WT_TS_MAX;
- r->unstable_durable_timestamp = WT_TS_MAX;
- }
+ r->max_ondisk_ts = r->max_ts = WT_TS_NONE;
+ r->min_skipped_ts = WT_TS_MAX;
/* Track if updates were used and/or uncommitted. */
r->updates_seen = r->updates_unstable = 0;
@@ -700,9 +677,8 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
r->evict_matching_checksum_failed = false;
/*
- * Dictionary compression only writes repeated values once. We grow
- * the dictionary as necessary, always using the largest size we've
- * seen.
+ * Dictionary compression only writes repeated values once. We grow the dictionary as necessary,
+ * always using the largest size we've seen.
*
* Reset the dictionary.
*
@@ -959,12 +935,10 @@ __rec_split_chunk_init(
*
* Don't touch the disk image item memory, that memory is reused.
*
- * Clear the disk page header to ensure all of it is initialized, even
- * the unused fields.
+ * Clear the disk page header to ensure all of it is initialized, even the unused fields.
*
- * In the case of fixed-length column-store, clear the entire buffer:
- * fixed-length column-store sets bits in bytes, where the bytes are
- * assumed to initially be 0.
+ * In the case of fixed-length column-store, clear the entire buffer: fixed-length column-store
+ * sets bits in bytes, where the bytes are assumed to initially be 0.
*/
WT_RET(__wt_buf_init(session, &chunk->image, memsize));
memset(chunk->image.mem, 0, r->page->type == WT_PAGE_COL_FIX ? memsize : WT_PAGE_HEADER_SIZE);
@@ -1002,39 +976,32 @@ __wt_rec_split_init(
r->page_size = (uint32_t)max;
/*
- * If we have to split, we want to choose a smaller page size for the
- * split pages, because otherwise we could end up splitting one large
- * packed page over and over. We don't want to pick the minimum size
- * either, because that penalizes an application that did a bulk load
- * and subsequently inserted a few items into packed pages. Currently
- * defaulted to 75%, but I have no empirical evidence that's "correct".
+ * If we have to split, we want to choose a smaller page size for the split pages, because
+ * otherwise we could end up splitting one large packed page over and over. We don't want to
+ * pick the minimum size either, because that penalizes an application that did a bulk load and
+ * subsequently inserted a few items into packed pages. Currently defaulted to 75%, but I have
+ * no empirical evidence that's "correct".
*
- * The maximum page size may be a multiple of the split page size (for
- * example, there's a maximum page size of 128KB, but because the table
- * is active and we don't want to split a lot, the split size is 20KB).
- * The maximum page size may NOT be an exact multiple of the split page
+ * The maximum page size may be a multiple of the split page size (for example, there's a
+ * maximum page size of 128KB, but because the table is active and we don't want to split a lot,
+ * the split size is 20KB). The maximum page size may NOT be an exact multiple of the split page
* size.
*
- * It's lots of work to build these pages and don't want to start over
- * when we reach the maximum page size (it's painful to restart after
- * creating overflow items and compacted data, for example, as those
- * items have already been written to disk). So, the loop calls the
- * helper functions when approaching a split boundary, and we save the
- * information at that point. We also save the boundary information at
- * the minimum split size. We maintain two chunks (each boundary
- * represents a chunk that gets written as a page) in the memory,
- * writing out the older one to the disk as a page when we need to make
- * space for a new chunk. On reaching the last chunk, if it turns out to
- * be smaller than the minimum split size, we go back into the
- * penultimate chunk and split at this minimum split size boundary. This
- * moves some data from the penultimate chunk to the last chunk, hence
- * increasing the size of the last page written without decreasing the
- * penultimate page size beyond the minimum split size.
+ * It's lots of work to build these pages and we don't want to start over when we reach the maximum
+ * page size (it's painful to restart after creating overflow items and compacted data, for
+ * example, as those items have already been written to disk). So, the loop calls the helper
+ * functions when approaching a split boundary, and we save the information at that point. We
+ * also save the boundary information at the minimum split size. We maintain two chunks (each
+ * boundary represents a chunk that gets written as a page) in memory, writing out the older
+ * one to the disk as a page when we need to make space for a new chunk. On reaching the last
+ * chunk, if it turns out to be smaller than the minimum split size, we go back into the
+ * penultimate chunk and split at this minimum split size boundary. This moves some data from
+ * the penultimate chunk to the last chunk, hence increasing the size of the last page written
+ * without decreasing the penultimate page size beyond the minimum split size.
*
- * Finally, all this doesn't matter for fixed-size column-store pages
- * and salvage. Fixed-size column store pages can split under (very)
- * rare circumstances, but they're allocated at a fixed page size, never
- * anything smaller. In salvage, as noted above, we can't split at all.
+ * Finally, all this doesn't matter for fixed-size column-store pages and salvage. Fixed-size
+ * column store pages can split under (very) rare circumstances, but they're allocated at a
+ * fixed page size, never anything smaller. In salvage, as noted above, we can't split at all.
*/
if (r->salvage != NULL) {
r->split_size = 0;
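The split-size arithmetic the comment walks through amounts to taking a percentage of the maximum page size and rounding it to the block allocation size. A hedged sketch, assuming a nearest-multiple rounding helper and an arbitrary fallback for degenerate inputs (the real code's edge handling may differ):

#include <stdint.h>

/* Round to the nearest multiple of the block allocation size. */
static uint32_t
align_nearest(uint64_t n, uint32_t allocsize)
{
    return ((uint32_t)(((n + allocsize / 2) / allocsize) * allocsize));
}

/*
 * Split at (say) 75% of the maximum page size: small enough that one
 * large packed page isn't re-split over and over, large enough not to
 * penalize bulk-loaded pages that take a few later inserts.
 */
static uint32_t
split_page_size(uint32_t maxpagesize, uint32_t allocsize, unsigned split_pct)
{
    uint64_t a = maxpagesize; /* Widen to avoid 32-bit overflow. */
    uint32_t split_size;

    split_size = align_nearest((a * split_pct) / 100, allocsize);
    if (split_size == 0 || split_size >= maxpagesize)
        split_size = maxpagesize / 2; /* Degenerate cases: fall back. */
    return (split_size);
}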
@@ -1108,14 +1075,12 @@ __rec_is_checkpoint(WT_SESSION_IMPL *session, WT_RECONCILE *r)
*
* This function exists as a place to hang this comment.
*
- * Any time we write the root page of the tree without splitting we are
- * creating a checkpoint (and have to tell the underlying block manager
- * so it creates and writes the additional information checkpoints
- * require). However, checkpoints are completely consistent, and so we
- * have to resolve information about the blocks we're expecting to free
- * as part of the checkpoint, before writing the checkpoint. In short,
- * we don't do checkpoint writes here; clear the boundary information as
- * a reminder and create the checkpoint during wrapup.
+ * Any time we write the root page of the tree without splitting we are creating a checkpoint
+ * (and have to tell the underlying block manager so it creates and writes the additional
+ * information checkpoints require). However, checkpoints are completely consistent, and so we
+ * have to resolve information about the blocks we're expecting to free as part of the
+ * checkpoint, before writing the checkpoint. In short, we don't do checkpoint writes here;
+ * clear the boundary information as a reminder and create the checkpoint during wrapup.
*/
return (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) && __wt_ref_is_root(r->ref));
}
@@ -1138,36 +1103,30 @@ __rec_split_row_promote(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key,
int cmp;
/*
- * For a column-store, the promoted key is the recno and we already have
- * a copy. For a row-store, it's the first key on the page, a variable-
- * length byte string, get a copy.
+ * For a column-store, the promoted key is the recno and we already have a copy. For a
+ * row-store, it's the first key on the page, a variable-length byte string, get a copy.
*
- * This function is called from the split code at each split boundary,
- * but that means we're not called before the first boundary, and we
- * will eventually have to get the first key explicitly when splitting
- * a page.
+ * This function is called from the split code at each split boundary, but that means we're not
+ * called before the first boundary, and we will eventually have to get the first key explicitly
+ * when splitting a page.
*
- * For the current slot, take the last key we built, after doing suffix
- * compression. The "last key we built" describes some process: before
- * calling the split code, we must place the last key on the page before
- * the boundary into the "last" key structure, and the first key on the
- * page after the boundary into the "current" key structure, we're going
- * to compare them for suffix compression.
+ * For the current slot, take the last key we built, after doing suffix compression. The "last
+ * key we built" describes some process: before calling the split code, we must place the last
+ * key on the page before the boundary into the "last" key structure, and the first key on the
+ * page after the boundary into the "current" key structure, we're going to compare them for
+ * suffix compression.
*
- * Suffix compression is a hack to shorten keys on internal pages. We
- * only need enough bytes in the promoted key to ensure searches go to
- * the correct page: the promoted key has to be larger than the last key
- * on the leaf page preceding it, but we don't need any more bytes than
- * that. In other words, we can discard any suffix bytes not required
- * to distinguish between the key being promoted and the last key on the
- * leaf page preceding it. This can only be done for the first level of
- * internal pages, you cannot repeat suffix truncation as you split up
- * the tree, it loses too much information.
+ * Suffix compression is a hack to shorten keys on internal pages. We only need enough bytes in
+ * the promoted key to ensure searches go to the correct page: the promoted key has to be larger
+ * than the last key on the leaf page preceding it, but we don't need any more bytes than that.
+ * In other words, we can discard any suffix bytes not required to distinguish between the key
+ * being promoted and the last key on the leaf page preceding it. This can only be done for the
+ * first level of internal pages, you cannot repeat suffix truncation as you split up the tree,
+ * it loses too much information.
*
- * Note #1: if the last key on the previous page was an overflow key,
- * we don't have the in-memory key against which to compare, and don't
- * try to do suffix compression. The code for that case turns suffix
- * compression off for the next key, we don't have to deal with it here.
+ * Note #1: if the last key on the previous page was an overflow key, we don't have the
+ * in-memory key against which to compare, and don't try to do suffix compression. The code for
+ * that case turns suffix compression off for the next key, we don't have to deal with it here.
*/
if (type != WT_PAGE_ROW_LEAF || !r->key_sfx_compress)
return (__wt_buf_set(session, key, r->cur->data, r->cur->size));
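Suffix compression as described above keeps only enough of the promoted key to sort after the last key on the preceding leaf page. A self-contained sketch of that byte-level truncation (hypothetical helper name, assuming the keys are distinct and correctly ordered):

#include <stddef.h>

/*
 * Return how many bytes of the new page's first key must be promoted so
 * the promoted key still sorts strictly after the last key on the
 * preceding leaf page: keep bytes through the first mismatch, discard
 * the rest of the suffix.
 */
static size_t
promoted_key_len(const unsigned char *last, size_t last_len,
    const unsigned char *cur, size_t cur_len)
{
    size_t i, min_len;

    min_len = last_len < cur_len ? last_len : cur_len;
    for (i = 0; i < min_len; ++i)
        if (last[i] != cur[i])
            return (i + 1); /* First differing byte is enough. */

    /* "cur" extends "last": one extra byte still distinguishes them. */
    return (min_len < cur_len ? min_len + 1 : cur_len);
}

As the comment warns, this only works for the first level of internal pages; repeating the truncation while splitting up the tree would discard too much information.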
@@ -1463,9 +1422,8 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r)
if (prev_ptr->min_offset != 0 && cur_ptr->image.size < r->min_split_size) {
/*
- * The last chunk, pointed to by the current image pointer, has
- * less than the minimum data. Let's move any data more than the
- * minimum from the previous image into the current.
+ * The last chunk, pointed to by the current image pointer, has less than the minimum data.
+ * Let's move any data more than the minimum from the previous image into the current.
*
* Grow the current buffer if it is not large enough.
*/
@@ -1518,13 +1476,11 @@ int
__wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
/*
- * We're done reconciling, write the final page. We may arrive here with
- * no entries to write if the page was entirely empty or if nothing on
- * the page was visible to us.
+ * We're done reconciling, write the final page. We may arrive here with no entries to write if
+ * the page was entirely empty or if nothing on the page was visible to us.
*
- * Pages with skipped or not-yet-globally visible updates aren't really
- * empty; otherwise, the page is truly empty and we will merge it into
- * its parent during the parent's reconciliation.
+ * Pages with skipped or not-yet-globally visible updates aren't really empty; otherwise, the
+ * page is truly empty and we will merge it into its parent during the parent's reconciliation.
*/
if (r->entries == 0 && r->supd_next == 0)
return (0);
@@ -1578,11 +1534,11 @@ __rec_split_write_supd(
int cmp;
/*
- * Check if we've saved updates that belong to this block, and move
- * any to the per-block structure.
+ * Check if we've saved updates that belong to this block, and move any to the per-block
+ * structure.
*
- * This code requires a key be filled in for the next block (or the
- * last block flag be set, if there's no next block).
+ * This code requires a key be filled in for the next block (or the last block flag be set, if
+ * there's no next block).
*
* The last block gets all remaining saved updates.
*/
@@ -1594,13 +1550,11 @@ __rec_split_write_supd(
}
/*
- * Get the saved update's key and compare it with the block's key range.
- * If the saved update list belongs with the block we're about to write,
- * move it to the per-block memory. Check only to the first update that
- * doesn't go with the block, they must be in sorted order.
+ * Get the saved update's key and compare it with the block's key range. If the saved update
+ * list belongs with the block we're about to write, move it to the per-block memory. Check only
+ * to the first update that doesn't go with the block, they must be in sorted order.
*
- * The other chunk will have the key for the next page, that's what we
- * compare against.
+ * The other chunk will have the key for the next page, that's what we compare against.
*/
next = chunk == r->cur_ptr ? r->prev_ptr : r->cur_ptr;
page = r->page;
@@ -1649,17 +1603,9 @@ __rec_split_write_supd(
done:
if (F_ISSET(r, WT_REC_LOOKASIDE)) {
/* Track the oldest lookaside timestamp seen so far. */
- multi->page_las.skew_newest = r->las_skew_newest;
multi->page_las.max_txn = r->max_txn;
- multi->page_las.unstable_txn = r->unstable_txn;
- WT_ASSERT(session, r->unstable_txn != WT_TXN_NONE);
- multi->page_las.max_timestamp = r->max_timestamp;
-
- WT_ASSERT(session, r->all_upd_prepare_in_prog == true ||
- r->unstable_durable_timestamp >= r->unstable_timestamp);
-
- multi->page_las.unstable_timestamp = r->unstable_timestamp;
- multi->page_las.unstable_durable_timestamp = r->unstable_durable_timestamp;
+ multi->page_las.max_ondisk_ts = r->max_ondisk_ts;
+ multi->page_las.min_skipped_ts = r->min_skipped_ts;
}
err:
@@ -1738,10 +1684,9 @@ __rec_split_write_reuse(
/*
* Calculating the checksum is the expensive part, try to avoid it.
*
- * Ignore the last block of any reconciliation. Pages are written in the
- * same block order every time, so the last block written for a page is
- * unlikely to match any previously written block or block written in
- * the future, (absent a point-update earlier in the page which didn't
+ * Ignore the last block of any reconciliation. Pages are written in the same block order every
+ * time, so the last block written for a page is unlikely to match any previously written block
+ * or block written in the future (absent a point-update earlier in the page which didn't
* change the size of the on-page object in any way).
*/
if (last_block)
@@ -1825,18 +1770,15 @@ __rec_compression_adjust(WT_SESSION_IMPL *session, uint32_t max, size_t compress
if (compressed_size > max) {
/*
- * The compressed size is GT the page maximum.
- * Check if the pre-compression size is larger than the maximum.
- * If 10% of the page size larger than the maximum, decrease it
- * by that amount. Else if it's not already at the page maximum,
- * set it there.
+ * The compressed size is GT the page maximum. Check if the pre-compression size is larger
+ * than the maximum. If it exceeds the maximum by more than 10% of the page size, decrease it
+ * by that amount. Else if it's not already at the page maximum, set it there.
*
- * Note we're using 10% of the maximum page size as our test for
- * when to adjust the pre-compression size as well as the amount
- * by which we adjust it. Not updating the value when it's close
- * to the page size keeps us from constantly updating a shared
- * memory location, and 10% of the page size is an OK step value
- * as well, so we use it in both cases.
+ * Note we're using 10% of the maximum page size as our test for when to adjust the
+ * pre-compression size as well as the amount by which we adjust it. Not updating the value
+ * when it's close to the page size keeps us from constantly updating a shared memory
+ * location, and 10% of the page size is an OK step value as well, so we use it in both
+ * cases.
*/
adjust = current - max;
if (adjust > ten_percent)
@@ -1849,12 +1791,10 @@ __rec_compression_adjust(WT_SESSION_IMPL *session, uint32_t max, size_t compress
/*
* The compressed size is LTE the page maximum.
*
- * Don't increase the pre-compressed size on the last block, the
- * last block might be tiny.
+ * Don't increase the pre-compressed size on the last block, the last block might be tiny.
*
- * If the compressed size is less than the page maximum by 10%,
- * increase the pre-compression size by 10% of the page, or up
- * to the maximum in-memory image size.
+ * If the compressed size is less than the page maximum by 10%, increase the pre-compression
+ * size by 10% of the page, or up to the maximum in-memory image size.
*
* Note we're using 10% of the maximum page size... see above.
*/
@@ -1940,13 +1880,12 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *chunk
__rec_split_write_header(session, r, chunk, multi, compressed_image->mem);
/*
- * If we are writing the whole page in our first/only attempt, it might
- * be a checkpoint (checkpoints are only a single page, by definition).
- * Checkpoints aren't written here, the wrapup functions do the write.
+ * If we are writing the whole page in our first/only attempt, it might be a checkpoint
+ * (checkpoints are only a single page, by definition). Checkpoints aren't written here, the
+ * wrapup functions do the write.
*
- * Track the buffer with the image. (This is bad layering, but we can't
- * write the image until the wrapup code, and we don't have a code path
- * from here to there.)
+ * Track the buffer with the image. (This is bad layering, but we can't write the image until
+ * the wrapup code, and we don't have a code path from here to there.)
*/
if (last_block && r->multi_next == 1 && __rec_is_checkpoint(session, r)) {
WT_ASSERT(session, r->supd_next == 0);
@@ -2266,8 +2205,8 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/*
* Discard the replacement leaf page's blocks.
*
- * The exception is root pages are never tracked or free'd, they
- * are checkpoints, and must be explicitly dropped.
+ * The exception is root pages, which are never tracked or free'd: they are
+ * checkpoints, and must be explicitly dropped.
*/
if (!__wt_ref_is_root(ref))
WT_RET(__wt_btree_block_free(session, mod->mod_replace.addr, mod->mod_replace.size));
@@ -2328,17 +2267,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
break;
case 1: /* 1-for-1 page swap */
/*
- * Because WiredTiger's pages grow without splitting, we're
- * replacing a single page with another single page most of
- * the time.
+ * Because WiredTiger's pages grow without splitting, we're replacing a single page with
+ * another single page most of the time.
*
- * If in-memory, or saving/restoring changes for this page and
- * there's only one block, there's nothing to write. Set up
- * a single block as if to split, then use that disk image to
- * rewrite the page in memory. This is separate from simple
- * replacements where eviction has decided to retain the page
- * in memory because the latter can't handle update lists and
- * splits can.
+ * If in-memory, or saving/restoring changes for this page and there's only one block,
+ * there's nothing to write. Set up a single block as if to split, then use that disk
+ * image to rewrite the page in memory. This is separate from simple replacements where
+ * eviction has decided to retain the page in memory because the latter can't handle
+ * update lists and splits can.
*/
if (F_ISSET(r, WT_REC_IN_MEMORY) ||
(F_ISSET(r, WT_REC_UPDATE_RESTORE) && r->multi->supd_entries != 0))
@@ -2417,12 +2353,10 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
multi->addr.reuse = 0;
/*
- * On error, discard blocks we've written, they're unreferenced by the
- * tree. This is not a question of correctness, we're avoiding block
- * leaks.
+ * On error, discard blocks we've written, they're unreferenced by the tree. This is not a
+ * question of correctness, we're avoiding block leaks.
*
- * Don't discard backing blocks marked for reuse, they remain part of
- * a previous reconciliation.
+ * Don't discard backing blocks marked for reuse, they remain part of a previous reconciliation.
*/
for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i)
if (multi->addr.addr != NULL) {
diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c
index bb4a61687eb..cd7609f7153 100644
--- a/src/third_party/wiredtiger/src/schema/schema_create.c
+++ b/src/third_party/wiredtiger/src/schema/schema_create.c
@@ -98,13 +98,12 @@ __create_file(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const c
}
/*
- * Open the file to check that it was setup correctly. We don't need to
- * pass the configuration, we just wrote the collapsed configuration
- * into the metadata file, and it's going to be read/used by underlying
- * functions.
+ * Open the file to check that it was set up correctly. We don't need to pass the configuration,
+ * we just wrote the collapsed configuration into the metadata file, and it's going to be
+ * read/used by underlying functions.
*
- * Keep the handle exclusive until it is released at the end of the
- * call, otherwise we could race with a drop.
+ * Keep the handle exclusive until it is released at the end of the call, otherwise we could
+ * race with a drop.
*/
WT_ERR(__wt_session_get_dhandle(session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
if (WT_META_TRACKING(session))
@@ -371,18 +370,15 @@ __create_index(WT_SESSION_IMPL *session, const char *name, bool exclusive, const
name);
/*
- * Note: it would be better to keep the table exclusive here, while
- * changing its indexes. We don't because some operation we perform
- * below reacquire the table handle (such as opening a cursor on the
- * table in order to fill the index). If we keep the handle exclusive
- * here, those operations wanting ordinary access will conflict,
- * leading to errors. At the same time, we don't want to allow
- * table cursors that have already been fully opened to remain open
- * across this call.
+ * Note: it would be better to keep the table exclusive here, while changing its indexes. We
+ * don't because some operations we perform below reacquire the table handle (such as opening a
+ * cursor on the table in order to fill the index). If we keep the handle exclusive here, those
+ * operations wanting ordinary access will conflict, leading to errors. At the same time, we
+ * don't want to allow table cursors that have already been fully opened to remain open across
+ * this call.
*
- * Temporarily getting the table exclusively serves the purpose
- * of ensuring that cursors on the table that are already open
- * must at least be closed before this call proceeds.
+ * Temporarily getting the table exclusively serves the purpose of ensuring that cursors on the
+ * table that are already open must at least be closed before this call proceeds.
*/
tlen = (size_t)(idxname++ - tablename);
if ((ret = __wt_schema_get_table(
diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c
index 4a3e616d9ab..527e8540d74 100644
--- a/src/third_party/wiredtiger/src/schema/schema_drop.c
+++ b/src/third_party/wiredtiger/src/schema/schema_drop.c
@@ -113,15 +113,13 @@ __drop_table(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
/*
* Open the table so we can drop its column groups and indexes.
*
- * Ideally we would keep the table locked exclusive across the drop,
- * but for now we rely on the global table lock to prevent the table
- * being reopened while it is being dropped. One issue is that the
- * WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock,
- * avoiding deadlocks while waiting for LSM operation to quiesce.
+ * Ideally we would keep the table locked exclusive across the drop, but for now we rely on the
+ * global table lock to prevent the table being reopened while it is being dropped. One issue is
+ * that the WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock, avoiding
+ * deadlocks while waiting for LSM operations to quiesce.
*
- * Temporarily getting the table exclusively serves the purpose
- * of ensuring that cursors on the table that are already open
- * must at least be closed before this call proceeds.
+ * Temporarily getting the table exclusively serves the purpose of ensuring that cursors on the
+ * table that are already open must at least be closed before this call proceeds.
*/
WT_ERR(__wt_schema_get_table_uri(session, uri, true, WT_DHANDLE_EXCLUSIVE, &table));
WT_ERR(__wt_schema_release_table(session, &table));
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
index 4c6a8b02c26..4e9d98d77fb 100644
--- a/src/third_party/wiredtiger/src/schema/schema_open.c
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -148,14 +148,12 @@ __open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx)
WT_ERR(__wt_strndup(session, cval.str, cval.len, &idx->key_format));
/*
- * The key format for an index is somewhat subtle: the application
- * specifies a set of columns that it will use for the key, but the
- * engine usually adds some hidden columns in order to derive the
- * primary key. These hidden columns are part of the file's key.
+ * The key format for an index is somewhat subtle: the application specifies a set of columns
+ * that it will use for the key, but the engine usually adds some hidden columns in order to
+ * derive the primary key. These hidden columns are part of the file's key.
*
- * The file's key_format is stored persistently, we need to calculate
- * the index cursor key format (which will usually omit some of those
- * keys).
+ * The file's key_format is stored persistently, we need to calculate the index cursor key
+ * format (which will usually omit some of those keys).
*/
WT_ERR(__wt_buf_init(session, buf, 0));
WT_ERR(__wt_config_getones(session, idx->config, "columns", &idx->colconf));
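To ground the "hidden columns" point, here is a small example using the public WiredTiger API (the table and column names are hypothetical): the index is declared on one application column, but the underlying index file's key also carries the primary key so entries stay unique, while an index cursor exposes only the declared column.

#include <wiredtiger.h>

static int
create_index_example(WT_SESSION *session)
{
    int ret;

    /* Application key is "id"; "name" and "dept" are value columns. */
    if ((ret = session->create(session, "table:staff",
             "key_format=S,value_format=SS,columns=(id,name,dept)")) != 0)
        return (ret);

    /*
     * The index is declared on "dept" only; the engine appends the
     * primary key ("id") to the index file's persistent key format,
     * and the index cursor's key format omits it again.
     */
    return (session->create(session, "index:staff:bydept", "columns=(dept)"));
}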
diff --git a/src/third_party/wiredtiger/src/schema/schema_project.c b/src/third_party/wiredtiger/src/schema/schema_project.c
index ebcbe45e8fc..aa441d67f5f 100644
--- a/src/third_party/wiredtiger/src/schema/schema_project.c
+++ b/src/third_party/wiredtiger/src/schema/schema_project.c
@@ -310,11 +310,9 @@ __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *
/*
* Read the item we're about to overwrite.
*
- * There is subtlety here: the value format
- * may not exactly match the cursor's format.
- * In particular, we need lengths with raw
- * columns in the middle of a packed struct,
- * but not if they are at the end of a struct.
+ * There is subtlety here: the value format may not exactly match the cursor's
+ * format. In particular, we need lengths with raw columns in the middle of a packed
+ * struct, but not if they are at the end of a struct.
*/
WT_RET(__pack_next(&pack, &pv));
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
index 304d7305504..a151b1640c8 100644
--- a/src/third_party/wiredtiger/src/schema/schema_rename.c
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -207,10 +207,9 @@ __rename_table(WT_SESSION_IMPL *session, const char *uri, const char *newuri, co
/*
* Open the table so we can rename its column groups and indexes.
*
- * Ideally we would keep the table locked exclusive across the rename,
- * but for now we rely on the global table lock to prevent the table
- * being reopened while it is being renamed. One issue is that the
- * WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock,
+ * Ideally we would keep the table locked exclusive across the rename, but for now we rely on
+ * the global table lock to prevent the table being reopened while it is being renamed. One
+ * issue is that the WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock,
 * avoiding deadlocks while waiting for LSM operations to quiesce.
*/
WT_RET(__wt_schema_get_table(session, oldname, strlen(oldname), false, 0, &table));
diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c
index f612129b2ce..33f8d5cc7d6 100644
--- a/src/third_party/wiredtiger/src/schema/schema_stat.c
+++ b/src/third_party/wiredtiger/src/schema/schema_stat.c
@@ -150,9 +150,8 @@ __wt_curstat_table_init(
/*
* Process the column groups.
*
- * Set the cursor to reference the data source statistics; we don't
- * initialize it, instead we copy (rather than aggregate), the first
- * column's statistics, which has the same effect.
+ * Set the cursor to reference the data source statistics; we don't initialize it, instead we
+ * copy (rather than aggregate) the first column's statistics, which has the same effect.
*/
stats = &cst->u.dsrc_stats;
for (i = 0; i < WT_COLGROUPS(table); i++) {
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 09148db3018..098acb87c60 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -325,13 +325,12 @@ __session_close(WT_SESSION *wt_session, const char *config)
WT_STAT_CONN_DECR(session, session_open);
/*
- * Sessions are re-used, clear the structure: the clear sets the active
- * field to 0, which will exclude the hazard array from review by the
- * eviction thread. Because some session fields are accessed by other
- * threads, the structure must be cleared carefully.
+ * Sessions are re-used, clear the structure: the clear sets the active field to 0, which will
+ * exclude the hazard array from review by the eviction thread. Because some session fields are
+ * accessed by other threads, the structure must be cleared carefully.
*
- * We don't need to publish here, because regardless of the active field
- * being non-zero, the hazard pointer is always valid.
+ * We don't need to publish here, because regardless of the active field being non-zero, the
+ * hazard pointer is always valid.
*/
__session_clear(session);
session = conn->default_session;
@@ -423,12 +422,11 @@ __session_open_cursor_int(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *
*cursorp = NULL;
/*
- * Open specific cursor types we know about, or call the generic data
- * source open function.
+ * Open specific cursor types we know about, or call the generic data source open function.
*
- * Unwind a set of string comparisons into a switch statement hoping
- * the compiler can make it fast, but list the common choices first
- * instead of sorting so if/else patterns are still fast.
+ * Unwind a set of string comparisons into a switch statement hoping the compiler can make it
+ * fast, but list the common choices first instead of sorting so if/else patterns are still
+ * fast.
*/
switch (uri[0]) {
/*
@@ -596,13 +594,12 @@ err:
WT_TRET(cursor->close(cursor));
}
/*
- * Opening a cursor on a non-existent data source will set ret to
- * either of ENOENT or WT_NOTFOUND at this point. However,
- * applications may reasonably do this inside a transaction to check
- * for the existence of a table or index.
+ * Opening a cursor on a non-existent data source will set ret to either of ENOENT or
+ * WT_NOTFOUND at this point. However, applications may reasonably do this inside a transaction
+ * to check for the existence of a table or index.
*
- * Failure in opening a cursor should not set an error on the
- * transaction and WT_NOTFOUND will be mapped to ENOENT.
+ * Failure in opening a cursor should not set an error on the transaction and WT_NOTFOUND will
+ * be mapped to ENOENT.
*/
API_END_RET_NO_TXN_ERROR(session, ret);
@@ -1350,15 +1347,14 @@ __wt_session_range_truncate(
WT_ERR(__wt_bad_object_type(session, stop->uri));
/*
- * If both cursors set, check they're correctly ordered with respect to
- * each other. We have to test this before any search, the search can
- * change the initial cursor position.
+ * If both cursors set, check they're correctly ordered with respect to each other. We have to
+ * test this before any search, the search can change the initial cursor position.
*
- * Rather happily, the compare routine will also confirm the cursors
- * reference the same object and the keys are set.
+ * Rather happily, the compare routine will also confirm the cursors reference the same object
+ * and the keys are set.
*
- * The test for a NULL start comparison function isn't necessary (we
- * checked it above), but it quiets clang static analysis complaints.
+ * The test for a NULL start comparison function isn't necessary (we checked it above), but it
+ * quiets clang static analysis complaints.
*/
if (start != NULL && stop != NULL && start->compare != NULL) {
WT_ERR(start->compare(start, stop, &cmp));
@@ -1391,13 +1387,12 @@ __wt_session_range_truncate(
}
/*
- * We always truncate in the forward direction because the underlying
- * data structures can move through pages faster forward than backward.
- * If we don't have a start cursor, create one and position it at the
- * first record.
+ * We always truncate in the forward direction because the underlying data structures can move
+ * through pages faster forward than backward. If we don't have a start cursor, create one and
+ * position it at the first record.
*
- * If start is NULL, stop must not be NULL, but static analyzers have
- * a hard time with that, test explicitly.
+ * If start is NULL, stop must not be NULL, but static analyzers have a hard time with that,
+ * test explicitly.
*/
if (start == NULL && stop != NULL) {
WT_ERR(__session_open_cursor((WT_SESSION *)session, stop->uri, NULL, NULL, &start));
@@ -1421,9 +1416,8 @@ err:
/*
* Close any locally-opened start cursor.
*
- * Reset application cursors, they've possibly moved and the
- * application cannot use them. Note that we can make it here with a
- * NULL start cursor (e.g., if the truncate range is empty).
+ * Reset application cursors, they've possibly moved and the application cannot use them. Note
+ * that we can make it here with a NULL start cursor (e.g., if the truncate range is empty).
*/
if (local_start)
WT_TRET(start->close(start));
@@ -1450,12 +1444,12 @@ __session_truncate(
WT_STAT_CONN_INCR(session, cursor_truncate);
/*
- * If the URI is specified, we don't need a start/stop, if start/stop
- * is specified, we don't need a URI. One exception is the log URI
- * which may truncate (archive) log files for a backup cursor.
+ * If the URI is specified, we don't need a start/stop; if start/stop is specified, we don't
+ * need a URI. One exception is the log URI which may truncate (archive) log files for a backup
+ * cursor.
*
- * If no URI is specified, and both cursors are specified, start/stop
- * must reference the same object.
+ * If no URI is specified, and both cursors are specified, start/stop must reference the same
+ * object.
*
* Any specified cursor must have been initialized.
*/
@@ -1956,17 +1950,14 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
WT_ERR(__wt_inmem_unsupported_op(session, NULL));
/*
- * Checkpoints require a snapshot to write a transactionally consistent
- * snapshot of the data.
+ * Checkpoints require a snapshot to write a transactionally consistent snapshot of the data.
*
- * We can't use an application's transaction: if it has uncommitted
- * changes, they will be written in the checkpoint and may appear after
- * a crash.
+ * We can't use an application's transaction: if it has uncommitted changes, they will be
+ * written in the checkpoint and may appear after a crash.
*
- * Use a real snapshot transaction: we don't want any chance of the
- * snapshot being updated during the checkpoint. Eviction is prevented
- * from evicting anything newer than this because we track the oldest
- * transaction ID in the system that is not visible to all readers.
+ * Use a real snapshot transaction: we don't want any chance of the snapshot being updated
+ * during the checkpoint. Eviction is prevented from evicting anything newer than this because
+ * we track the oldest transaction ID in the system that is not visible to all readers.
*/
WT_ERR(__wt_txn_context_check(session, false));
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
index 2be298c330e..f01962b2e78 100644
--- a/src/third_party/wiredtiger/src/session/session_compact.c
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -287,11 +287,9 @@ __compact_worker(WT_SESSION_IMPL *session)
}
/*
- * If compaction failed because checkpoint was running,
- * continue with the next handle. We might continue to
- * race with checkpoint on each handle, but that's OK,
- * we'll step through all the handles, and then we'll
- * block until a checkpoint completes.
+ * If compaction failed because checkpoint was running, continue with the next handle.
+ * We might continue to race with checkpoint on each handle, but that's OK, we'll step
+ * through all the handles, and then we'll block until a checkpoint completes.
*
* Just quit if eviction is the problem.
*/
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index 9a3fceeb48b..9bf35dca909 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -115,14 +115,12 @@ __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, bool *is_dea
want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE);
/*
- * If this session already has exclusive access to the handle, there is
- * no point trying to lock it again.
+ * If this session already has exclusive access to the handle, there is no point trying to lock
+ * it again.
*
- * This should only happen if a checkpoint handle is locked multiple
- * times during a checkpoint operation, or the handle is already open
- * without any special flags. In particular, it must fail if
- * attempting to checkpoint a handle opened for a bulk load, even in
- * the same session.
+ * This should only happen if a checkpoint handle is locked multiple times during a checkpoint
+ * operation, or the handle is already open without any special flags. In particular, it must
+ * fail if attempting to checkpoint a handle opened for a bulk load, even in the same session.
*/
if (dhandle->excl_session == session) {
if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
@@ -134,14 +132,12 @@ __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, bool *is_dea
}
/*
- * Check that the handle is open. We've already incremented
- * the reference count, so once the handle is open it won't be
- * closed by another thread.
+ * Check that the handle is open. We've already incremented the reference count, so once the
+ * handle is open it won't be closed by another thread.
*
- * If we can see the WT_DHANDLE_OPEN flag set while holding a
- * lock on the handle, then it's really open and we can start
- * using it. Alternatively, if we can get an exclusive lock
- * and WT_DHANDLE_OPEN is still not set, we need to do the open.
+ * If we can see the WT_DHANDLE_OPEN flag set while holding a lock on the handle, then it's
+ * really open and we can start using it. Alternatively, if we can get an exclusive lock and
+ * WT_DHANDLE_OPEN is still not set, we need to do the open.
*/
for (;;) {
/* If the handle is dead, give up. */
@@ -159,11 +155,10 @@ __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, bool *is_dea
/*
* If the handle is open, get a read lock and recheck.
*
- * Wait for a read lock if we want exclusive access and failed
- * to get it: the sweep server may be closing this handle, and
- * we need to wait for it to release its lock. If we want
- * exclusive access and find the handle open once we get the
- * read lock, give up: some other thread has it locked for real.
+ * Wait for a read lock if we want exclusive access and failed to get it: the sweep server
+ * may be closing this handle, and we need to wait for it to release its lock. If we want
+ * exclusive access and find the handle open once we get the read lock, give up: some other
+ * thread has it locked for real.
*/
if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && (!want_exclusive || lock_busy)) {
__wt_readlock(session, &dhandle->rwlock);
@@ -324,16 +319,15 @@ retry:
__wt_free(session, checkpoint);
/*
- * There's a potential race: we get the name of the most recent unnamed
- * checkpoint, but if it's discarded (or locked so it can be discarded)
- * by the time we try to open it, we'll fail the open. Retry in those
- * cases, a new "last" checkpoint should surface, and we can't return an
- * error, the application will be justifiably upset if we can't open the
- * last checkpoint instance of an object.
+ * There's a potential race: we get the name of the most recent unnamed checkpoint, but if it's
+ * discarded (or locked so it can be discarded) by the time we try to open it, we'll fail the
+ * open. Retry in those cases, a new "last" checkpoint should surface, and we can't return an
+ * error, the application will be justifiably upset if we can't open the last checkpoint
+ * instance of an object.
*
- * The check against WT_NOTFOUND is correct: if there was no checkpoint
- * for the object (that is, the object has never been in a checkpoint),
- * we returned immediately after the call to search for that name.
+ * The check against WT_NOTFOUND is correct: if there was no checkpoint for the object (that is,
+ * the object has never been in a checkpoint), we returned immediately after the call to search
+ * for that name.
*/
if (last_ckpt && (ret == WT_NOTFOUND || ret == EBUSY))
goto retry;
@@ -485,14 +479,12 @@ __wt_session_get_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *
WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
/*
- * For now, we need the schema lock and handle list locks to
- * open a file for real.
+ * For now, we need the schema lock and handle list locks to open a file for real.
*
- * Code needing exclusive access (such as drop or verify)
- * assumes that it can close all open handles, then open an
- * exclusive handle on the active tree and no other threads can
- * reopen handles in the meantime. A combination of the schema
- * and handle list locks are used to enforce this.
+ * Code needing exclusive access (such as drop or verify) assumes that it can close all open
+ * handles, then open an exclusive handle on the active tree and no other threads can reopen
+ * handles in the meantime. A combination of the schema and handle list locks are used to
+ * enforce this.
*/
if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) {
dhandle->excl_session = NULL;
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c
index c967354564c..2d7dd124b4f 100644
--- a/src/third_party/wiredtiger/src/support/err.c
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -228,14 +228,13 @@ __eventv(WT_SESSION_IMPL *session, bool msg_event, int error, const char *func,
if (error != 0) {
/*
- * When the engine calls __wt_err on error, it often outputs an
- * error message including the string associated with the error
- * it's returning. We could change the calls to call __wt_errx,
- * but it's simpler to not append an error string if all we are
- * doing is duplicating an existing error string.
+ * When the engine calls __wt_err on error, it often outputs an error message including the
+ * string associated with the error it's returning. We could change the calls to call
+ * __wt_errx, but it's simpler to not append an error string if all we are doing is
+ * duplicating an existing error string.
*
- * Use strcmp to compare: both strings are nul-terminated, and
- * we don't want to run past the end of the buffer.
+ * Use strcmp to compare: both strings are nul-terminated, and we don't want to run past the
+ * end of the buffer.
*/
err = __wt_strerror(session, error, NULL, 0);
len = strlen(err);
@@ -244,18 +243,15 @@ __eventv(WT_SESSION_IMPL *session, bool msg_event, int error, const char *func,
}
/*
- * If a handler fails, return the error status: if we're in the process
- * of handling an error, any return value we provide will be ignored by
- * our caller, our caller presumably already has an error value it will
- * be returning.
+ * If a handler fails, return the error status: if we're in the process of handling an error,
+ * any return value we provide will be ignored by our caller, our caller presumably already has
+ * an error value it will be returning.
*
- * If an application-specified or default informational message handler
- * fails, complain using the application-specified or default error
- * handler.
+ * If an application-specified or default informational message handler fails, complain using
+ * the application-specified or default error handler.
*
- * If an application-specified error message handler fails, complain
- * using the default error handler. If the default error handler fails,
- * fallback to stderr.
+ * If an application-specified error message handler fails, complain using the default error
+ * handler. If the default error handler fails, fall back to stderr.
*/
wt_session = (WT_SESSION *)session;
handler = session->event_handler;
diff --git a/src/third_party/wiredtiger/src/support/generation.c b/src/third_party/wiredtiger/src/support/generation.c
index 431ca2c5a2f..c6e9de2c25a 100644
--- a/src/third_party/wiredtiger/src/support/generation.c
+++ b/src/third_party/wiredtiger/src/support/generation.c
@@ -100,9 +100,8 @@ __wt_gen_drain(WT_SESSION_IMPL *session, int which, uint64_t generation)
WT_ORDERED_READ(v, s->generations[which]);
/*
- * The generation argument is newer than the limit. Wait
- * for threads in generations older than the argument
- * generation, threads in argument generations are OK.
+ * The generation argument is newer than the limit. Wait for threads in generations
+ * older than the argument generation, threads in argument generations are OK.
*
* The thread's generation may be 0 (that is, not set).
*/
diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c
index 3710da4ec5e..75901c8181d 100644
--- a/src/third_party/wiredtiger/src/support/hazard.c
+++ b/src/third_party/wiredtiger/src/support/hazard.c
@@ -124,16 +124,13 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
/*
* Do the dance:
*
- * The memory location which makes a page "real" is the WT_REF's state
- * of WT_REF_LIMBO or WT_REF_MEM, which can be set to WT_REF_LOCKED
- * at any time by the page eviction server.
+ * The memory location which makes a page "real" is the WT_REF's state of WT_REF_LIMBO or
+ * WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the page eviction server.
*
- * Add the WT_REF reference to the session's hazard list and flush the
- * write, then see if the page's state is still valid. If so, we can
- * use the page because the page eviction server will see our hazard
- * pointer before it discards the page (the eviction server sets the
- * state to WT_REF_LOCKED, then flushes memory and checks the hazard
- * pointers).
+ * Add the WT_REF reference to the session's hazard list and flush the write, then see if the
+ * page's state is still valid. If so, we can use the page because the page eviction server will
+ * see our hazard pointer before it discards the page (the eviction server sets the state to
+ * WT_REF_LOCKED, then flushes memory and checks the hazard pointers).
*/
hp->ref = ref;
#ifdef HAVE_DIAGNOSTIC
@@ -200,12 +197,11 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_REF *ref)
hp->ref = NULL;
/*
- * If this was the last hazard pointer in the session,
- * reset the size so that checks can skip this session.
+ * If this was the last hazard pointer in the session, reset the size so that checks can
+ * skip this session.
*
- * A write-barrier() is necessary before the change to
- * the in-use value, the number of active references
- * can never be less than the number of in-use slots.
+ * A write-barrier() is necessary before the change to the in-use value, the number of
+ * active references can never be less than the number of in-use slots.
*/
if (--session->nhazard == 0)
WT_PUBLISH(session->hazard_inuse, 0);
@@ -280,16 +276,13 @@ static inline void
hazard_get_reference(WT_SESSION_IMPL *session, WT_HAZARD **hazardp, uint32_t *hazard_inusep)
{
/*
- * Hazard pointer arrays can be swapped out from under us if they grow.
- * First, read the current in-use value. The read must precede the read
- * of the hazard pointer itself (so the in-use value is pessimistic
- * should the hazard array grow), and additionally ensure we only read
- * the in-use value once. Then, read the hazard pointer, also ensuring
- * we only read it once.
+ * Hazard pointer arrays can be swapped out from under us if they grow. First, read the current
+ * in-use value. The read must precede the read of the hazard pointer itself (so the in-use
+ * value is pessimistic should the hazard array grow), and additionally ensure we only read the
+ * in-use value once. Then, read the hazard pointer, also ensuring we only read it once.
*
- * Use a barrier instead of marking the fields volatile because we don't
- * want to slow down the rest of the hazard pointer functions that don't
- * need special treatment.
+ * Use a barrier instead of marking the fields volatile because we don't want to slow down the
+ * rest of the hazard pointer functions that don't need special treatment.
*/
WT_ORDERED_READ(*hazard_inusep, session->hazard_inuse);
WT_ORDERED_READ(*hazardp, session->hazard);
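/*
 * A generic sketch of the publish-then-validate protocol described above, using C11
 * atomics rather than WiredTiger's macros; the names here are illustrative only.
 */
#include <stdatomic.h>

static _Atomic(void *) hazard_slot; /* one slot per thread; NULL means empty */

static void *
hazard_acquire(_Atomic(void *) *shared)
{
    void *p;

    for (;;) {
        if ((p = atomic_load(shared)) == NULL)
            return (NULL);
        atomic_store(&hazard_slot, p); /* publish and flush the hazard pointer */
        if (atomic_load(shared) == p)  /* still valid: reclaimers will see our slot */
            return (p);
        atomic_store(&hazard_slot, NULL); /* state changed underneath us: retry */
    }
}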
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c
index 8420a625b45..906dc311a63 100644
--- a/src/third_party/wiredtiger/src/support/huffman.c
+++ b/src/third_party/wiredtiger/src/support/huffman.c
@@ -87,8 +87,7 @@ typedef struct __wt_huffman_obj {
/*
* Queue element data structure.
*
- * Consists of a pointer to a huffman tree node, and a pointer to the next
- * element in the queue.
+ * Consists of a pointer to a huffman tree node, and a pointer to the next element in the queue.
*/
typedef struct node_queue_elem {
WT_FREQTREE_NODE *node;
@@ -98,8 +97,8 @@ typedef struct node_queue_elem {
/*
* Queue of huffman tree nodes.
*
- * Contains a pointer to the beginning and the end of the queue, which is
- * implemented as a linked list.
+ * Contains a pointer to the beginning and the end of the queue, which is implemented as a linked
+ * list.
*/
typedef struct node_queue {
NODE_QUEUE_ELEM *first;
@@ -381,9 +380,8 @@ __wt_huffman_open(
/*
* Adding the leaves to the queue.
*
- * Discard symbols with a frequency of 0; this assumes these symbols
- * never occur in the source stream, and the purpose is to reduce the
- * huffman tree's size.
+ * Discard symbols with a frequency of 0; this assumes these symbols never occur in the source
+ * stream, and the purpose is to reduce the huffman tree's size.
*/
for (i = 0; i < symcnt; ++i)
if (indexed_freqs[i].frequency > 0) {
diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c
index 848289d264d..086fb4b3920 100644
--- a/src/third_party/wiredtiger/src/support/modify.c
+++ b/src/third_party/wiredtiger/src/support/modify.c
@@ -111,17 +111,15 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_ITEM *value, WT_MODIFY *modify,
size = modify->size;
/*
- * Grow the buffer to the maximum size we'll need. This is pessimistic
- * because it ignores replacement bytes, but it's a simpler calculation.
+ * Grow the buffer to the maximum size we'll need. This is pessimistic because it ignores
+ * replacement bytes, but it's a simpler calculation.
*
- * Grow the buffer first. This function is often called using a cursor
- * buffer referencing on-page memory and it's easy to overwrite a page.
- * A side-effect of growing the buffer is to ensure the buffer's value
- * is in buffer-local memory.
+ * Grow the buffer first. This function is often called using a cursor buffer referencing
+ * on-page memory and it's easy to overwrite a page. A side-effect of growing the buffer is to
+ * ensure the buffer's value is in buffer-local memory.
*
- * Because the buffer may reference an overflow item, the data may not
- * start at the start of the buffer's memory and we have to correct for
- * that.
+ * Because the buffer may reference an overflow item, the data may not start at the start of the
+ * buffer's memory and we have to correct for that.
*/
item_offset = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
WT_RET(__wt_buf_grow(
@@ -217,15 +215,12 @@ __modify_fast_path(WT_ITEM *value, const size_t *p, int nentries, int *nappliedp
WT_CLEAR(prev); /* [-Werror=maybe-uninitialized] */
/*
- * If the modifications are sorted and don't overlap in the old or new
- * values, we can do a fast application of all the modifications
- * modifications in a single pass.
+ * If the modifications are sorted and don't overlap in the old or new values, we can do a fast
+ * application of all the modifications in a single pass.
*
- * The requirement for ordering is unfortunate, but modifications are
- * performed in order, and applications specify byte offsets based on
- * that. In other words, byte offsets are cumulative, modifications
- * that shrink or grow the data affect subsequent modification's byte
- * offsets.
+ * The requirement for ordering is unfortunate, but modifications are performed in order, and
+ * applications specify byte offsets based on that. In other words, byte offsets are cumulative,
+ * modifications that shrink or grow the data affect subsequent modifications' byte offsets.
*/
fastpath = first = true;
*nappliedp = 0;
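/*
 * A caller-visible sketch of the contract being exploited here (keys and values are
 * hypothetical): entries are sorted by offset, don't overlap, and offsets are cumulative,
 * interpreted after earlier entries have grown or shrunk the value.
 */
WT_MODIFY entries[2];

/* Replace the first 3 bytes of the value with "cat". */
entries[0].data.data = "cat";
entries[0].data.size = 3;
entries[0].offset = 0;
entries[0].size = 3;

/* Insert "dog" at offset 10 of the already-modified value (replace zero bytes). */
entries[1].data.data = "dog";
entries[1].data.size = 3;
entries[1].offset = 10;
entries[1].size = 0;

error_check(session->begin_transaction(session, NULL));
cursor->set_key(cursor, "some-key");
error_check(cursor->modify(cursor, entries, 2));
error_check(session->commit_transaction(session, NULL));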
@@ -348,14 +343,12 @@ __wt_modify_apply(WT_CURSOR *cursor, const void *modify)
nentries = (int)tmp;
/*
- * Grow the buffer first. This function is often called using a cursor
- * buffer referencing on-page memory and it's easy to overwrite a page.
- * A side-effect of growing the buffer is to ensure the buffer's value
- * is in buffer-local memory.
+ * Grow the buffer first. This function is often called using a cursor buffer referencing
+ * on-page memory and it's easy to overwrite a page. A side-effect of growing the buffer is to
+ * ensure the buffer's value is in buffer-local memory.
*
- * Because the buffer may reference an overflow item, the data may not
- * start at the start of the buffer's memory and we have to correct for
- * that.
+ * Because the buffer may reference an overflow item, the data may not start at the start of the
+ * buffer's memory and we have to correct for that.
*/
item_offset = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
WT_RET(__wt_buf_grow(session, value, item_offset + value->size));
diff --git a/src/third_party/wiredtiger/src/support/mtx_rw.c b/src/third_party/wiredtiger/src/support/mtx_rw.c
index bb89e343b69..ff35fce0c81 100644
--- a/src/third_party/wiredtiger/src/support/mtx_rw.c
+++ b/src/third_party/wiredtiger/src/support/mtx_rw.c
@@ -402,10 +402,9 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
/*
* Wait for our group to start and any readers to drain.
*
- * We take care here to do an atomic read of the full 64-bit lock
- * value. Otherwise, reads are not guaranteed to be ordered and we
- * could see no readers active from a different batch and decide that
- * we have the lock.
+ * We take care here to do an atomic read of the full 64-bit lock value. Otherwise, reads are
+ * not guaranteed to be ordered and we could see no readers active from a different batch and
+ * decide that we have the lock.
*/
for (pause_cnt = 0, old.u.v = l->u.v; ticket != old.u.s.current || old.u.s.readers_active != 0;
pause_cnt++, old.u.v = l->u.v) {
@@ -460,9 +459,8 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
/*
* Allow the next batch to start.
*
- * If there are readers in the next group, swap queued readers
- * to active: this could race with new readlock requests, so we
- * have to spin.
+ * If there are readers in the next group, swap queued readers to active: this could race
+ * with new readlock requests, so we have to spin.
*/
new.u.v = old.u.v;
if (++new.u.s.current == new.u.s.reader) {
diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c
index 264ee711755..fcc76147f7e 100644
--- a/src/third_party/wiredtiger/src/support/rand.c
+++ b/src/third_party/wiredtiger/src/support/rand.c
@@ -29,16 +29,15 @@
#include "wt_internal.h"
/*
- * This is an implementation of George Marsaglia's multiply-with-carry pseudo-
- * random number generator. Computationally fast, with reasonable randomness
- * properties, and a claimed period of > 2^60.
+ * This is an implementation of George Marsaglia's multiply-with-carry pseudo-random number
+ * generator. Computationally fast, with reasonable randomness properties, and a claimed period of
+ * > 2^60.
*
- * Be very careful about races here. Multiple threads can call __wt_random
- * concurrently, and it is okay if those concurrent calls get the same return
- * value. What is *not* okay is if reading/writing the shared state races and
- * uses two different values for m_w or m_z. That can result in a stored value
- * of zero, in which case they will be stuck on zero forever. Take a local copy
- * of the values to avoid that, and read/write in atomic, 8B chunks.
+ * Be very careful about races here. Multiple threads can call __wt_random concurrently, and it is
+ * okay if those concurrent calls get the same return value. What is *not* okay is if
+ * reading/writing the shared state races and uses two different values for m_w or m_z. That can
+ * result in a stored value of zero, in which case they will be stuck on zero forever. Take a local
+ * copy of the values to avoid that, and read/write in atomic, 8B chunks.
*/
#undef M_W
#define M_W(r) r.x.w
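/*
 * A standalone sketch of the multiply-with-carry step and the copy-in/copy-out discipline
 * the comment describes, with the classic Marsaglia constants; illustrative only, not
 * WiredTiger's code. (A zero state word would stick at zero, so the state must be seeded
 * to nonzero values.)
 */
#include <stdint.h>

typedef union {
    uint64_t v; /* read and write the whole state in one 8B access */
    struct {
        uint32_t w, z;
    } x;
} mwc_state;

static uint32_t
mwc_next(volatile mwc_state *state)
{
    mwc_state r;

    r.v = state->v; /* local copy: never mix two reads of the shared state */
    r.x.z = 36969 * (r.x.z & 65535) + (r.x.z >> 16);
    r.x.w = 18000 * (r.x.w & 65535) + (r.x.w >> 16);
    state->v = r.v; /* racing writers may collide, but the state stays self-consistent */
    return ((r.x.z << 16) + (r.x.w & 65535));
}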
diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c
index 294f8f2fe0f..74195d18502 100644
--- a/src/third_party/wiredtiger/src/support/scratch.c
+++ b/src/third_party/wiredtiger/src/support/scratch.c
@@ -241,11 +241,10 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
*scratchp = NULL;
/*
- * Each WT_SESSION_IMPL has an array of scratch buffers available for
- * use by any function. We use WT_ITEM structures for scratch memory
- * because we already have functions that do variable-length allocation
- * on a WT_ITEM. Scratch buffers are allocated only by a single thread
- * of control, so no locking is necessary.
+ * Each WT_SESSION_IMPL has an array of scratch buffers available for use by any function. We
+ * use WT_ITEM structures for scratch memory because we already have functions that do
+ * variable-length allocation on a WT_ITEM. Scratch buffers are allocated only by a single
+ * thread of control, so no locking is necessary.
*
* Walk the array, looking for a buffer we can use.
*/
diff --git a/src/third_party/wiredtiger/src/support/time.c b/src/third_party/wiredtiger/src/support/time.c
deleted file mode 100644
index 61cebb71b51..00000000000
--- a/src/third_party/wiredtiger/src/support/time.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*-
- * Copyright (c) 2014-2019 MongoDB, Inc.
- * Copyright (c) 2008-2014 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-/*
- * __time_check_monotonic --
- * Check and prevent time running backward. If we detect that it has, we set the time structure
- * to the previous values, making time stand still until we see a time in the future of the
- * highest value seen so far.
- */
-static void
-__time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp)
-{
- /*
- * Detect time going backward. If so, use the last saved timestamp.
- */
- if (session == NULL)
- return;
-
- if (tsp->tv_sec < session->last_epoch.tv_sec ||
- (tsp->tv_sec == session->last_epoch.tv_sec && tsp->tv_nsec < session->last_epoch.tv_nsec)) {
- WT_STAT_CONN_INCR(session, time_travel);
- *tsp = session->last_epoch;
- } else
- session->last_epoch = *tsp;
-}
-
-/*
- * __wt_epoch --
- * Return the time since the Epoch.
- */
-void
-__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
-{
- struct timespec tmp;
-
- /*
- * Read into a local variable, then check for monotonically increasing time, ensuring single
- * threads never see time move backward. We don't prevent multiple threads from seeing time move
- * backwards (even when reading time serially, the saved last-read time is per thread, not per
- * timer, so multiple threads can race the time). Nor do we prevent multiple threads
- * simultaneously reading the time from seeing random time or time moving backwards (assigning
- * the time structure to the returned memory location implies multicycle writes to memory).
- */
- __wt_epoch_raw(session, &tmp);
- __time_check_monotonic(session, &tmp);
- *tsp = tmp;
-}
-
-/*
- * __wt_seconds --
- * Return the seconds since the Epoch.
- */
-void
-__wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp)
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
-{
- struct timespec t;
-
- __wt_epoch(session, &t);
-
- /*
- * A time_t isn't guaranteed to fit into a uint64_t, but it's asserted when WiredTiger builds.
- */
- *secondsp = (uint64_t)t.tv_sec;
-}
-
-/*
- * __wt_seconds32 --
- * Return the seconds since the Epoch in 32 bits.
- */
-void
-__wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp)
-{
- struct timespec t;
-
- __wt_epoch(session, &t);
-
- /*
- * This won't work in 2038. But for now allow it.
- */
- *secondsp = (uint32_t)t.tv_sec;
-}
-
-/*
- * __wt_clock_to_nsec --
- * Convert from clock ticks to nanoseconds.
- */
-uint64_t
-__wt_clock_to_nsec(uint64_t end, uint64_t begin)
-{
- double clock_diff;
-
- /*
- * If the ticks were reset, consider it an invalid check and just return zero as the time
- * difference because we cannot compute anything meaningful.
- */
- if (end < begin)
- return (0);
- clock_diff = (double)(end - begin);
- return ((uint64_t)(clock_diff / __wt_process.tsc_nsec_ratio));
-}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 0374732dfa7..7aaba221842 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -415,9 +415,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
/*
* Make sure the ID doesn't move past any named snapshots.
*
- * Don't include the read/assignment in the assert statement. Coverity
- * complains if there are assignments only done in diagnostic builds,
- * and when the read is from a volatile.
+ * Don't include the read/assignment in the assert statement. Coverity complains if there
+ * are assignments only done in diagnostic builds, and when the read is from a volatile.
*/
uint64_t id = txn_global->nsnap_oldest_id;
WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
@@ -468,9 +467,15 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
WT_STRING_MATCH("read-committed", cval.str, cval.len) ? WT_ISO_READ_COMMITTED :
WT_ISO_READ_UNCOMMITTED;
+ /* Retrieve the maximum operation time, defaulting to the database-wide configuration. */
+ WT_RET(__wt_config_gets(session, cfg, "operation_timeout_ms", &cval));
+ session->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND);
+ if (session->operation_timeout_us == 0)
+ session->operation_timeout_us = S2C(session)->operation_timeout_us;
+
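/*
 * A caller-side sketch of the configuration parsed above: the timeout can be set per
 * transaction and otherwise falls back to the connection-wide value.
 */
error_check(session->begin_transaction(session, "operation_timeout_ms=2000"));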
/*
- * The default sync setting is inherited from the connection, but can
- * be overridden by an explicit "sync" setting for this transaction.
+ * The default sync setting is inherited from the connection, but can be overridden by an
+ * explicit "sync" setting for this transaction.
*
* We want to distinguish between inheriting implicitly and explicitly.
*/
@@ -609,12 +614,14 @@ __wt_txn_release(WT_SESSION_IMPL *session)
/*
* Ensure the transaction flags are cleared on exit
*
- * Purposely do NOT clear the commit and durable timestamps on release.
- * Other readers may still find these transactions in the durable queue
- * and will need to see those timestamps.
+ * Purposely do NOT clear the commit and durable timestamps on release. Other readers may still
+ * find these transactions in the durable queue and will need to see those timestamps.
*/
txn->flags = 0;
txn->prepare_timestamp = WT_TS_NONE;
+
+ /* Clear operation timer. */
+ session->operation_timeout_us = 0;
}
/*
@@ -1122,18 +1129,14 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
op->u.op_upd = NULL;
WT_STAT_CONN_INCR(session, txn_prepared_updates_count);
/*
- * Set the key repeated flag which tells us that we've
- * got multiple updates to the same key by the same txn.
- * This is later used in txn commit.
+ * Set the key repeated flag which tells us that we've got multiple updates to the same
+ * key by the same txn. This is later used in txn commit.
*
- * When we see a reserved update we set the
- * WT_UPDATE_RESERVED flag instead. We do this as we
- * cannot know if our current update should specify the
- * key repeated flag as we don't want to traverse the
- * entire update chain to find out. i.e. if there is
- * an update with our txnid after the reserved update
- * we should set key repeated, but if there isn't we
- * shouldn't.
+ * When we see a reserved update we set the WT_UPDATE_RESERVED flag instead. We do this
+ * as we cannot know if our current update should specify the key repeated flag as we
+ * don't want to traverse the entire update chain to find out. i.e. if there is an
+ * update with our txnid after the reserved update we should set key repeated, but if
+ * there isn't we shouldn't.
*/
if (upd->next != NULL && upd->txnid == upd->next->txnid) {
if (upd->next->type == WT_UPDATE_RESERVE)
@@ -1501,19 +1504,143 @@ __wt_txn_activity_drain(WT_SESSION_IMPL *session)
* __wt_txn_global_shutdown --
* Shut down the global transaction state.
*/
-void
-__wt_txn_global_shutdown(WT_SESSION_IMPL *session)
+int
+__wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config, const char **cfg)
{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *s;
+ const char *ckpt_cfg;
+
+ conn = S2C(session);
+
/*
- * All application transactions have completed, ignore the pinned
- * timestamp so that updates can be evicted from the cache during
- * connection close.
+ * Perform a system-wide checkpoint so that all tables are consistent with each other. All
+ * transactions are resolved but ignore timestamps to make sure all data gets to disk. Do this
+ * before shutting down all the subsystems. We have shut down all user sessions, but pass true
+ * to wait out any internal races.
+ */
+ WT_TRET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
+ ckpt_cfg = "use_timestamp=false";
+ if (cval.val != 0) {
+ ckpt_cfg = "use_timestamp=true";
+ if (conn->txn_global.has_stable_timestamp)
+ F_SET(conn, WT_CONN_CLOSING_TIMESTAMP);
+ }
+ if (!F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY)) {
+ s = NULL;
+ WT_TRET(__wt_open_internal_session(conn, "close_ckpt", true, 0, &s));
+ if (s != NULL) {
+ const char *checkpoint_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_checkpoint), ckpt_cfg, NULL};
+ wt_session = &s->iface;
+ WT_TRET(__wt_txn_checkpoint(s, checkpoint_cfg, true));
+
+ /*
+ * Mark the metadata dirty so we flush it on close, allowing recovery to be skipped.
+ */
+ WT_WITH_DHANDLE(s, WT_SESSION_META_DHANDLE(s), __wt_tree_modify_set(s));
+
+ WT_TRET(wt_session->close(wt_session, config));
+ }
+ }
+
+ /*
+ * All application transactions have completed, ignore the pinned timestamp so that updates can
+ * be evicted from the cache during connection close.
*
- * Note that we are relying on a special case in __wt_txn_visible_all
- * that returns true during close when there is no pinned timestamp
- * set.
+ * Note that we are relying on a special case in __wt_txn_visible_all that returns true during
+ * close when there is no pinned timestamp set.
*/
- S2C(session)->txn_global.has_pinned_timestamp = false;
+ conn->txn_global.has_pinned_timestamp = false;
+
+ return (ret);
+}
+
+/*
+ * __wt_txn_is_blocking_old --
+ * Return if this transaction is the oldest transaction in the system, called by eviction to
+ * determine if a worker thread should be released from eviction.
+ */
+int
+__wt_txn_is_blocking_old(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *state;
+ uint64_t id;
+ uint32_t i, session_cnt;
+
+ conn = S2C(session);
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+
+ if (txn->id == WT_TXN_NONE || F_ISSET(txn, WT_TXN_PREPARE))
+ return (false);
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+
+ /*
+ * Check if the transaction is the oldest one in the system. It's safe to ignore sessions
+ * allocating transaction IDs: since we already have an ID, they are guaranteed to be newer.
+ */
+ for (i = 0, state = txn_global->states; i < session_cnt; i++, state++) {
+ if (state->is_allocating)
+ continue;
+
+ WT_ORDERED_READ(id, state->id);
+ if (id != WT_TXN_NONE && WT_TXNID_LT(id, txn->id))
+ break;
+ }
+ return (i == session_cnt ?
+ __wt_txn_rollback_required(session, "oldest transaction ID rolled back for eviction") :
+ 0);
+}
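/*
 * An application-side sketch of what this enables: a transaction co-opted by eviction can
 * now fail with WT_ROLLBACK, and the conventional response is to roll back and retry.
 * do_operations() is a hypothetical stand-in for the application's work.
 */
int ret;

for (;;) {
    error_check(session->begin_transaction(session, NULL));
    if ((ret = do_operations(session)) == 0) {
        /* On failure, commit_transaction rolls the transaction back itself. */
        if ((ret = session->commit_transaction(session, NULL)) == 0)
            break;
    } else
        error_check(session->rollback_transaction(session, NULL));
    if (ret != WT_ROLLBACK)
        break; /* a hard error: don't retry */
}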
+
+/*
+ * __wt_txn_is_blocking_pin --
+ * Return if this transaction is likely blocking eviction because of a pinned transaction ID,
+ * called by eviction to determine if a worker thread should be released from eviction.
+ */
+int
+__wt_txn_is_blocking_pin(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *s;
+ WT_TXN *txn;
+ uint64_t snap_min;
+ uint32_t i, session_cnt;
+
+ conn = S2C(session);
+ txn = &session->txn;
+
+ /*
+ * Check if we hold the oldest pinned transaction ID in the system. This potentially means
+ * rolling back a read-only transaction, which MongoDB can't (yet) handle. For this reason,
+ * don't check unless we're configured to time out thread operations, a way to confirm our
+ * caller is prepared for rollback.
+ */
+ if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) || txn->snap_min == WT_TXN_NONE)
+ return (0);
+ if (!__wt_op_timer_fired(session))
+ return (0);
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+
+ for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
+ if (F_ISSET(s, WT_SESSION_INTERNAL) || !F_ISSET(&s->txn, WT_TXN_HAS_SNAPSHOT))
+ continue;
+
+ WT_ORDERED_READ(snap_min, s->txn.snap_min);
+ if (snap_min != WT_TXN_NONE && snap_min < txn->snap_min)
+ break;
+ }
+ return (i == session_cnt ? __wt_txn_rollback_required(
+ session, "oldest pinned transaction ID rolled back for eviction") :
+ 0);
}
/*
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 072406a25cc..e960ec03d48 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -160,15 +160,12 @@ __checkpoint_apply_all(
if (!target_list && op != NULL) {
/*
- * If the checkpoint is named or we're dropping checkpoints, we
- * checkpoint both open and closed files; else, only checkpoint
- * open files.
+ * If the checkpoint is named or we're dropping checkpoints, we checkpoint both open and
+ * closed files; else, only checkpoint open files.
*
- * XXX
- * We don't optimize unnamed checkpoints of a list of targets,
- * we open the targets and checkpoint them even if they are
- * quiescent and don't need a checkpoint, believing applications
- * unlikely to checkpoint a list of closed targets.
+ * XXX We don't optimize unnamed checkpoints of a list of targets, we open the targets and
+ * checkpoint them even if they are quiescent and don't need a checkpoint, believing
+ * applications unlikely to checkpoint a list of closed targets.
*/
ckpt_closed = named;
if (!ckpt_closed) {
@@ -217,21 +214,17 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
WT_NAMED_DATA_SOURCE *ndsrc;
/*
- * A place-holder, to support data sources: we assume calling the
- * underlying data-source session checkpoint function is sufficient to
- * checkpoint all objects in the data source, open or closed, and we
- * don't attempt to optimize the checkpoint of individual targets.
- * Those assumptions are not necessarily going to be true for all
- * data sources.
+ * A place-holder, to support data sources: we assume calling the underlying data-source session
+ * checkpoint function is sufficient to checkpoint all objects in the data source, open or
+ * closed, and we don't attempt to optimize the checkpoint of individual targets. Those
+ * assumptions are not necessarily going to be true for all data sources.
*
- * It's not difficult to support data-source checkpoints of individual
- * targets (__wt_schema_worker is the underlying function that will do
- * the work, and it's already written to support data-sources, although
- * we'd probably need to pass the URI of the object to the data source
- * checkpoint function which we don't currently do). However, doing a
- * full data checkpoint is trickier: currently, the connection code is
- * written to ignore all objects other than "file:", and that code will
- * require significant changes to work with data sources.
+ * It's not difficult to support data-source checkpoints of individual targets
+ * (__wt_schema_worker is the underlying function that will do the work, and it's already
+ * written to support data-sources, although we'd probably need to pass the URI of the object to
+ * the data source checkpoint function which we don't currently do). However, doing a full data
+ * checkpoint is trickier: currently, the connection code is written to ignore all objects other
+ * than "file:", and that code will require significant changes to work with data sources.
*/
TAILQ_FOREACH (ndsrc, &S2C(session)->dsrcqh, q) {
dsrc = ndsrc->dsrc;
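/*
 * A sketch of the hook this loop drives (assumed usage of the extension API): a data
 * source registers a checkpoint callback, and the connection invokes it once per data
 * source rather than once per target.
 */
static int
my_checkpoint(WT_DATA_SOURCE *dsrc, WT_SESSION *session, const char *config)
{
    (void)dsrc;
    (void)session;
    (void)config;
    /* Checkpoint every object this data source manages. */
    return (0);
}

static WT_DATA_SOURCE my_dsrc = {.checkpoint = my_checkpoint};

error_check(conn->add_data_source(conn, "dsrc:", &my_dsrc, NULL));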
@@ -407,9 +400,8 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
/*
* We haven't reached the current target.
*
- * Don't wait indefinitely: there might be dirty pages
- * that can't be evicted. If we can't meet the target,
- * give up and start the checkpoint for real.
+ * Don't wait indefinitely: there might be dirty pages that can't be evicted. If we can't
+ * meet the target, give up and start the checkpoint for real.
*/
bytes_written_total = cache->bytes_written - bytes_written_start;
if (bytes_written_total > max_write)
@@ -541,9 +533,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
/*
* Start a snapshot transaction for the checkpoint.
*
- * Note: we don't go through the public API calls because they have
- * side effects on cursors, which applications can hold open across
- * calls to checkpoint.
+ * Note: we don't go through the public API calls because they have side effects on cursors,
+ * which applications can hold open across calls to checkpoint.
*/
WT_RET(__wt_txn_begin(session, txn_cfg));
@@ -574,8 +565,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
/*
* Remove the checkpoint transaction from the global table.
*
- * This allows ordinary visibility checks to move forward because
- * checkpoints often take a long time and only write to the metadata.
+ * This allows ordinary visibility checks to move forward because checkpoints often take a long
+ * time and only write to the metadata.
*/
__wt_writelock(session, &txn_global->rwlock);
txn_global->checkpoint_state = *txn_state;
@@ -597,8 +588,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
/*
* Set the checkpoint transaction's timestamp, if requested.
*
- * We rely on having the global transaction data locked so the oldest
- * timestamp can't move past the stable timestamp.
+ * We rely on having the global transaction data locked so the oldest timestamp can't move past
+ * the stable timestamp.
*/
WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ |
WT_TXN_TS_PUBLISHED | WT_TXN_PUBLIC_TS_READ));
@@ -618,7 +609,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
} else if (!F_ISSET(conn, WT_CONN_RECOVERING))
txn_global->meta_ckpt_timestamp = txn_global->recovery_timestamp;
} else if (!F_ISSET(conn, WT_CONN_RECOVERING))
- txn_global->meta_ckpt_timestamp = 0;
+ txn_global->meta_ckpt_timestamp = WT_TS_NONE;
__wt_writeunlock(session, &txn_global->rwlock);
@@ -627,8 +618,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
session, txn->read_timestamp, "Checkpoint requested at stable timestamp");
/*
- * The snapshot we established when the transaction started may
- * be too early to match the timestamp we just read.
+ * The snapshot we established when the transaction started may be too early to match the
+ * timestamp we just read.
*
* Get a new one.
*/
@@ -636,11 +627,11 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
}
/*
- * Get a list of handles we want to flush; for named checkpoints this
- * may pull closed objects into the session cache.
+ * Get a list of handles we want to flush; for named checkpoints this may pull closed objects
+ * into the session cache.
*
- * First, gather all handles, then start the checkpoint transaction,
- * then release any clean handles.
+ * First, gather all handles, then start the checkpoint transaction, then release any clean
+ * handles.
*/
WT_ASSERT(session, session->ckpt_handle_next == 0);
WT_WITH_TABLE_READ_LOCK(
@@ -673,12 +664,11 @@ __txn_checkpoint_can_skip(
txn_global = &conn->txn_global;
/*
- * This function also parses out some configuration options and hands
- * them back to the caller - make sure it does that parsing regardless
- * of the result.
+ * This function also parses out some configuration options and hands them back to the caller -
+ * make sure it does that parsing regardless of the result.
*
- * Determine if this is going to be a full checkpoint, that is a
- * checkpoint that applies to all data tables in a database.
+ * Determine if this is going to be a full checkpoint, that is a checkpoint that applies to all
+ * data tables in a database.
*/
WT_RET(__wt_config_gets(session, cfg, "target", &cval));
__wt_config_subinit(session, &targetconf, &cval);
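/*
 * A caller-side sketch of the "target" configuration parsed here (table names are
 * hypothetical); a targeted checkpoint is not "full" in the sense this function tests.
 */
error_check(session->checkpoint(session, "target=(\"table:customers\",\"table:orders\")"));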
@@ -788,8 +778,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
/*
* Update the global oldest ID so we do all possible cleanup.
*
- * This is particularly important for compact, so that all dirty pages
- * can be fully written.
+ * This is particularly important for compact, so that all dirty pages can be fully written.
*/
WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
@@ -814,32 +803,29 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
/*
* Start the checkpoint for real.
*
- * Bump the global checkpoint generation, used to figure out whether
- * checkpoint has visited a tree. Use an atomic increment even though
- * we are single-threaded because readers of the checkpoint generation
- * don't hold the checkpoint lock.
+ * Bump the global checkpoint generation, used to figure out whether checkpoint has visited a
+ * tree. Use an atomic increment even though we are single-threaded because readers of the
+ * checkpoint generation don't hold the checkpoint lock.
*
- * We do need to update it before clearing the checkpoint's entry out
- * of the transaction table, or a thread evicting in a tree could
- * ignore the checkpoint's transaction.
+ * We do need to update it before clearing the checkpoint's entry out of the transaction table,
+ * or a thread evicting in a tree could ignore the checkpoint's transaction.
*/
generation = __wt_gen_next(session, WT_GEN_CHECKPOINT);
WT_STAT_CONN_SET(session, txn_checkpoint_generation, generation);
/*
- * We want to skip checkpointing clean handles whenever possible. That
- * is, when the checkpoint is not named or forced. However, we need to
- * take care about ordering with respect to the checkpoint transaction.
+ * We want to skip checkpointing clean handles whenever possible. That is, when the checkpoint
+ * is not named or forced. However, we need to take care about ordering with respect to the
+ * checkpoint transaction.
*
- * We can't skip clean handles before starting the transaction or the
- * checkpoint can miss updates in trees that become dirty as the
- * checkpoint is starting. If we wait until the transaction has
- * started before locking a handle, there could be a metadata-changing
- * operation in between (e.g., salvage) that will cause a write
- * conflict when the checkpoint goes to write the metadata.
+ * We can't skip clean handles before starting the transaction or the checkpoint can miss
+ * updates in trees that become dirty as the checkpoint is starting. If we wait until the
+ * transaction has started before locking a handle, there could be a metadata-changing operation
+ * in between (e.g., salvage) that will cause a write conflict when the checkpoint goes to write
+ * the metadata.
*
- * Hold the schema lock while starting the transaction and gathering
- * handles so the set we get is complete and correct.
+ * Hold the schema lock while starting the transaction and gathering handles so the set we get
+ * is complete and correct.
*/
WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, &tracking, cfg));
WT_ERR(ret);
@@ -910,15 +896,12 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_txn_commit(session, NULL));
/*
- * Ensure that the metadata changes are durable before the checkpoint
- * is resolved. Do this by either checkpointing the metadata or syncing
- * the log file.
- * Recovery relies on the checkpoint LSN in the metadata only being
- * updated by full checkpoints so only checkpoint the metadata for
- * full or non-logged checkpoints.
+ * Ensure that the metadata changes are durable before the checkpoint is resolved. Do this by
+ * either checkpointing the metadata or syncing the log file. Recovery relies on the checkpoint
+ * LSN in the metadata only being updated by full checkpoints so only checkpoint the metadata
+ * for full or non-logged checkpoints.
*
- * This is very similar to __wt_meta_track_off, ideally they would be
- * merged.
+ * This is very similar to __wt_meta_track_off, ideally they would be merged.
*/
if (full || !logging) {
session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
@@ -949,13 +932,26 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
__checkpoint_stats(session);
/*
- * If timestamps were used to define the content of the checkpoint update the saved last
- * checkpoint timestamp, otherwise leave it alone. If a checkpoint is taken without
- * timestamps, it's likely a bug, but we don't want to clear the saved last checkpoint
- * timestamp regardless.
+ * If timestamps defined the checkpoint's content, set the saved last checkpoint timestamp,
+ * otherwise clear it. We clear it for a couple of reasons: applications can query it and we
+ * don't want to lie, and we use it to decide if WT_CONNECTION.rollback_to_stable is an
+ * allowed operation. For the same reason, don't set it to WT_TS_NONE when the checkpoint
+ * timestamp is WT_TS_NONE, set it to 1 so we can tell the difference.
*/
- if (use_timestamp)
- conn->txn_global.last_ckpt_timestamp = ckpt_tmp_ts;
+ if (use_timestamp) {
+ conn->txn_global.last_ckpt_timestamp = ckpt_tmp_ts;
+ /*
+ * MongoDB assumes the checkpoint timestamp will be initialized with WT_TS_NONE. In such
+ * cases it queries the recovery timestamp to determine the last stable recovery
+ * timestamp. So, if the recovery timestamp is valid, set the last checkpoint timestamp
+ * to the recovery timestamp. This should never be a problem, as the checkpoint timestamp
+ * should never be less than the recovery timestamp. This could potentially avoid MongoDB
+ * making two calls to determine the last stable recovery timestamp.
+ */
+ if (conn->txn_global.last_ckpt_timestamp == WT_TS_NONE)
+ conn->txn_global.last_ckpt_timestamp = conn->txn_global.recovery_timestamp;
+ } else
+ conn->txn_global.last_ckpt_timestamp = WT_TS_NONE;
}
err:
@@ -965,17 +961,14 @@ err:
conn->ckpt_timer_start.tv_sec = 0;
/*
- * XXX
- * Rolling back the changes here is problematic.
+ * XXX Rolling back the changes here is problematic.
*
- * If we unroll here, we need a way to roll back changes to the avail
- * list for each tree that was successfully synced before the error
- * occurred. Otherwise, the next time we try this operation, we will
- * try to free an old checkpoint again.
+ * If we unroll here, we need a way to roll back changes to the avail list for each tree that
+ * was successfully synced before the error occurred. Otherwise, the next time we try this
+ * operation, we will try to free an old checkpoint again.
*
- * OTOH, if we commit the changes after a failure, we have partially
- * overwritten the checkpoint, so what ends up on disk is not
- * consistent.
+ * OTOH, if we commit the changes after a failure, we have partially overwritten the checkpoint,
+ * so what ends up on disk is not consistent.
*/
failed = ret != 0;
if (failed)
@@ -1076,18 +1069,15 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting)
/*
* Don't highjack the session checkpoint thread for eviction.
*
- * Application threads are not generally available for potentially slow
- * operations, but checkpoint does enough I/O it may be called upon to
- * perform slow operations for the block manager.
+ * Application threads are not generally available for potentially slow operations, but checkpoint
+ * does enough I/O it may be called upon to perform slow operations for the block manager.
*
- * Application checkpoints wait until the checkpoint lock is available,
- * compaction checkpoints don't.
+ * Application checkpoints wait until the checkpoint lock is available, compaction checkpoints
+ * don't.
*
- * Checkpoints should always use a separate session for lookaside
- * updates, otherwise those updates are pinned until the checkpoint
- * commits. Also, there are unfortunate interactions between the
- * special rules for lookaside eviction and the special handling of the
- * checkpoint transaction.
+ * Checkpoints should always use a separate session for lookaside updates, otherwise those updates
+ * are pinned until the checkpoint commits. Also, there are unfortunate interactions between the
+ * special rules for lookaside eviction and the special handling of the checkpoint transaction.
*/
#undef WT_CHECKPOINT_SESSION_FLAGS
#define WT_CHECKPOINT_SESSION_FLAGS (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE)
@@ -1246,11 +1236,9 @@ __checkpoint_lock_dirty_tree_int(WT_SESSION_IMPL *session, bool is_checkpoint, b
/*
* Lock the checkpoints that will be deleted.
*
- * Checkpoints are only locked when tracking is enabled, which covers
- * checkpoint and drop operations, but not close. The reasoning is
- * there should be no access to a checkpoint during close, because any
- * thread accessing a checkpoint will also have the current file handle
- * open.
+ * Checkpoints are only locked when tracking is enabled, which covers checkpoint and drop
+ * operations, but not close. The reasoning is there should be no access to a checkpoint during
+ * close, because any thread accessing a checkpoint will also have the current file handle open.
*/
if (WT_META_TRACKING(session))
WT_CKPT_FOREACH (ckptbase, ckpt) {
@@ -1413,26 +1401,22 @@ __checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force)
/*
* Check for clean objects not requiring a checkpoint.
*
- * If we're closing a handle, and the object is clean, we can skip the
- * checkpoint, whatever checkpoints we have are sufficient. (We might
- * not have any checkpoints if the object was never modified, and that's
- * OK: the object creation code doesn't mark the tree modified so we can
- * skip newly created trees here.)
+ * If we're closing a handle, and the object is clean, we can skip the checkpoint, whatever
+ * checkpoints we have are sufficient. (We might not have any checkpoints if the object was
+ * never modified, and that's OK: the object creation code doesn't mark the tree modified so we
+ * can skip newly created trees here.)
*
- * If the application repeatedly checkpoints an object (imagine hourly
- * checkpoints using the same explicit or internal name), there's no
- * reason to repeat the checkpoint for clean objects. The test is if
- * the only checkpoint we're deleting is the last one in the list and
- * it has the same name as the checkpoint we're about to take, skip the
- * work. (We can't skip checkpoints that delete more than the last
- * checkpoint because deleting those checkpoints might free up space in
- * the file.) This means an application toggling between two (or more)
- * checkpoint names will repeatedly take empty checkpoints, but that's
- * not likely enough to make detection worthwhile.
+ * If the application repeatedly checkpoints an object (imagine hourly checkpoints using the
+ * same explicit or internal name), there's no reason to repeat the checkpoint for clean
+ * objects. The test: if the only checkpoint we're deleting is the last one in the list and it
+ * has the same name as the checkpoint we're about to take, skip the work. (We can't skip
+ * checkpoints that delete more than the last checkpoint because deleting those checkpoints
+ * might free up space in the file.) This means an application toggling between two (or more)
+ * checkpoint names will repeatedly take empty checkpoints, but that's not likely enough to make
+ * detection worthwhile.
*
- * Checkpoint read-only objects otherwise: the application must be able
- * to open the checkpoint in a cursor after taking any checkpoint, which
- * means it must exist.
+ * Checkpoint read-only objects otherwise: the application must be able to open the checkpoint
+ * in a cursor after taking any checkpoint, which means it must exist.
*/
F_CLR(btree, WT_BTREE_SKIP_CKPT);
if (!btree->modified && !force) {
@@ -1589,16 +1573,14 @@ fake:
/*
* Update the object's metadata.
*
- * If the object is the metadata, the call to __wt_meta_ckptlist_set
- * will update the turtle file and swap the new one into place. We
- * need to make sure the metadata is on disk before the turtle file is
- * updated.
+ * If the object is the metadata, the call to __wt_meta_ckptlist_set will update the turtle file
+ * and swap the new one into place. We need to make sure the metadata is on disk before the
+ * turtle file is updated.
*
- * If we are doing a checkpoint in a file without a transaction (e.g.,
- * closing a dirty tree before an exclusive operation like verify),
- * the metadata update will be auto-committed. In that case, we need to
- * sync the file here or we could roll forward the metadata in
- * recovery and open a checkpoint that isn't yet durable.
+ * If we are doing a checkpoint in a file without a transaction (e.g., closing a dirty tree
+ * before an exclusive operation like verify), the metadata update will be auto-committed. In
+ * that case, we need to sync the file here or we could roll forward the metadata in recovery
+ * and open a checkpoint that isn't yet durable.
*/
if (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING))
WT_ERR(__wt_checkpoint_sync(session, NULL));
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index f74f0d45562..124465cc529 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -28,12 +28,11 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
memset(&key, 0, sizeof(key));
/*
- * We used to take the row-store logging key from the page referenced by
- * the cursor, then switched to taking it from the cursor itself. Check
- * they are the same.
+ * We used to take the row-store logging key from the page referenced by the cursor, then
+ * switched to taking it from the cursor itself. Check they are the same.
*
- * If the cursor references a WT_INSERT item, take the key from there,
- * else take the key from the original page.
+ * If the cursor references a WT_INSERT item, take the key from there, else take the key from
+ * the original page.
*/
if (cbt->ins == NULL) {
session = (WT_SESSION_IMPL *)cbt->iface.session;
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 17e0b61c904..24653712e13 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -55,12 +55,12 @@ __recovery_cursor(
if (WT_LOGOP_IS_IGNORED(id))
return (0);
/*
- * Metadata operations have an id of 0. Match operations based
- * on the id and the current pass of recovery for metadata.
+ * Metadata operations have an id of 0. Match operations based on the id and the current pass of
+ * recovery for metadata.
*
- * Only apply operations in the correct metadata phase, and if the LSN
- * is more recent than the last checkpoint. If there is no entry for a
- * file, assume it was dropped or missing after a hot backup.
+ * Only apply operations in the correct metadata phase, and if the LSN is more recent than the
+ * last checkpoint. If there is no entry for a file, assume it was dropped or missing after a
+ * hot backup.
*/
metadata_op = id == WT_METAFILE_ID;
if (r->metadata_only != metadata_op)
@@ -536,7 +536,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
r.session = session;
WT_MAX_LSN(&r.max_ckpt_lsn);
WT_MAX_LSN(&r.max_rec_lsn);
- conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = 0;
+ conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = WT_TS_NONE;
F_SET(conn, WT_CONN_RECOVERING);
WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
@@ -575,15 +575,13 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
}
/*
- * First, do a pass through the log to recover the metadata, and
- * establish the last checkpoint LSN. Skip this when opening a hot
- * backup: we already have the correct metadata in that case.
+ * First, do a pass through the log to recover the metadata, and establish the last checkpoint
+ * LSN. Skip this when opening a hot backup: we already have the correct metadata in that case.
*
- * If we're running with salvage and we hit an error, we ignore it
- * and continue. In salvage we want to recover whatever part of the
- * data we can from the last checkpoint up until whatever problem we
- * detect in the log file. In salvage, we ignore errors from scanning
- * the log so recovery can continue. Other errors remain errors.
+ * If we're running with salvage and we hit an error, we ignore it and continue. In salvage we
+ * want to recover whatever part of the data we can from the last checkpoint up until whatever
+ * problem we detect in the log file. In salvage, we ignore errors from scanning the log so
+ * recovery can continue. Other errors remain errors.
*/
if (!was_backup) {
r.metadata_only = true;
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 97c83c47414..d3d9c2b4dfb 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -105,13 +105,11 @@ __txn_abort_newer_update(
first_upd = upd->next;
} else if (rollback_timestamp < upd->durable_ts) {
/*
- * If any updates are aborted, all newer updates
- * better be aborted as well.
+ * If any updates are aborted, all newer updates better be aborted as well.
*
- * Timestamp ordering relies on the validations at
- * the time of commit. Thus if the table is not
- * configured for key consistency check, the
- * the timestamps could be out of order here.
+ * Timestamp ordering relies on the validations at the time of commit. Thus if the table
+ * is not configured for key consistency check, the timestamps could be out of order
+ * here.
*/
WT_ASSERT(session, !FLD_ISSET(S2BT(session)->assert_flags, WT_ASSERT_COMMIT_TS_KEYS) ||
upd == first_upd);
@@ -222,34 +220,29 @@ __txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t
{
WT_DECL_RET;
WT_PAGE *page;
+ WT_PAGE_LOOKASIDE *page_las;
uint32_t read_flags;
bool local_read;
/*
- * If we created a page image with updates the need to be rolled back,
- * read the history into cache now and make sure the page is marked
- * dirty. Otherwise, the history we need could be swept from the
- * lookaside table before the page is read because the lookaside sweep
- * code has no way to tell that the page image is invalid.
+ * If we created a page image with updates that need to be rolled back, read the history into
+ * cache now and make sure the page is marked dirty. Otherwise, the history we need could be
+ * swept from the lookaside table before the page is read because the lookaside sweep code has
+ * no way to tell that the page image is invalid.
*
- * So, if there is lookaside history for a page, first check if the
- * history needs to be rolled back make sure that history is loaded
- * into cache. That is, if skew_newest is true, so the disk image
- * potentially contained unstable updates, and the history is more
- * recent than the rollback timestamp.
+ * So, if there is lookaside history for a page, first check if the history needs to be rolled
+ * back, then ensure the history is loaded into cache.
*
- * Also, we have separately discarded any lookaside history more recent
- * than the rollback timestamp. For page_las structures in cache,
- * reset any future timestamps back to the rollback timestamp. This
- * allows those structures to be discarded once the rollback timestamp
- * is stable (crucially for tests, they can be discarded if the
- * connection is closed right after a rollback_to_stable call).
+ * Also, we have separately discarded any lookaside history more recent than the rollback
+ * timestamp. For page_las structures in cache, reset any future timestamps back to the rollback
+ * timestamp. This allows those structures to be discarded once the rollback timestamp is stable
+ * (crucially for tests, they can be discarded if the connection is closed right after a
+ * rollback_to_stable call).
*/
local_read = false;
read_flags = WT_READ_WONT_NEED;
- if (ref->page_las != NULL) {
- if (ref->page_las->skew_newest &&
- rollback_timestamp < ref->page_las->unstable_durable_timestamp) {
+ if ((page_las = ref->page_las) != NULL) {
+ if (rollback_timestamp < page_las->max_ondisk_ts) {
/*
* Make sure we get back a page with history, not a limbo page.
*/
@@ -258,13 +251,10 @@ __txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t
WT_ASSERT(session,
ref->state != WT_REF_LIMBO && ref->page != NULL && __wt_page_is_modified(ref->page));
local_read = true;
+ page_las->max_ondisk_ts = rollback_timestamp;
}
- if (ref->page_las->max_timestamp > rollback_timestamp)
- ref->page_las->max_timestamp = rollback_timestamp;
- if (ref->page_las->unstable_durable_timestamp > rollback_timestamp)
- ref->page_las->unstable_durable_timestamp = rollback_timestamp;
- if (ref->page_las->unstable_timestamp > rollback_timestamp)
- ref->page_las->unstable_timestamp = rollback_timestamp;
+ if (rollback_timestamp < page_las->min_skipped_ts)
+ page_las->min_skipped_ts = rollback_timestamp;
}
/* Review deleted page saved to the ref */
@@ -272,18 +262,14 @@ __txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t
WT_ERR(__wt_delete_page_rollback(session, ref));
/*
- * If we have a ref with no page, or the page is clean, there is
- * nothing to roll back.
+ * If we have a ref with no page, or the page is clean, there is nothing to roll back.
*
- * This check for a clean page is partly an optimization (checkpoint
- * only marks pages clean when they have no unwritten updates so
- * there's no point visiting them again), but also covers a corner case
- * of a checkpoint with use_timestamp=false. Such a checkpoint
- * effectively moves the stable timestamp forward, because changes that
- * are written in the checkpoint cannot be reliably rolled back. The
- * actual stable timestamp doesn't change, though, so if we try to roll
- * back clean pages the in-memory tree can get out of sync with the
- * on-disk tree.
+ * This check for a clean page is partly an optimization (checkpoint only marks pages clean when
+ * they have no unwritten updates so there's no point visiting them again), but also covers a
+ * corner case of a checkpoint with use_timestamp=false. Such a checkpoint effectively moves the
+ * stable timestamp forward, because changes that are written in the checkpoint cannot be
+ * reliably rolled back. The actual stable timestamp doesn't change, though, so if we try to
+ * roll back clean pages the in-memory tree can get out of sync with the on-disk tree.
*/
if ((page = ref->page) == NULL || !__wt_page_is_modified(page))
goto err;
@@ -436,6 +422,7 @@ __txn_rollback_to_stable_check(WT_SESSION_IMPL *session)
conn = S2C(session);
txn_global = &conn->txn_global;
+
if (!txn_global->has_stable_timestamp)
WT_RET_MSG(session, EINVAL, "rollback_to_stable requires a stable timestamp");
@@ -472,13 +459,12 @@ __txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
WT_STAT_CONN_INCR(session, txn_rollback_to_stable);
/*
- * Mark that a rollback operation is in progress and wait for eviction
- * to drain. This is necessary because lookaside eviction uses
- * transactions and causes the check for a quiescent system to fail.
+ * Mark that a rollback operation is in progress and wait for eviction to drain. This is
+ * necessary because lookaside eviction uses transactions and causes the check for a quiescent
+ * system to fail.
*
- * Configuring lookaside eviction off isn't atomic, safe because the
- * flag is only otherwise set when closing down the database. Assert
- * to avoid confusion in the future.
+ * Configuring lookaside eviction off isn't atomic, but it's safe because the flag is only
+ * otherwise set when closing down the database. Assert to avoid confusion in the future.
*/
WT_ASSERT(session, !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE));
F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index 2d9291ebbce..a9ee80c953f 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -210,20 +210,18 @@ __txn_get_published_timestamp(WT_SESSION_IMPL *session, WT_TXN *txn)
wt_timestamp_t ts;
/*
- * Any checking of bit flags in this logic is invalid. __wt_txn_release
- * may have already been called on this transaction which will set the
- * flags member to 0. So we need to deduce which timestamp to use purely
- * by inspecting the timestamp members which we deliberately preserve
+ * Any checking of bit flags in this logic is invalid. __wt_txn_release may have already been
+ * called on this transaction which will set the flags member to 0. So we need to deduce which
+ * timestamp to use purely by inspecting the timestamp members which we deliberately preserve
* for reader threads such as ourselves.
*
- * In the non-prepared case, the first commit will either be less than
- * the commit (in the case of multiple commits) in which case we should
- * return the first commit. Or it will be equal to the commit (in the
- * case of a single commit) and we can return durable (which is mirrored
- * from the commit timestamp).
+ * In the non-prepared case, the first commit will either be less than the commit (in the case
+ * of multiple commits), in which case we should return the first commit; or it will be equal to
+ * the commit (in the case of a single commit), and we can return durable (which is mirrored from
+ * the commit timestamp).
*
- * In the prepared case, the first commit will always be equal to the
- * commit so we'll return durable.
+ * In the prepared case, the first commit will always be equal to the commit so we'll return
+ * durable.
*/
if (txn->commit_timestamp != txn->first_commit_timestamp)
ts = txn->first_commit_timestamp;
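The selection rule spelled out above fits in a few lines; this sketch uses stand-in types and field names rather than the real WT_TXN declaration:

    #include <stdint.h>

    typedef uint64_t wt_timestamp_t;

    struct txn_sketch {
        wt_timestamp_t commit_ts, durable_ts, first_commit_ts;
    };

    static wt_timestamp_t
    published_timestamp(const struct txn_sketch *txn)
    {
        /* Multiple commits: the first commit timestamp is the one to publish. */
        if (txn->commit_ts != txn->first_commit_ts)
            return (txn->first_commit_ts);

        /*
         * Single commit, or prepared (where first commit always equals commit):
         * return durable, which mirrors the commit timestamp.
         */
        return (txn->durable_ts);
    }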
@@ -546,14 +544,12 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
set:
__wt_writelock(session, &txn_global->rwlock);
/*
- * This method can be called from multiple threads, check that we are
- * moving the global timestamps forwards.
+ * This method can be called from multiple threads; check that we are moving the global
+ * timestamps forwards.
*
- * The exception is the durable timestamp, where the application can
- * move it backwards (in fact, it only really makes sense to explicitly
- * move it backwards because it otherwise tracks the largest
- * durable_timestamp so it moves forward whenever transactions are
- * assigned timestamps).
+ * The exception is the durable timestamp, where the application can move it backwards (in fact,
+ * it only really makes sense to explicitly move it backwards because it otherwise tracks the
+ * largest durable_timestamp so it moves forward whenever transactions are assigned timestamps).
*/
if (has_durable) {
txn_global->durable_timestamp = durable_ts;
diff --git a/src/third_party/wiredtiger/src/utilities/util_loadtext.c b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
index 1d4414b47b5..879eb0270f6 100644
--- a/src/third_party/wiredtiger/src/utilities/util_loadtext.c
+++ b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
@@ -68,8 +68,7 @@ text(WT_SESSION *session, const char *uri)
/*
* We're about to load strings, make sure the formats match.
*
- * Row-store tables have key/value pairs, column-store tables only have
- * values.
+ * Row-store tables have key/value pairs, column-store tables only have values.
*/
if (!WT_STREQ(cursor->value_format, "S") ||
(!WT_STREQ(cursor->key_format, "S") && !WT_STREQ(cursor->key_format, "r")))
diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
index bd127d8a686..ffa46247106 100644
--- a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
@@ -153,13 +153,12 @@ static WT_EVENT_HANDLER event_handler = {
};
/*
- * The following are various schema-related functions to have some threads
- * performing during the test. The goal is to make sure that after a random
- * abort, the database is left in a recoverable state. Yield during the
- * schema operations to increase chance of abort during them.
+ * The following are various schema-related functions for some threads to perform during the
+ * test. The goal is to make sure that after a random abort, the database is left in a recoverable
+ * state. Yield during the schema operations to increase the chance of an abort during them.
*
- * TODO: Currently only verifies insert data, it would be ideal to modify the
- * schema operations so that we can verify the state of the schema too.
+ * TODO: Currently this only verifies insert data; it would be ideal to modify the schema
+ * operations so that we can verify the state of the schema too.
*/
static void
diff --git a/src/third_party/wiredtiger/test/csuite/scope/main.c b/src/third_party/wiredtiger/test/csuite/scope/main.c
index dc7b312e5c8..57947fcf166 100644
--- a/src/third_party/wiredtiger/test/csuite/scope/main.c
+++ b/src/third_party/wiredtiger/test/csuite/scope/main.c
@@ -138,9 +138,8 @@ cursor_scope_ops(WT_SESSION *session, const char *uri)
}
/*
- * The application must keep key and value memory valid until
- * the next operation that positions the cursor, modifies the
- * data, or resets or closes the cursor.
+ * The application must keep key and value memory valid until the next operation that
+ * positions the cursor, modifies the data, or resets or closes the cursor.
*
* Modifying either the key or value buffers is not permitted.
*/
@@ -199,8 +198,8 @@ cursor_scope_ops(WT_SESSION *session, const char *uri)
case INSERT:
case REMOVE:
/*
- * Insert and remove configured with a search key do
- * not position the cursor and have no key or value.
+ * Insert and remove configured with a search key do not position the cursor and have no
+ * key or value.
*
* There should be two error messages, ignore them.
*/
@@ -217,8 +216,7 @@ cursor_scope_ops(WT_SESSION *session, const char *uri)
break;
case REMOVE_POS:
/*
- * Remove configured with a cursor position has a key,
- * but no value.
+ * Remove configured with a cursor position has a key, but no value.
*
* There should be one error message, ignore it.
*/
@@ -243,11 +241,10 @@ cursor_scope_ops(WT_SESSION *session, const char *uri)
case SEARCH_NEAR:
case UPDATE:
/*
- * Modify, reserve, search, search-near and update all
- * position the cursor and have both a key and value.
+ * Modify, reserve, search, search-near and update all position the cursor and have both
+ * a key and value.
*
- * Any key/value should not reference application
- * memory.
+ * Any key/value should not reference application memory.
*/
if (recno) {
testutil_assert(cursor->get_key(cursor, &keyr) == 0);
diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
index befc30eab61..4f31496994a 100644
--- a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
+++ b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c
@@ -154,13 +154,11 @@ fill_db(void)
save_lsn.l.file = 0;
/*
- * Write data into the table until we move to log file 2.
- * We do the calculation below so that we don't have to walk the
- * log for every record.
+ * Write data into the table until we move to log file 2. We do the calculation below so that we
+ * don't have to walk the log for every record.
*
- * Calculate about how many records should fit in the log file.
- * Subtract a bunch for metadata and file creation records.
- * Then subtract out a few more records to be conservative.
+ * Calculate about how many records should fit in the log file. Subtract a bunch for metadata
+ * and file creation records. Then subtract out a few more records to be conservative.
*/
units = (K_SIZE + V_SIZE) / 128 + 1;
min_key = 90000 / (units * 128) - 15;
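To make the arithmetic concrete, here is the same estimate as a standalone program; the K_SIZE and V_SIZE values are assumptions for illustration, not the test's actual configuration:

    #include <stdio.h>

    #define K_SIZE 16  /* assumed key size */
    #define V_SIZE 256 /* assumed value size */

    int
    main(void)
    {
        /* Round each key/value pair up to a whole number of 128-byte units. */
        int units = (K_SIZE + V_SIZE) / 128 + 1;

        /* Records that fit in ~90KB of log, minus a conservative margin. */
        int min_key = 90000 / (units * 128) - 15;

        printf("units=%d, min_key=%d\n", units, min_key); /* units=3, min_key=219 */
        return (0);
    }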
diff --git a/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c b/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c
index 3a39ffa4c57..147e907430f 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c
@@ -30,16 +30,14 @@
/*
* JIRA ticket reference: WT-2853
*
- * Test case description: create two threads: one is populating/updating
- * records in a table with a few indices, the other is reading from table and
- * indices. The test is adapted from one that uses cursor joins, this test
- * does not, but simulates some of the access patterns.
+ * Test case description: create two threads: one is populating/updating records in a table with a
+ * few indices, the other is reading from the table and indices. The test is adapted from one that
+ * uses cursor joins; this test does not, but simulates some of the access patterns.
*
- * Failure mode: after a second or two of progress by both threads, they both
- * appear to slow dramatically, almost locking up. After some time (I've
- * observed from a half minute to a few minutes), the lock up ends and both
- * threads seem to be inserting and reading at a normal fast pace. That
- * continues until the test ends (~30 seconds).
+ * Failure mode: after a second or two of progress by both threads, they both appear to slow
+ * dramatically, almost locking up. After some time (I've observed from a half minute to a few
+ * minutes), the lock up ends and both threads seem to be inserting and reading at a normal fast
+ * pace. That continues until the test ends (~30 seconds).
*/
static void *thread_insert(void *);
diff --git a/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c b/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c
index ff59ee95267..4e4e7f860a5 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c
@@ -31,41 +31,34 @@
#include <sys/wait.h>
/*
- * JIRA ticket reference: WT-2909
- * Test case description:
+ * JIRA ticket reference: WT-2909
+ *
+ * Test case description:
*
- * This test attempts to check the integrity of checkpoints by injecting
- * failures (by means of a custom file system) and then trying to recover. To
- * insulate the top level program from various crashes that may occur when
- * injecting failures, the "populate" code runs in another process, and is
- * expected to sometimes fail. Then the top level program runs recovery (with
- * the normal file system) and checks the results. Any failure at the top level
- * indicates a checkpoint integrity problem.
+ * This test attempts to check the integrity of checkpoints by injecting failures (by means of a
+ * custom file system) and then trying to recover. To insulate the top level program from various
+ * crashes that may occur when injecting failures, the "populate" code runs in another process, and
+ * is expected to sometimes fail. Then the top level program runs recovery (with the normal file
+ * system) and checks the results. Any failure at the top level indicates a checkpoint integrity
+ * problem.
*
- * Each subtest uses the same kind of schema and data, the only variance is
- * when the faults are injected. At the moment, this test only injects during
- * checkpoints, and only injects write failures. It varies in the number of
- * successful writes that occur before an injected failure (during a checkpoint
- * operation), this can be indicated with "-o N". When N is not specified, the
- * test attempts to find the optimal range of N for testing. Clearly when N is
- * large, then the checkpoint may be successfully written, and the data
- * represented by the checkpoint will be fully present. When N is small,
- * nothing of interest is written and no data is present. To find the sweet
- * spot where interesting failures occur, the test does a binary search to find
- * the approximate N that divides the "small" and "large" cases. This is not
- * strictly deterministic, a given N may give different results on different
- * runs. But approximate optimal N can be determined, allowing a series of
- * additional tests clustered around this N.
+ * Each subtest uses the same kind of schema and data; the only variance is when the faults are
+ * injected. At the moment, this test only injects during checkpoints, and only injects write
+ * failures. It varies the number of successful writes that occur before an injected failure
+ * (during a checkpoint operation); this can be indicated with "-o N". When N is not specified, the
+ * test attempts to find the optimal range of N for testing. Clearly when N is large, then the
+ * checkpoint may be successfully written, and the data represented by the checkpoint will be fully
+ * present. When N is small, nothing of interest is written and no data is present. To find the
+ * sweet spot where interesting failures occur, the test does a binary search to find the
+ * approximate N that divides the "small" and "large" cases. This is not strictly deterministic: a
+ * given N may give different results on different runs. But an approximate optimal N can be
+ * determined, allowing a series of additional tests clustered around this N.
*
- * The data is stored in two tables, one having indices. Both tables have
- * the same keys and are updated with the same key in a single transaction.
+ * The data is stored in two tables, one having indices. Both tables have the same keys and are
+ * updated with the same key in a single transaction.
*
- * Failure mode:
- * If one table is out of step with the other, that is detected as a failure at
- * the top level. If an index is missing values (or has extra values), that is
- * likewise a failure at the top level. If the tables or the home directory
- * cannot be opened, that is a top level error. The tables must be present
- * as an initial checkpoint is done without any injected fault.
+ * Failure mode: If one table is out of step with the other, that is detected as a failure at the
+ * top level. If an index is missing values (or has extra values), that is likewise a failure at the
+ * top level. If the tables or the home directory cannot be opened, that is a top level error. The
+ * tables must be present as an initial checkpoint is done without any injected fault.
*/
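The binary search the comment describes can be sketched as below; outcome_is_large() is a hypothetical probe standing in for actually running a subtest with N injected-failure writes:

    #include <stdbool.h>

    static bool
    outcome_is_large(int n)
    {
        return (n > 137); /* stand-in: a real probe runs the workload */
    }

    /* Find the approximate N dividing the "small" and "large" cases. */
    static int
    find_boundary_n(int lo, int hi)
    {
        while (lo < hi) {
            int mid = lo + (hi - lo) / 2;
            if (outcome_is_large(mid))
                hi = mid;     /* boundary is at mid or below */
            else
                lo = mid + 1; /* boundary is above mid */
        }
        return (lo);
    }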
/*
diff --git a/src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c b/src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c
index 3bf02ed3f3c..a0443afa023 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c
@@ -30,14 +30,12 @@
/*
* JIRA ticket reference: WT-2999
*
- * Test case description: Create a table that stores ~4K size blobs;
- * two indices are defined using a pair of custom extractors
- * that pull the first and second 32-bit integers from the blob.
- * A simple join is created using the two indices, and iterated.
+ * Test case description: Create a table that stores ~4K size blobs; two indices are defined using a
+ * pair of custom extractors that pull the first and second 32-bit integers from the blob. A simple
+ * join is created using the two indices, and iterated.
*
- * Failure mode: When a custom extractor is used with cursor
- * joins, there are memory leaks at the point where the extractor
- * sets the key.
+ * Failure mode: When a custom extractor is used with cursor joins, there are memory leaks at the
+ * point where the extractor sets the key.
*/
static int
custom_extract1(WT_EXTRACTOR *extractor, WT_SESSION *session, const WT_ITEM *key,
diff --git a/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c b/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
index 97b2a1a03a2..d70a9e0475e 100644
--- a/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
@@ -30,16 +30,15 @@
/*
* JIRA ticket reference: WT-3363
*
- * Test case description: There are a number of operations that we run that we
- * expect not to conflict with or block against a running checkpoint. This test
- * aims to run repeated checkpoints in a thread, while running an assortment
- * of operations that we expect to execute quickly on further threads. To
- * ensure that we catch any blockages we introduce a very large delay into the
+ * Test case description: There are a number of operations that we run that we expect not to
+ * conflict with or block against a running checkpoint. This test aims to run repeated checkpoints
+ * in a thread, while running an assortment of operations that we expect to execute quickly on
+ * further threads. To ensure that we catch any blockages we introduce a very large delay into the
* checkpoint and measure that no operation takes 1/2 the length of this delay.
*
- * Failure mode: We monitor the execution time of all operations and if we find
- * any operation taking longer than 1/2 the delay time, we abort dumping a core
- * file which can be used to determine what operation was blocked.
+ * Failure mode: We monitor the execution time of all operations and if we find any operation
+ * taking longer than 1/2 the delay time, we abort, dumping a core file which can be used to
+ * determine what operation was blocked.
*/
static WT_THREAD_RET do_checkpoints(void *);
static WT_THREAD_RET do_ops(void *);
diff --git a/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c b/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c
index 264dbbb5679..442d3afb306 100644
--- a/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c
@@ -59,11 +59,11 @@ handle_message(WT_EVENT_HANDLER *handler, WT_SESSION *session, int error, const
stderr, "Got cache overflow error (expect_panic=%s)\n", expect_panic ? "true" : "false");
/*
- * If we're expecting a panic, exit with zero to indicate to the
- * parent that this test was successful.
+ * If we're expecting a panic, exit with zero to indicate to the parent that this test was
+ * successful.
*
- * If not, don't intercept. We'll naturally exit with non-zero
- * if we're terminating due to panic.
+ * If not, don't intercept. We'll naturally exit with non-zero if we're terminating due to
+ * panic.
*/
if (expect_panic)
exit(EXIT_SUCCESS);
@@ -101,15 +101,13 @@ las_workload(TEST_OPTS *opts, const char *las_file_max)
}
/*
- * Open a snapshot isolation transaction in another session. This forces
- * the cache to retain all previous values. Then update all keys with a
- * new value in the original session while keeping that snapshot
- * transaction open. With the large value buffer, small cache and lots
- * of keys, this will force a lot of lookaside usage.
+ * Open a snapshot isolation transaction in another session. This forces the cache to retain all
+ * previous values. Then update all keys with a new value in the original session while keeping
+ * that snapshot transaction open. With the large value buffer, small cache and lots of keys,
+ * this will force a lot of lookaside usage.
*
- * When the file_max setting is small, the maximum size should easily be
- * reached and we should panic. When the maximum size is large or not
- * set, then we should succeed.
+ * When the file_max setting is small, the maximum size should easily be reached and we should
+ * panic. When the maximum size is large or not set, then we should succeed.
*/
testutil_check(opts->conn->open_session(opts->conn, NULL, NULL, &other_session));
testutil_check(other_session->begin_transaction(other_session, "isolation=snapshot"));
@@ -147,11 +145,11 @@ test_las_workload(TEST_OPTS *opts, const char *las_file_max)
testutil_make_work_dir(opts->home);
/*
- * Since it's possible that the workload will panic and abort, we will
- * fork the process and execute the workload in the child process.
+ * Since it's possible that the workload will panic and abort, we will fork the process and
+ * execute the workload in the child process.
*
- * This way, we can safely check the exit code of the child process and
- * confirm that it is what we expected.
+ * This way, we can safely check the exit code of the child process and confirm that it is what
+ * we expected.
*/
pid = fork();
if (pid < 0)
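The fork-and-verify pattern described above, reduced to a self-contained sketch; run_workload() is a hypothetical stand-in for the test's las_workload() call:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static void
    run_workload(void)
    {
        /* Exercise the database here; this may panic and abort. */
    }

    int
    main(void)
    {
        int status;
        pid_t pid;

        if ((pid = fork()) < 0)
            exit(EXIT_FAILURE);
        if (pid == 0) {
            run_workload();     /* child: may abort */
            exit(EXIT_SUCCESS); /* child survived */
        }
        if (waitpid(pid, &status, 0) != pid)
            exit(EXIT_FAILURE);
        if (WIFEXITED(status))
            printf("child exited with code %d\n", WEXITSTATUS(status));
        else if (WIFSIGNALED(status))
            printf("child killed by signal %d\n", WTERMSIG(status));
        return (0);
    }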
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index bf20d7568bc..a61c797e683 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -4,7 +4,7 @@
#
functions:
- "fetch source" :
+ "get project" :
command: git.get_project
params:
directory: wiredtiger
@@ -13,7 +13,7 @@ functions:
params:
aws_key: ${aws_key}
aws_secret: ${aws_secret}
- remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${build_id}.tgz
+ remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${dependent_task|compile}_${build_id}.tgz
bucket: build_external
extract_to: wiredtiger
"fetch artifacts from little-endian" :
@@ -54,11 +54,13 @@ functions:
set -o errexit
set -o verbose
if [ "Windows_NT" = "$OS" ]; then
- scons.bat --enable-python=c:\\swigwin-3.0.2\\swig.exe --enable-diagnostic ${smp_command|}
+ pip install scons==3.1.1
+ scons-3.1.1.bat ${win_configure_flags|--enable-python=c:\\swigwin-3.0.2\\swig.exe --enable-diagnostic} ${smp_command|}
else
cd build_posix
sh ./reconf
- ${configure_env_vars|} ../configure ${configure_python_setting|} --enable-diagnostic --enable-python --enable-zlib --enable-strict --enable-static --prefix=$(pwd)/LOCAL_INSTALL
+ ${configure_env_vars|} ../configure ${configure_python_setting|} \
+ ${posix_configure_flags|--enable-diagnostic --enable-python --enable-zlib --enable-strict --enable-static --prefix=$(pwd)/LOCAL_INSTALL}
${make_command|make} ${smp_command|} 2>&1
# On macOS, change the binary location with install_name_tool since DYLD_LIBRARY_PATH
@@ -70,62 +72,59 @@ functions:
install_name_tool -change /usr/local/lib/libwiredtiger-$WT_VERSION.dylib $(pwd)/.libs/libwiredtiger-$WT_VERSION.dylib .libs/wt
fi
fi
-
-pre:
- - command: shell.exec
- params:
- script: |
- rm -rf "wiredtiger"
-post:
- - command: shell.exec
+ "make check directory":
+ command: shell.exec
params:
- working_dir: "wiredtiger"
+ working_dir: "wiredtiger/build_posix"
script: |
set -o errexit
set -o verbose
- tar cfz ../wiredtiger.tgz .
- - command: s3.put
+
+ ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C ${directory} ${smp_command|} 2>&1
+ "upload artifact":
+ - command: archive.targz_pack
+ params:
+ target: "wiredtiger.tgz"
+ source_dir: "wiredtiger"
+ include:
+ - "./**"
+ - command: s3.put
+ params:
+ aws_secret: ${aws_secret}
+ aws_key: ${aws_key}
+ local_file: wiredtiger.tgz
+ bucket: build_external
+ permissions: public-read
+ content_type: application/tar
+ display_name: Artifacts
+ remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${task_name}_${build_id}.tgz
+ "cleanup":
+ command: shell.exec
params:
- aws_secret: ${aws_secret}
- aws_key: ${aws_key}
- local_file: wiredtiger.tgz
- bucket: build_external
- permissions: public-read
- content_type: application/tar
- display_name: Artifacts
- remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${task_id}.tgz
+ script: |
+ rm -rf "wiredtiger"
+ rm -rf "wiredtiger.tgz"
+
+pre:
+ - func: "cleanup"
+post:
+ - func: "upload artifact"
+ - func: "cleanup"
tasks:
## Base compile task on posix flavours
- name: compile
commands:
- - func: "fetch source"
- - command: git.apply_patch
- params:
- directory: wiredtiger
+ - func: "get project"
- func: "compile wiredtiger"
- - command: archive.targz_pack
- params:
- target: "wiredtiger.tgz"
- source_dir: "wiredtiger"
- include:
- - "./**"
- - command: s3.put
- params:
- aws_secret: ${aws_secret}
- aws_key: ${aws_key}
- local_file: wiredtiger.tgz
- bucket: build_external
- permissions: public-read
- content_type: application/tar
- display_name: Artifacts_compile
- remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${build_id}.tgz
- # Remove the artifacts here so the later post commands won't perform duplicated archiving.
- - command: shell.exec
- params:
- script: |
- rm -rf "wiredtiger"
- rm -rf "wiredtiger.tgz"
+
+ - name: compile-asan
+ commands:
+ - func: "get project"
+ - func: "compile wiredtiger"
+ vars:
+ configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/clang CXX=/opt/mongodbtoolchain/v3/bin/clang++ ASAN_OPTIONS=detect_leaks=1:abort_on_error=1:disable_coredump=0 ASAN_SYMBOLIZER_PATH=/opt/mongodbtoolchain/v3/bin/llvm-symbolizer CFLAGS=-fsanitize=address
+ posix_configure_flags: --enable-silent-rules --enable-strict --enable-diagnostic --disable-static
- name: make-check-test
depends_on:
@@ -150,14 +149,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C lang/python ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: lang/python
- name: examples-c-test
depends_on:
@@ -165,14 +159,24 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
+ - func: "make check directory"
+ vars:
+ directory: examples/c
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C examples/c ${smp_command|} 2>&1
+ - name: examples-c-test-asan
+ depends_on:
+ - name: compile-asan
+ commands:
+ - func: "fetch artifacts"
+ vars:
+ dependent_task: compile-asan
+ - func: "compile wiredtiger"
+ vars:
+ configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/clang CXX=/opt/mongodbtoolchain/v3/bin/clang++ ASAN_OPTIONS=detect_leaks=1:abort_on_error=1:disable_coredump=0 ASAN_SYMBOLIZER_PATH=/opt/mongodbtoolchain/v3/bin/llvm-symbolizer CFLAGS=-fsanitize=address
+ posix_configure_flags: --enable-silent-rules --enable-strict --enable-diagnostic --disable-static
+ - func: "make check directory"
+ vars:
+ directory: examples/c
- name: bloom-test
depends_on:
@@ -180,14 +184,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/bloom ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/bloom
- name: checkpoint-test
depends_on:
@@ -195,14 +194,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/checkpoint ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/checkpoint
- name: cursor-order-test
depends_on:
@@ -210,14 +204,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/cursor_order ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/cursor_order
- name: fops-test
depends_on:
@@ -225,14 +214,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/fops ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/fops
- name: format-test
depends_on:
@@ -240,14 +224,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/format ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/format
- name: huge-test
depends_on:
@@ -255,14 +234,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/huge ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/huge
- name: manydbs-test
depends_on:
@@ -270,14 +244,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/manydbs ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/manydbs
- name: packing-test
depends_on:
@@ -285,14 +254,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/packing ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/packing
- name: readonly-test
depends_on:
@@ -300,14 +264,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/readonly ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/readonly
- name: salvage-test
depends_on:
@@ -315,14 +274,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/salvage ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/salvage
- name: thread-test
depends_on:
@@ -330,14 +284,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/thread ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: test/thread
- name: bench-wtperf-test
depends_on:
@@ -345,14 +294,9 @@ tasks:
commands:
- func: "fetch artifacts"
- func: "compile wiredtiger"
- - command: shell.exec
- params:
- working_dir: "wiredtiger/build_posix"
- script: |
- set -o errexit
- set -o verbose
-
- ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C bench/wtperf ${smp_command|} 2>&1
+ - func: "make check directory"
+ vars:
+ directory: bench/wtperf
# End of normal make check test tasks
@@ -856,7 +800,7 @@ tasks:
# Avoid /usr/bin/python, at least on macOS: with System Integrity
# Protection enabled, it ignores DYLD_LIBRARY_PATH and hence
# doesn't find the WiredTiger library in the local tree.
- ${test_env_vars|} python ../test/suite/run.py -v 2 ${smp_command|} 2>&1
+ ${test_env_vars|} ${python_binary|python} ../test/suite/run.py -v 2 ${smp_command|} 2>&1
# Break out Python unit tests into multiple buckets/tasks based on test name and runtime
# The test/suite/run.py script can work out test names by casting each command argument
@@ -991,7 +935,8 @@ tasks:
set -o errexit
set -o verbose
- scons.bat ${smp_command|} "CFLAGS=/Gv /wd4090 /wd4996 /we4047 /we4024 /TC /we4100 /we4133" wiredtiger.dll libwiredtiger.lib
+ pip install scons==3.1.1
+ scons-3.1.1.bat ${smp_command|} "CFLAGS=/Gv /wd4090 /wd4996 /we4047 /we4024 /TC /we4100 /we4133" wiredtiger.dll libwiredtiger.lib
- name: fops
depends_on:
@@ -1025,16 +970,14 @@ tasks:
cmd.exe /c "cd test\\format && ..\\..\\t_format.exe reverse=0 encryption=none logging_compression=none runs=20"
- name: million-collection-test
- depends_on: []
- run_on:
- - rhel62-large
commands:
- - func: "fetch source"
+ - func: "get project"
- func: "fetch mongo-tests repo"
- command: shell.exec
params:
working_dir: mongo-tests
script: |
+ sudo su
set -o errexit
set -o verbose
ulimit -n 1000000
@@ -1043,7 +986,7 @@ tasks:
- name: compatibility-test-for-mongodb-releases
commands:
- - func: "fetch source"
+ - func: "get project"
- command: shell.exec
params:
working_dir: "wiredtiger"
@@ -1176,10 +1119,10 @@ tasks:
buildvariants:
-- name: ubuntu1404
- display_name: Ubuntu 14.04
+- name: ubuntu1804
+ display_name: Ubuntu 18.04
run_on:
- - ubuntu1404-test
+ - ubuntu1804-test
expansions:
# It's ugly, but we need the absolute path here, not the relative
test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs top_srcdir=$(pwd)/.. top_builddir=$(pwd)
@@ -1243,11 +1186,13 @@ buildvariants:
- name: unit-test-bucket06
- name: unit-test-bucket07
- name: fops
+ - name: compile-asan
+ - name: examples-c-test-asan
-- name: ubuntu1404-python3
- display_name: Ubuntu 14.04 (Python3)
+- name: ubuntu1804-python3
+ display_name: Ubuntu 18.04 (Python3)
run_on:
- - ubuntu1404-test
+ - ubuntu1804-test
expansions:
test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs top_srcdir=$(pwd)/.. top_builddir=$(pwd)
smp_command: -j $(grep -c ^processor /proc/cpuinfo)
@@ -1266,11 +1211,26 @@ buildvariants:
- name: unit-test-bucket06
- name: unit-test-bucket07
+- name: rhel80
+ display_name: RHEL 8.0
+ run_on:
+ - rhel80-test
+ expansions:
+ test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs top_srcdir=$(pwd)/.. top_builddir=$(pwd)
+ smp_command: -j $(grep -c ^processor /proc/cpuinfo)
+ configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ PATH=/opt/mongodbtoolchain/v3/bin:$PATH
+ make_command: PATH=/opt/mongodbtoolchain/v3/bin:$PATH make
+ tasks:
+ - name: compile
+ - name: make-check-test
+ - name: unit-test
+ - name: fops
+
- name: large-scale-test
display_name: Large scale testing
batchtime: 1440 # 1 day
run_on:
- - rhel62-large
+ - rhel80-build
expansions:
configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++
tasks:
@@ -1280,14 +1240,14 @@ buildvariants:
display_name: Compatibility tests
batchtime: 10080 # 7 days
run_on:
- - ubuntu1404-test
+ - ubuntu1804-test
tasks:
- name: compatibility-test-for-mongodb-releases
- name: windows-64
display_name: Windows 64-bit
run_on:
- - windows-64-vs2013-test
+ - windows-64-vs2017-test
tasks:
- name: compile
- name: compile-windows-alt
@@ -1320,7 +1280,7 @@ buildvariants:
- name: little-endian
display_name: Little-endian (x86)
run_on:
- - ubuntu1404-test
+ - ubuntu1804-test
batchtime: 10080 # 7 days
expansions:
smp_command: -j $(grep -c ^processor /proc/cpuinfo)
@@ -1337,7 +1297,7 @@ buildvariants:
modules:
- enterprise
run_on:
- - ubuntu1604-zseries-small
+ - ubuntu1804-zseries-build
batchtime: 10080 # 7 days
expansions:
smp_command: -j $(grep -c ^processor /proc/cpuinfo)
diff --git a/src/third_party/wiredtiger/test/format/Makefile.am b/src/third_party/wiredtiger/test/format/Makefile.am
index da55ffece4c..2d9bbf21eb8 100644
--- a/src/third_party/wiredtiger/test/format/Makefile.am
+++ b/src/third_party/wiredtiger/test/format/Makefile.am
@@ -4,7 +4,7 @@ AM_CPPFLAGS +=-I$(top_srcdir)/test/utility
noinst_PROGRAMS = t
t_SOURCES =\
- backup.c bulk.c compact.c config.c lrt.c ops.c rebalance.c \
+ backup.c bulk.c compact.c config.c lrt.c ops.c random.c rebalance.c \
salvage.c snap.c t.c util.c wts.c
t_LDADD = $(top_builddir)/test/utility/libtest_util.la
diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c
index 303b0e4dbca..0f46e645311 100644
--- a/src/third_party/wiredtiger/test/format/bulk.c
+++ b/src/third_party/wiredtiger/test/format/bulk.c
@@ -151,15 +151,13 @@ wts_load(void)
}
/*
- * We don't want to size the cache to ensure the initial data
- * set can load in the in-memory case, guaranteeing the load
- * succeeds probably means future updates are also guaranteed
- * to succeed, which isn't what we want. If we run out of space
- * in the initial load, reset the row counter and continue.
+ * We don't want to size the cache to ensure the initial data set can load in the in-memory
+ * case: guaranteeing the load succeeds probably means future updates are also guaranteed to
+ * succeed, which isn't what we want. If we run out of space in the initial load, reset the
+ * row counter and continue.
*
- * Decrease inserts, they can't be successful if we're at the
- * cache limit, and increase the delete percentage to get some
- * extra space once the run starts.
+ * Decrease inserts, since they can't be successful if we're at the cache limit, and increase
+ * the delete percentage to get some extra space once the run starts.
*/
if ((ret = cursor->insert(cursor)) != 0) {
testutil_assert(ret == WT_CACHE_FULL || ret == WT_ROLLBACK);
diff --git a/src/third_party/wiredtiger/test/format/compact.c b/src/third_party/wiredtiger/test/format/compact.c
index e0492b7d5d6..a8c7ea4b3f9 100644
--- a/src/third_party/wiredtiger/test/format/compact.c
+++ b/src/third_party/wiredtiger/test/format/compact.c
@@ -60,12 +60,11 @@ compact(void *arg)
break;
/*
- * Compact can return EBUSY if concurrent with alter or if there
- * is eviction pressure, or we collide with checkpoints.
+ * Compact can return EBUSY if concurrent with alter or if there is eviction pressure, or we
+ * collide with checkpoints.
*
- * Compact returns ETIMEDOUT if the compaction doesn't finish in
- * in some number of seconds. We don't configure a timeout and
- * occasionally exceed the default of 1200 seconds.
+ * Compact returns ETIMEDOUT if the compaction doesn't finish in some number of seconds. We
+ * don't configure a timeout and occasionally exceed the default of 1200 seconds.
*/
ret = session->compact(session, g.uri, NULL);
if (ret != 0 && ret != EBUSY && ret != ETIMEDOUT && ret != WT_ROLLBACK)
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 712bd27fffb..8cec1318efc 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -109,11 +109,10 @@ config_setup(void)
/*
* LSM requires a row-store and backing disk.
*
- * Configuring truncation or timestamps results in LSM
- * cache problems, don't configure LSM if those set.
+ * Configuring truncation or timestamps results in LSM cache problems, don't
+ * configure LSM if those are set.
*
- * XXX
- * Remove the timestamp test when WT-4162 resolved.
+ * XXX Remove the timestamp test when WT-4162 is resolved.
*/
if (g.type != ROW || g.c_in_memory)
break;
@@ -209,16 +208,14 @@ config_setup(void)
/*
* Run-length is configured by a number of operations and a timer.
*
- * If the operation count and the timer are both configured, do nothing.
- * If only the timer is configured, clear the operations count.
- * If only the operation count is configured, limit the run to 6 hours.
- * If neither is configured, leave the operations count alone and limit
- * the run to 30 minutes.
+ * If the operation count and the timer are both configured, do nothing. If only the timer is
+ * configured, clear the operations count. If only the operation count is configured, limit the
+ * run to 6 hours. If neither is configured, leave the operations count alone and limit the run
+ * to 30 minutes.
*
- * In other words, if we rolled the dice on everything, do a short run.
- * If we chose a number of operations but the rest of the configuration
- * means operations take a long time to complete (for example, a small
- * cache and many worker threads), don't let it run forever.
+ * In other words, if we rolled the dice on everything, do a short run. If we chose a number of
+ * operations but the rest of the configuration means operations take a long time to complete
+ * (for example, a small cache and many worker threads), don't let it run forever.
*/
if (config_is_perm("timer")) {
if (!config_is_perm("ops"))
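A logic-only sketch of the four cases above; the function and parameter names are illustrative, with 0 standing for "not configured":

    #include <stdint.h>

    static void
    config_run_length(uint32_t *ops, uint32_t *timer_min)
    {
        if (*ops != 0 && *timer_min != 0)
            return;              /* both configured: do nothing */
        if (*timer_min != 0)
            *ops = 0;            /* timer only: clear the operation count */
        else if (*ops != 0)
            *timer_min = 6 * 60; /* operation count only: limit to 6 hours */
        else
            *timer_min = 30;     /* neither: limit the run to 30 minutes */
    }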
@@ -263,16 +260,14 @@ config_cache(void)
/*
* Maximum internal/leaf page size sanity.
*
- * Ensure we can service at least one operation per-thread concurrently
- * without filling the cache with pinned pages, that is, every thread
- * consuming an internal page and a leaf page (or a pair of leaf pages
- * for cursor movements).
+ * Ensure we can service at least one operation per-thread concurrently without filling the
+ * cache with pinned pages, that is, every thread consuming an internal page and a leaf page (or
+ * a pair of leaf pages for cursor movements).
*
* Maximum memory pages are in units of MB.
*
- * This code is what dramatically increases the cache size when there
- * are lots of threads, it grows the cache to several megabytes per
- * thread.
+ * This code is what dramatically increases the cache size when there are lots of threads; it
+ * grows the cache to several megabytes per thread.
*/
g.c_cache = WT_MAX(g.c_cache, 2 * g.c_threads * g.c_memory_page_max);
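Worked numbers for the sizing rule, under assumed values of 32 threads and a 10MB maximum memory page:

    #include <stdio.h>

    #define WT_MAX(a, b) ((a) > (b) ? (a) : (b))

    int
    main(void)
    {
        unsigned cache = 100, threads = 32, memory_page_max = 10; /* MB */

        /* Every thread may pin an internal page plus a leaf page (or two leaves). */
        cache = WT_MAX(cache, 2 * threads * memory_page_max);
        printf("cache=%uMB\n", cache); /* prints cache=640MB */
        return (0);
    }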
@@ -368,8 +363,7 @@ config_compression(const char *conf_name)
/*
* Select a compression type from the list of built-in engines.
*
- * Listed percentages are only correct if all of the possible engines
- * are compiled in.
+ * Listed percentages are only correct if all of the possible engines are compiled in.
*/
switch (mmrand(NULL, 1, 20)) {
#ifdef HAVE_BUILTIN_EXTENSION_LZ4
@@ -657,13 +651,11 @@ config_pct(void)
}
/*
- * Walk the list, allocating random numbers of operations in a random
- * order.
+ * Walk the list, allocating random numbers of operations in a random order.
*
- * If the "order" field is non-zero, we need to create a value for this
- * operation. Find the largest order field in the array; if one non-zero
- * order field is found, it's the last entry and gets the remainder of
- * the operations.
+ * If the "order" field is non-zero, we need to create a value for this operation. Find the
+ * largest order field in the array; if one non-zero order field is found, it's the last entry
+ * and gets the remainder of the operations.
*/
for (pct = 100 - pct;;) {
for (i = n = max_order = max_slot = 0; i < WT_ELEMENTS(list); ++i) {
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h
index 58decce75af..492d5124a1c 100644
--- a/src/third_party/wiredtiger/test/format/config.h
+++ b/src/third_party/wiredtiger/test/format/config.h
@@ -222,6 +222,9 @@ static CONFIG c[] = {{"abort", "if timed run should drop core", /* 0% */
{"quiet", "quiet run (same as -q)", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_quiet, NULL},
+ {"random_cursor", "if random cursor reads configured", /* 10% */
+ C_BOOL, 10, 0, 0, &g.c_random_cursor, NULL},
+
{"read_pct", "percent operations that are reads", C_IGNORE, 0, 0, 100, &g.c_read_pct, NULL},
{"rebalance", "rebalance testing", /* 100% */
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index e90bbf86998..d8cfea5730a 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -90,13 +90,11 @@ typedef struct {
WT_RAND_STATE rnd; /* Global RNG state */
/*
- * Prepare will return an error if the prepare timestamp is less than
- * any active read timestamp. Lock across allocating prepare and read
- * timestamps.
+ * Prepare will return an error if the prepare timestamp is less than any active read timestamp.
+ * Lock across allocating prepare and read timestamps.
*
- * We get the last committed timestamp periodically in order to update
- * the oldest timestamp, that requires locking out transactional ops
- * that set a timestamp.
+ * We get the last committed timestamp periodically in order to update the oldest timestamp,
+ * which requires locking out transactional ops that set a timestamp.
*/
pthread_rwlock_t ts_lock;
@@ -178,6 +176,7 @@ typedef struct {
uint32_t c_prefix_compression_min;
uint32_t c_prepare;
uint32_t c_quiet;
+ uint32_t c_random_cursor;
uint32_t c_read_pct;
uint32_t c_rebalance;
uint32_t c_repeat_data_pct;
@@ -345,6 +344,7 @@ void key_gen_insert(WT_RAND_STATE *, WT_ITEM *, uint64_t);
void key_gen_teardown(WT_ITEM *);
void key_init(void);
WT_THREAD_RET lrt(void *);
+WT_THREAD_RET random_kv(void *);
void path_setup(const char *);
int read_row_worker(WT_CURSOR *, uint64_t, WT_ITEM *, WT_ITEM *, bool);
uint32_t rng(WT_RAND_STATE *);
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index a03b42e427b..f136372260c 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -87,7 +87,7 @@ wts_ops(bool lastrun)
TINFO *tinfo, total;
WT_CONNECTION *conn;
WT_SESSION *session;
- wt_thread_t alter_tid, backup_tid, checkpoint_tid, compact_tid, lrt_tid;
+ wt_thread_t alter_tid, backup_tid, checkpoint_tid, compact_tid, lrt_tid, random_tid;
wt_thread_t timestamp_tid;
int64_t fourths, quit_fourths, thread_ops;
uint32_t i;
@@ -101,20 +101,19 @@ wts_ops(bool lastrun)
memset(&checkpoint_tid, 0, sizeof(checkpoint_tid));
memset(&compact_tid, 0, sizeof(compact_tid));
memset(&lrt_tid, 0, sizeof(lrt_tid));
+ memset(&random_tid, 0, sizeof(random_tid));
memset(&timestamp_tid, 0, sizeof(timestamp_tid));
modify_repl_init();
/*
- * There are two mechanisms to specify the length of the run, a number
- * of operations and a timer, when either expire the run terminates.
+ * There are two mechanisms to specify the length of the run, a number of operations and a
+ * timer; when either expires, the run terminates.
*
- * Each thread does an equal share of the total operations (and make
- * sure that it's not 0).
+ * Each thread does an equal share of the total operations (and we make sure that it's not 0).
*
- * Calculate how many fourth-of-a-second sleeps until the timer expires.
- * If the timer expires and threads don't return in 15 minutes, assume
- * there is something hung, and force the quit.
+ * Calculate how many fourth-of-a-second sleeps until the timer expires. If the timer expires
+ * and threads don't return in 15 minutes, assume there is something hung, and force the quit.
*/
if (g.c_ops == 0)
thread_ops = -1;
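The quarter-second bookkeeping works out as follows, assuming a 30-minute run timer (the 15-minute hang allowance is from the comment above):

    #include <stdio.h>

    int
    main(void)
    {
        long timer_minutes = 30; /* assumed run timer */

        /* Quarter-second sleeps until the timer expires... */
        long fourths = timer_minutes * 60 * 4;

        /* ...plus 15 minutes of grace before forcing the quit. */
        long quit_fourths = fourths + 15 * 60 * 4;

        printf("fourths=%ld, quit_fourths=%ld\n", fourths, quit_fourths);
        /* prints fourths=7200, quit_fourths=10800 */
        return (0);
    }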
@@ -183,6 +182,8 @@ wts_ops(bool lastrun)
testutil_check(__wt_thread_create(NULL, &compact_tid, compact, NULL));
if (!SINGLETHREADED && g.c_long_running_txn)
testutil_check(__wt_thread_create(NULL, &lrt_tid, lrt, NULL));
+ if (g.c_random_cursor)
+ testutil_check(__wt_thread_create(NULL, &random_tid, random_kv, NULL));
if (g.c_txn_timestamps)
testutil_check(__wt_thread_create(NULL, &timestamp_tid, timestamp, tinfo_list));
@@ -267,6 +268,8 @@ wts_ops(bool lastrun)
testutil_check(__wt_thread_join(NULL, &compact_tid));
if (!SINGLETHREADED && g.c_long_running_txn)
testutil_check(__wt_thread_join(NULL, &lrt_tid));
+ if (g.c_random_cursor)
+ testutil_check(__wt_thread_join(NULL, &random_tid));
if (g.c_txn_timestamps)
testutil_check(__wt_thread_join(NULL, &timestamp_tid));
g.workers_finished = false;
@@ -335,9 +338,8 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
/*
* Otherwise, pick a current timestamp.
*
- * Prepare returns an error if the prepare timestamp is less
- * than any active read timestamp, single-thread transaction
- * prepare and begin.
+ * Prepare returns an error if the prepare timestamp is less than any active read timestamp,
+ * single-thread transaction prepare and begin.
*
* Lock out the oldest timestamp update.
*/
@@ -469,12 +471,12 @@ prepare_transaction(TINFO *tinfo)
++tinfo->prepare;
/*
- * Prepare timestamps must be less than or equal to the eventual commit
- * timestamp. Set the prepare timestamp to whatever the global value is
- * now. The subsequent commit will increment it, ensuring correctness.
+ * Prepare timestamps must be less than or equal to the eventual commit timestamp. Set the
+ * prepare timestamp to whatever the global value is now. The subsequent commit will increment
+ * it, ensuring correctness.
*
- * Prepare returns an error if the prepare timestamp is less than any
- * active read timestamp, single-thread transaction prepare and begin.
+ * Prepare returns an error if the prepare timestamp is less than any active read timestamp,
+ * single-thread transaction prepare and begin.
*
* Lock out the oldest timestamp update.
*/
@@ -568,11 +570,9 @@ ops_open_session(TINFO *tinfo, bool *ckpt_handlep)
}
if (cursor == NULL) {
/*
- * Configure "append", in the case of column stores, we append
- * when inserting new rows.
+ * Configure "append", in the case of column stores, we append when inserting new rows.
*
- * WT_SESSION.open_cursor can return EBUSY if concurrent with a
- * metadata operation, retry.
+ * WT_SESSION.open_cursor can return EBUSY if concurrent with a metadata operation; retry.
*/
while ((ret = session->open_cursor(session, g.uri, NULL, "append", &cursor)) == EBUSY)
__wt_yield();
@@ -837,16 +837,13 @@ ops(void *arg)
tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)g.rows);
/*
- * Truncate up to 5% of the table. If the range overlaps
- * the beginning/end of the table, set the key to 0 (the
- * truncate function then sets a cursor to NULL so that
- * code is tested).
+ * Truncate up to 5% of the table. If the range overlaps the beginning/end of the table,
+ * set the key to 0 (the truncate function then sets a cursor to NULL so that code is
+ * tested).
*
- * This gets tricky: there are 2 directions (truncating
- * from lower keys to the current position or from
- * the current position to higher keys), and collation
- * order (truncating from lower keys to higher keys or
- * vice-versa).
+ * This gets tricky: there are 2 directions (truncating from lower keys to the current
+ * position or from the current position to higher keys), and collation order
+ * (truncating from lower keys to higher keys or vice-versa).
*/
greater_than = mmrand(&tinfo->rnd, 0, 1) == 1;
range = g.rows < 20 ? 0 : mmrand(&tinfo->rnd, 0, (u_int)g.rows / 20);
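A sketch of how the two range endpoints might be derived from the values just computed
(illustrative only; the real code also handles collation order and builds the cursors):

    /* Sketch: derive the truncate range endpoints (collation-order handling omitted). */
    uint64_t start, stop;

    if (greater_than) {
        start = tinfo->keyno;                    /* truncate upward from the position */
        stop = start + range;
        if (stop > (uint64_t)g.rows)
            stop = 0;                            /* overlaps the end: 0 means no stop cursor */
    } else {
        stop = tinfo->keyno;                     /* truncate downward to the position */
        start = stop > range ? stop - range : 0; /* overlaps the start: 0 means no start cursor */
    }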
@@ -1578,30 +1575,26 @@ table_append(uint64_t keyno)
ep = g.append + g.append_max;
/*
- * We don't want to ignore records we append, which requires we update
- * the "last row" as we insert new records. Threads allocating record
- * numbers can race with other threads, so the thread allocating record
- * N may return after the thread allocating N + 1. We can't update a
- * record before it's been inserted, and so we can't leave gaps when the
- * count of records in the table is incremented.
+ * We don't want to ignore records we append, which requires we update the "last row" as we
+ * insert new records. Threads allocating record numbers can race with other threads, so the
+ * thread allocating record N may return after the thread allocating N + 1. We can't update a
+ * record before it's been inserted, and so we can't leave gaps when the count of records in the
+ * table is incremented.
*
- * The solution is the append table, which contains an unsorted list of
- * appended records. Every time we finish appending a record, process
- * the table, trying to update the total records in the object.
+ * The solution is the append table, which contains an unsorted list of appended records. Every
+ * time we finish appending a record, process the table, trying to update the total records in
+ * the object.
*
* First, enter the new key into the append list.
*
- * It's technically possible to race: we allocated space for 10 records
- * per thread, but the check for the maximum number of records being
- * appended doesn't lock. If a thread allocated a new record and went
- * to sleep (so the append table fills up), then N threads of control
- * used the same g.append_cnt value to decide there was an available
- * slot in the append table and both allocated new records, we could run
- * out of space in the table. It's unfortunately not even unlikely in
- * the case of a large number of threads all inserting as fast as they
- * can and a single thread going to sleep for an unexpectedly long time.
- * If it happens, sleep and retry until earlier records are resolved
- * and we find a slot.
+ * It's technically possible to race: we allocated space for 10 records per thread, but the
+ * check for the maximum number of records being appended doesn't lock. If a thread allocated a
+ * new record and went to sleep (so the append table fills up), and N threads of control then
+ * used the same g.append_cnt value to decide there was an available slot in the append table
+ * and all allocated new records, we could run out of space in the table. It's unfortunately
+ * not even unlikely in the case of a large number of threads all inserting as fast as they can
+ * and a single thread going to sleep for an unexpectedly long time. If it happens, sleep and
+ * retry until earlier records are resolved and we find a slot.
*/
for (done = 0;;) {
testutil_check(pthread_rwlock_wrlock(&g.append_lock));
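The hunk ends mid-loop; as a hedged sketch, the rest of the protocol described above might look
like this (the slot-compaction details are illustrative, not the actual code):

    /* Inside the loop above: insert if a slot is free, then resolve entries. */
    if (!done && g.append_cnt < g.append_max) {
        g.append[g.append_cnt++] = keyno;      /* enter the new key */
        done = 1;
    }
    /* Resolve any entries that extend the "last row", in any order. */
    for (p = g.append; p < g.append + g.append_cnt;)
        if (*p == g.rows + 1) {
            g.rows = *p;                       /* bump the row count */
            *p = g.append[--g.append_cnt];     /* compact the unsorted list */
            p = g.append;                      /* restart: more entries may resolve now */
        } else
            ++p;
    testutil_check(pthread_rwlock_unlock(&g.append_lock));
    if (done)
        break;
    __wt_sleep(1, 0);                          /* no slot free: sleep and retry */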
diff --git a/src/third_party/wiredtiger/test/format/random.c b/src/third_party/wiredtiger/test/format/random.c
new file mode 100644
index 00000000000..c808c53d442
--- /dev/null
+++ b/src/third_party/wiredtiger/test/format/random.c
@@ -0,0 +1,104 @@
+/*-
+ * Public Domain 2014-2019 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "format.h"
+
+/*
+ * random_kv --
+ * Do random cursor operations.
+ */
+WT_THREAD_RET
+random_kv(void *arg)
+{
+ WT_CONNECTION *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_ITEM key, value;
+ WT_SESSION *session;
+ uint32_t i;
+ u_int period;
+ const char *config;
+ bool simple;
+
+ (void)(arg); /* Unused parameter */
+
+ conn = g.wts_conn;
+
+ /* Random cursor ops are only supported on row-store. */
+ if (g.type != ROW)
+ return (WT_THREAD_RET_VALUE);
+
+ /* Open a session. */
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
+
+ for (simple = false;;) {
+ /* Alternate between simple random cursors and sample-size random cursors. */
+ config = simple ? "next_random=true" : "next_random=true,next_random_sample_size=37";
+ simple = !simple;
+
+ /*
+ * open_cursor can return EBUSY if concurrent with a metadata operation; retry in that case.
+ */
+ while ((ret = session->open_cursor(session, g.uri, NULL, config, &cursor)) == EBUSY)
+ __wt_yield();
+ testutil_check(ret);
+
+ /* This is just a smoke-test; get some key/value pairs. */
+ for (i = mmrand(NULL, 0, 1000); i > 0; --i) {
+ switch (ret = cursor->next(cursor)) {
+ case 0:
+ break;
+ case WT_NOTFOUND:
+ case WT_ROLLBACK:
+ case WT_PREPARE_CONFLICT:
+ continue;
+ default:
+ testutil_check(ret);
+ }
+ testutil_check(cursor->get_key(cursor, &key));
+ testutil_check(cursor->get_value(cursor, &value));
+ }
+
+ testutil_check(cursor->close(cursor));
+
+ /* Sleep for some number of seconds. */
+ period = mmrand(NULL, 1, 10);
+
+ /* Sleep for short periods so we don't make the run wait. */
+ while (period > 0 && !g.workers_finished) {
+ --period;
+ __wt_sleep(1, 0);
+ }
+ if (g.workers_finished)
+ break;
+ }
+
+ testutil_check(session->close(session, NULL));
+
+ return (WT_THREAD_RET_VALUE);
+}
diff --git a/src/third_party/wiredtiger/test/format/salvage.c b/src/third_party/wiredtiger/test/format/salvage.c
index efe2e0162a4..8c6e003370b 100644
--- a/src/third_party/wiredtiger/test/format/salvage.c
+++ b/src/third_party/wiredtiger/test/format/salvage.c
@@ -61,12 +61,11 @@ corrupt(void)
char buf[8 * 1024], copycmd[2 * 1024];
/*
- * If it's a single Btree file (not LSM), open the file, and corrupt
- * roughly 2% of the file at a random spot, including the beginning
- * of the file and overlapping the end.
+ * If it's a single Btree file (not LSM), open the file, and corrupt roughly 2% of the file at a
+ * random spot, including the beginning of the file and overlapping the end.
*
- * It's a little tricky: if the data source is a file, we're looking
- * for "wt", if the data source is a table, we're looking for "wt.wt".
+ * It's a little tricky: if the data source is a file, we're looking for "wt"; if the data
+ * source is a table, we're looking for "wt.wt".
*/
testutil_check(__wt_snprintf(buf, sizeof(buf), "%s/%s", g.home, WT_NAME));
if ((fd = open(buf, O_RDWR)) != -1) {
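A hedged sketch of the corruption step the comment describes (POSIX-only; mmrand, WT_MIN and
testutil_die come from the surrounding test code, and the details are illustrative):

    /* Sketch: overwrite roughly 2% of the file at a random spot. */
    struct stat sb;
    size_t len;
    off_t offset;

    if (fstat(fd, &sb) == -1)
        testutil_die(errno, "fstat");
    len = (size_t)(sb.st_size / 50) + 1;                /* roughly 2% of the file */
    offset = (off_t)mmrand(NULL, 1, (u_int)sb.st_size); /* random spot, may run past EOF */
    memset(buf, 'C', sizeof(buf));                      /* recognizable garbage */
    if (pwrite(fd, buf, WT_MIN(len, sizeof(buf)), offset) == -1)
        testutil_die(errno, "pwrite");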
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index c46a12f45b2..7a43ca9f9b4 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -134,11 +134,10 @@ main(int argc, char *argv[])
}
/*
- * If we weren't given a configuration file, set values from "CONFIG",
- * if it exists.
+ * If we weren't given a configuration file, set values from "CONFIG", if it exists.
*
- * Small hack to ignore any CONFIG file named ".", that just makes it
- * possible to ignore any local CONFIG file, used when running checks.
+ * Small hack to ignore any CONFIG file named "."; that just makes it possible to ignore any
+ * local CONFIG file, as is done when running checks.
*/
if (config == NULL && access("CONFIG", R_OK) == 0)
config = "CONFIG";
@@ -213,12 +212,10 @@ main(int argc, char *argv[])
wts_ops(reps == FORMAT_OPERATION_REPS);
/*
- * Copy out the run's statistics after the last
- * set of operations.
+ * Copy out the run's statistics after the last set of operations.
*
- * XXX
- * Verify closes the underlying handle and
- * discards the statistics, read them first.
+ * XXX Verify closes the underlying handle and discards the statistics; read them
+ * first.
*/
if (reps == FORMAT_OPERATION_REPS)
wts_stats();
diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c
index 88c5afd8e06..bdc98d25b46 100644
--- a/src/third_party/wiredtiger/test/format/util.c
+++ b/src/third_party/wiredtiger/test/format/util.c
@@ -165,9 +165,8 @@ val_init(void)
/*
* Set initial buffer contents to recognizable text.
*
- * Add a few extra bytes in order to guarantee we can always offset
- * into the buffer by a few extra bytes, used to generate different
- * data for column-store run-length encoded files.
+ * Add a few extra bytes in order to guarantee we can always offset into the buffer by a few
+ * extra bytes, used to generate different data for column-store run-length encoded files.
*/
val_len = MAX(KILOBYTE(100), g.c_value_max) + 20;
val_base = dmalloc(val_len);
@@ -351,11 +350,11 @@ path_setup(const char *home)
testutil_check(__wt_snprintf(g.home_stats, len, "%s/%s", g.home, "stats"));
/*
- * Home directory initialize command: create the directory if it doesn't
- * exist, else remove everything except the RNG log file.
+ * Home directory initialize command: create the directory if it doesn't exist, else remove
+ * everything except the RNG log file.
*
- * Redirect the "cd" command to /dev/null so chatty cd implementations
- * don't add the new working directory to our output.
+ * Redirect the "cd" command to /dev/null so chatty cd implementations don't add the new working
+ * directory to our output.
*/
#undef CMD
#ifdef _WIN32
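For illustration, the POSIX flavor of such a command might be built like this (a sketch only;
the kill list shown, keeping anything matching "rand", is an assumption, not the patch's code):

    /* Sketch: redirect "cd" so chatty shells stay quiet; preserve the RNG log file. */
    #undef CMD
    #define CMD "test -e %s || mkdir %s; cd %s > /dev/null && rm -rf `ls | sed '/rand/d'`"
    testutil_check(__wt_snprintf(g.home_init, len, CMD, g.home, g.home, g.home));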
@@ -398,11 +397,10 @@ path_setup(const char *home)
"BACKUP_COPY", g.home, "BACKUP", g.home, "BACKUP_COPY"));
/*
- * Salvage command, save the interesting files so we can replay the
- * salvage command as necessary.
+ * Salvage command: save the interesting files so we can replay the salvage command as necessary.
*
- * Redirect the "cd" command to /dev/null so chatty cd implementations
- * don't add the new working directory to our output.
+ * Redirect the "cd" command to /dev/null so chatty cd implementations don't add the new working
+ * directory to our output.
*/
#undef CMD
#ifdef _WIN32
@@ -439,12 +437,11 @@ rng(WT_RAND_STATE *rnd)
rnd = &g.rnd;
/*
- * We can reproduce a single-threaded run based on the random numbers
- * used in the initial run, plus the configuration files.
+ * We can reproduce a single-threaded run based on the random numbers used in the initial run,
+ * plus the configuration files.
*
- * Check g.replay and g.rand_log_stop: multithreaded runs log/replay
- * until they get to the operations phase, then turn off log/replay,
- * threaded operation order can't be replayed.
+ * Check g.replay and g.rand_log_stop: multithreaded runs log/replay until they get to the
+ * operations phase, then turn off log/replay; threaded operation order can't be replayed.
*/
if (g.rand_log_stop)
return (__wt_random(rnd));
diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c
index 89a72f090e7..f3482861573 100644
--- a/src/third_party/wiredtiger/test/format/wts.c
+++ b/src/third_party/wiredtiger/test/format/wts.c
@@ -162,7 +162,8 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp)
",cache_size=%" PRIu32
"MB"
",checkpoint_sync=false"
- ",error_prefix=\"%s\"",
+ ",error_prefix=\"%s\""
+ ",operation_timeout_ms=2000",
g.c_cache, progname);
/* In-memory configuration. */
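The operation_timeout_ms setting added to the connection string above can also be exercised
directly through the public API; a minimal sketch (home path and values illustrative):

    WT_CONNECTION *conn;
    WT_SESSION *session;

    /* Connection-level: operations time out after roughly 2 seconds. */
    testutil_check(wiredtiger_open("WT_HOME", NULL, "create,operation_timeout_ms=2000", &conn));
    testutil_check(conn->open_session(conn, NULL, NULL, &session));

    /* Transaction-level: override the timeout for a single transaction. */
    testutil_check(session->begin_transaction(session, "operation_timeout_ms=500"));
    testutil_check(session->rollback_transaction(session, NULL));
    testutil_check(conn->close(conn, NULL));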
diff --git a/src/third_party/wiredtiger/test/readonly/readonly.c b/src/third_party/wiredtiger/test/readonly/readonly.c
index bd6adae429a..fb10a0f61e2 100644
--- a/src/third_party/wiredtiger/test/readonly/readonly.c
+++ b/src/third_party/wiredtiger/test/readonly/readonly.c
@@ -291,11 +291,10 @@ main(int argc, char *argv[])
testutil_die(ret, "wiredtiger_open readonly nolock");
/*
- * Create a child to also open a connection handle to the databases.
- * We cannot use fork here because using fork the child inherits the
- * same memory image. Therefore the WT process structure is set in
- * the child even though it should not be. So use 'system' to spawn
- * an entirely new process.
+ * Create a child to also open a connection handle to the databases. We cannot use fork here
+ * because a forked child inherits the same memory image, so the WT process structure would be
+ * set in the child even though it should not be. Instead, use 'system' to spawn an entirely
+ * new process.
*
* The child will exit with success if its test passes.
*/
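A hedged sketch of spawning that fresh process (the command-line flags here are hypothetical;
progname and the testutil helpers are assumed from the surrounding test):

    /* Sketch: run a new copy of the test binary rather than forking. */
    char cmd[512];
    int status;

    testutil_check(__wt_snprintf(cmd, sizeof(cmd), "%s -R -h %s", progname, home));
    if ((status = system(cmd)) == -1)
        testutil_die(errno, "system: %s", cmd);
    testutil_assert(WEXITSTATUS(status) == EXIT_SUCCESS); /* child exits 0 on success */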
diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode05.py b/src/third_party/wiredtiger/test/suite/test_debug_mode05.py
index f248a05e646..09597e7a38f 100644
--- a/src/third_party/wiredtiger/test/suite/test_debug_mode05.py
+++ b/src/third_party/wiredtiger/test/suite/test_debug_mode05.py
@@ -43,9 +43,11 @@ class test_debug_mode05(wttest.WiredTigerTestCase):
def test_table_logging_rollback_to_stable(self):
self.session.create(self.uri, 'key_format=i,value_format=u')
+
cursor = self.session.open_cursor(self.uri, None)
self.conn.set_timestamp('stable_timestamp=' + timestamp_str(100))
+ self.session.checkpoint()
# Try doing a normal prepared txn and then rollback to stable.
self.session.begin_transaction()
diff --git a/src/third_party/wiredtiger/test/suite/test_las01.py b/src/third_party/wiredtiger/test/suite/test_las01.py
index 76f19b51768..679d01ae06d 100755
--- a/src/third_party/wiredtiger/test/suite/test_las01.py
+++ b/src/third_party/wiredtiger/test/suite/test_las01.py
@@ -83,10 +83,11 @@ class test_las01(wttest.WiredTigerTestCase):
# Skip the initial rows, which were not updated
for i in range(0, nrows+1):
self.assertEqual(cursor.next(), 0)
- if (check_value != cursor.get_value()):
- print("Check value : " + str(check_value))
- print("value : " + str(cursor.get_value()))
- self.assertTrue(check_value == cursor.get_value())
+ if check_value != cursor.get_value():
+ session.breakpoint()
+ self.assertTrue(check_value == cursor.get_value(),
+ "for key " + str(i) + ", expected " + str(check_value) +
+ ", got " + str(cursor.get_value()))
cursor.close()
session.close()
conn.close()
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py
index acbad7e02a4..9e0e4a0cec0 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py
@@ -78,7 +78,8 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess):
# Search for the expected items as well as iterating.
for k, v in expected.items():
if missing == False:
- self.assertEqual(cur[k], v, "for key " + str(k))
+ self.assertEqual(cur[k], v, "for key " + str(k) +
+ " expected " + str(v) + ", got " + str(cur[k]))
else:
cur.set_key(k)
if self.empty:
@@ -162,7 +163,11 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess):
# Roll back half timestamps.
stable_ts = timestamp_str(key_range // 2)
self.conn.set_timestamp('stable_timestamp=' + stable_ts)
+
+ # We're about to test rollback-to-stable, which requires a checkpoint to roll back to.
+ self.session.checkpoint()
self.conn.rollback_to_stable()
+
stat_cursor = self.session.open_cursor('statistics:', None, None)
calls = stat_cursor[stat.conn.txn_rollback_to_stable][2]
upd_aborted = (stat_cursor[stat.conn.txn_rollback_upd_aborted][2] +
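The same ordering matters at the C API level: rollback_to_stable now wants a checkpoint at the
stable timestamp to roll back to. A sketch (timestamp value illustrative; WT timestamps in
configuration strings are hexadecimal):

    testutil_check(conn->set_timestamp(conn, "stable_timestamp=64")); /* hex: 0x64 is 100 */
    testutil_check(session->checkpoint(session, NULL));               /* checkpoint at stable */
    testutil_check(conn->rollback_to_stable(conn, NULL));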
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp06.py b/src/third_party/wiredtiger/test/suite/test_timestamp06.py
index 55981f67a98..fd004a23703 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp06.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp06.py
@@ -157,7 +157,7 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess):
# Scenario: 1
# Check that we see all the latest values (i.e. 3) as per transaction
- # visibility when reading with out the read timestamp.
+ # visibility when reading without the read timestamp.
# All tables should see all the values.
self.check(self.session, "", self.table_ts_log,
dict((k, 3) for k in orig_keys))
@@ -204,8 +204,12 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess):
self.ckpt_backup(2, (nkeys - valcnt_ts_log), (nkeys - valcnt_ts_nolog))
# Scenario: 3
- # Check that we see all the data values correctly after rollback
+ # Check we see all the data values correctly after rollback. Skip the case where the most
+ # recent checkpoint wasn't based on the last stable timestamp; those can't be rolled back.
+ if self.ckpt_ts == False:
+ return
self.conn.rollback_to_stable()
+
# All tables should see the values correctly when read with
# read timestamp as stable timestamp.
self.check(self.session, 'read_timestamp=' + stable_ts,
@@ -214,7 +218,7 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess):
self.table_ts_log, dict((k, 2) for k in orig_keys))
# Scenario: 4
- # Check that we see the values correctly when read with out any
+ # Check that we see the values correctly when read without any
# timestamp.
if self.using_log == True:
# For logged table we should see latest values (i.e. 3) when logging
@@ -224,21 +228,13 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess):
else:
# When logging is disabled, we should not see the values beyond the
# stable timestamp with timestamped checkpoints.
- if self.ckpt_ts == True:
- self.check(self.session, "",
- self.table_ts_log, dict((k, 2) for k in orig_keys))
- else:
- self.check(self.session, "",
- self.table_ts_log, dict((k, 3) for k in orig_keys))
+ self.check(self.session, "",
+ self.table_ts_log, dict((k, 2) for k in orig_keys))
# For non-logged table we should not see the values beyond the
# stable timestamp with timestamped checkpoints.
- if self.ckpt_ts == True:
- self.check(self.session, "",
- self.table_ts_nolog, dict((k, 2) for k in orig_keys))
- else:
- self.check(self.session, "",
- self.table_ts_nolog, dict((k, 3) for k in orig_keys))
+ self.check(self.session, "",
+ self.table_ts_nolog, dict((k, 2) for k in orig_keys))
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp11.py b/src/third_party/wiredtiger/test/suite/test_timestamp11.py
index 1256a544d78..f3d03cd8fa5 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp11.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp11.py
@@ -83,6 +83,7 @@ class test_timestamp11(wttest.WiredTigerTestCase, suite_subprocess):
#
stable_ts = timestamp_str(2)
self.conn.set_timestamp('stable_timestamp=' + stable_ts)
+ self.session.checkpoint()
self.conn.rollback_to_stable()
c = self.session.open_cursor(uri)
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp16.py b/src/third_party/wiredtiger/test/suite/test_timestamp16.py
index bef116d62a9..20663889450 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp16.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp16.py
@@ -50,25 +50,21 @@ class test_timestamp16(wttest.WiredTigerTestCase, suite_subprocess):
self.session.begin_transaction('read_timestamp=100')
self.session.rollback_transaction()
self.session.checkpoint('use_timestamp=true')
- self.assertTimestampsEqual('0',
- self.conn.query_timestamp('get=last_checkpoint'))
+ self.assertTimestampsEqual('0', self.conn.query_timestamp('get=last_checkpoint'))
- # Set a stable and make sure that we still checkpoint at
- # the stable.
- self.conn.set_timestamp('stable_timestamp=1')
+ # Set a stable timestamp and make sure that we still checkpoint at the stable timestamp.
+ self.conn.set_timestamp('stable_timestamp=2')
self.session.begin_transaction('read_timestamp=100')
self.session.rollback_transaction()
self.session.checkpoint('use_timestamp=true')
- self.assertTimestampsEqual('1',
- self.conn.query_timestamp('get=last_checkpoint'))
+ self.assertTimestampsEqual('2', self.conn.query_timestamp('get=last_checkpoint'))
# Finally make sure that commit also resets the read timestamp.
self.session.create(self.uri, 'key_format=i,value_format=i')
self.session.begin_transaction('read_timestamp=150')
self.session.commit_transaction()
self.session.checkpoint('use_timestamp=true')
- self.assertTimestampsEqual('1',
- self.conn.query_timestamp('get=last_checkpoint'))
+ self.assertTimestampsEqual('2', self.conn.query_timestamp('get=last_checkpoint'))
if __name__ == '__main__':
wttest.run()
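In C terms, the behavior this test checks is that use_timestamp checkpoints land at the stable
timestamp; a minimal sketch (values illustrative, timestamps are hex strings):

    char tsbuf[64];

    testutil_check(conn->set_timestamp(conn, "stable_timestamp=2"));
    testutil_check(session->checkpoint(session, "use_timestamp=true"));
    testutil_check(conn->query_timestamp(conn, tsbuf, "get=last_checkpoint"));
    testutil_assert(strcmp(tsbuf, "2") == 0); /* checkpoint happened at the stable timestamp */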
diff --git a/src/third_party/wiredtiger/test/suite/test_txn20.py b/src/third_party/wiredtiger/test/suite/test_txn20.py
index 381435ff472..f824d85ff45 100644
--- a/src/third_party/wiredtiger/test/suite/test_txn20.py
+++ b/src/third_party/wiredtiger/test/suite/test_txn20.py
@@ -83,10 +83,5 @@ class test_txn20(wttest.WiredTigerTestCase):
# 'read-uncommitted' will still see the new value.
self.assertEqual(cursor[self.key], self.new_value)
- # Cleanup.
- self.session.close()
- s.rollback_transaction()
- s.close()
-
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_txn21.py b/src/third_party/wiredtiger/test/suite/test_txn21.py
new file mode 100644
index 00000000000..212a4d321b6
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_txn21.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_txn21.py
+# Transactions: smoke test the operation timeout API
+#
+
+import wiredtiger, wttest
+
+class test_txn21(wttest.WiredTigerTestCase):
+
+ # Connection-level configuration.
+ def test_operation_timeout_conn(self):
+ # Close the automatically opened connection and open one with the timeout configuration.
+ conn_config = 'operation_timeout_ms=2000'
+ self.conn.close()
+ self.conn = wiredtiger.wiredtiger_open(self.home, conn_config)
+
+ # Transaction-level configuration.
+ def test_operation_timeout_txn(self):
+ self.session.begin_transaction('operation_timeout_ms=2000')
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/utility/test_util.h b/src/third_party/wiredtiger/test/utility/test_util.h
index 398727a6ca8..3442e8edcec 100644
--- a/src/third_party/wiredtiger/test/utility/test_util.h
+++ b/src/third_party/wiredtiger/test/utility/test_util.h
@@ -185,8 +185,8 @@ u64_to_string(uint64_t n, char **pp)
char *p;
/*
- * The argument pointer references the last element of a buffer (which
- * must be large enough to hold any possible value).
+ * The argument pointer references the last element of a buffer (which must be large enough to
+ * hold any possible value).
*
* Nul-terminate the buffer.
*/
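A minimal sketch of the technique the comment describes, writing digits backward from the end
of a caller-supplied buffer (standalone illustration, not the library function itself):

    /* Sketch: p starts at the buffer's last element; digits are written backward. */
    char buf[21], *p; /* 21 bytes: the 20 digits of UINT64_MAX plus a nul byte */
    uint64_t n = 12345;

    p = &buf[sizeof(buf) - 1];
    *p = '\0';                        /* nul-terminate the buffer */
    do
        *--p = (char)('0' + n % 10);  /* peel off the low-order digit */
    while ((n /= 10) != 0);
    /* p now points at the most significant digit of the result. */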