author    Luke Chen <luke.chen@mongodb.com>    2019-03-05 13:25:53 +1100
committer Luke Chen <luke.chen@mongodb.com>    2019-03-05 13:27:30 +1100
commit    70f793946b5e3872d07f73908b2d45a31cce051d (patch)
tree      70cd44000deed16e6c4a9179f326dba1c08953be /src/third_party
parent    6f3c3df4fc0abda76fd97e970ced4a01f0c48007 (diff)
Import wiredtiger: afdead1093b5c5b41dd54ddddf6f42d92bef1666 from branch mongodb-4.2
ref: 37e1570f82..afdead1093 for: 4.1.9

WT-4413 Add optional compact progress messages
WT-4465 Add documentation for a dhandle's lifecycle
WT-4514 Implement assert=durable_timestamp
WT-4517 Change WT data format to avoid writing stable timestamps
WT-4537 Fix WiredTiger cursor prev/next traversal failure on prepare retry
WT-4554 Enhance WT salvage to handle the case of corrupted WiredTiger.turtle
WT-4556 workgen: add synchronization points during run time
WT-4558 WiredTiger connection statistics cursor incorrectly provides doubled-up values
WT-4568 Operation tracking visualization incorrectly displays call stacks
WT-4573 Reducing calls to __wt_epoch from session reset
WT-4587 Parallelize the script that parses operation tracking files
WT-4595 Coverity "null pointer dereference" complaints
WT-4596 Simplify wt utility's session/connection handling
WT-4599 Show latency threshold violations in operation visualizations
WT-4605 workgen: read_write_storms.py needs a public domain notice
WT-4606 workgen: remove lsm from the default table type in wtperf emulation
WT-4613 Skip the wt4333_handle_locks test on OS X
WT-4615 Sync backup file before returning backup cursor
WT-4617 Cursor next/prev returns PREPARE_CONFLICT only once
Diffstat (limited to 'src/third_party')
-rw-r--r--               src/third_party/wiredtiger/bench/workgen/runner/read_write_storms.py | 35
-rwxr-xr-x               src/third_party/wiredtiger/bench/workgen/runner/read_write_sync_long.py | 136
-rwxr-xr-x               src/third_party/wiredtiger/bench/workgen/runner/read_write_sync_short.py | 151
-rwxr-xr-x [-rw-r--r--]  src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py | 2
-rwxr-xr-x [-rw-r--r--]  src/third_party/wiredtiger/bench/workgen/runner/runner/core.py | 17
-rw-r--r--               src/third_party/wiredtiger/bench/workgen/workgen.cxx | 115
-rw-r--r--               src/third_party/wiredtiger/bench/workgen/workgen.h | 8
-rw-r--r--               src/third_party/wiredtiger/bench/workgen/workgen.swig | 2
-rw-r--r--               src/third_party/wiredtiger/bench/workgen/workgen_int.h | 4
-rw-r--r--               src/third_party/wiredtiger/bench/workgen/workgen_time.h | 6
-rwxr-xr-x               src/third_party/wiredtiger/bench/workgen/wtperf.py | 2
-rw-r--r--               src/third_party/wiredtiger/dist/api_data.py | 9
-rw-r--r--               src/third_party/wiredtiger/dist/s_string.ok | 1
-rw-r--r--               src/third_party/wiredtiger/import.data | 2
-rw-r--r--               src/third_party/wiredtiger/src/block/block_compact.c | 3
-rw-r--r--               src/third_party/wiredtiger/src/btree/bt_compact.c | 34
-rw-r--r--               src/third_party/wiredtiger/src/btree/bt_curnext.c | 94
-rw-r--r--               src/third_party/wiredtiger/src/btree/bt_curprev.c | 96
-rw-r--r--               src/third_party/wiredtiger/src/btree/bt_debug.c | 26
-rw-r--r--               src/third_party/wiredtiger/src/btree/bt_handle.c | 16
-rw-r--r--               src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c | 62
-rw-r--r--               src/third_party/wiredtiger/src/config/config_def.c | 159
-rw-r--r--               src/third_party/wiredtiger/src/conn/conn_api.c | 1
-rw-r--r--               src/third_party/wiredtiger/src/cursor/cur_backup.c | 15
-rw-r--r--               src/third_party/wiredtiger/src/cursor/cur_index.c | 4
-rw-r--r--               src/third_party/wiredtiger/src/cursor/cur_join.c | 2
-rw-r--r--               src/third_party/wiredtiger/src/cursor/cur_stat.c | 1
-rw-r--r--               src/third_party/wiredtiger/src/cursor/cur_table.c | 4
-rw-r--r--               src/third_party/wiredtiger/src/docs/devdoc-dhandle-lifecycle.dox | 92
-rw-r--r--               src/third_party/wiredtiger/src/docs/devdoc-index.dox | 10
-rw-r--r--               src/third_party/wiredtiger/src/docs/spell.ok | 5
-rw-r--r--               src/third_party/wiredtiger/src/docs/transactions.dox | 7
-rw-r--r--               src/third_party/wiredtiger/src/include/btree.h | 7
-rw-r--r--               src/third_party/wiredtiger/src/include/cell.i | 288
-rw-r--r--               src/third_party/wiredtiger/src/include/compact.h | 1
-rw-r--r--               src/third_party/wiredtiger/src/include/connection.h | 59
-rw-r--r--               src/third_party/wiredtiger/src/include/cursor.h | 8
-rw-r--r--               src/third_party/wiredtiger/src/include/cursor.i | 42
-rw-r--r--               src/third_party/wiredtiger/src/include/extern.h | 2
-rw-r--r--               src/third_party/wiredtiger/src/include/misc.h | 26
-rw-r--r--               src/third_party/wiredtiger/src/include/session.h | 2
-rw-r--r--               src/third_party/wiredtiger/src/include/txn.h | 40
-rw-r--r--               src/third_party/wiredtiger/src/include/txn.i | 6
-rw-r--r--               src/third_party/wiredtiger/src/include/wiredtiger.in | 33
-rw-r--r--               src/third_party/wiredtiger/src/include/wt_internal.h | 4
-rw-r--r--               src/third_party/wiredtiger/src/meta/meta_turtle.c | 30
-rw-r--r--               src/third_party/wiredtiger/src/reconcile/rec_write.c | 117
-rw-r--r--               src/third_party/wiredtiger/src/schema/schema_create.c | 9
-rw-r--r--               src/third_party/wiredtiger/src/schema/schema_drop.c | 8
-rw-r--r--               src/third_party/wiredtiger/src/schema/schema_list.c | 16
-rw-r--r--               src/third_party/wiredtiger/src/schema/schema_open.c | 6
-rw-r--r--               src/third_party/wiredtiger/src/schema/schema_rename.c | 4
-rw-r--r--               src/third_party/wiredtiger/src/schema/schema_stat.c | 2
-rw-r--r--               src/third_party/wiredtiger/src/schema/schema_truncate.c | 2
-rw-r--r--               src/third_party/wiredtiger/src/schema/schema_worker.c | 3
-rw-r--r--               src/third_party/wiredtiger/src/session/session_api.c | 6
-rw-r--r--               src/third_party/wiredtiger/src/txn/txn.c | 116
-rw-r--r--               src/third_party/wiredtiger/src/txn/txn_ckpt.c | 7
-rw-r--r--               src/third_party/wiredtiger/src/txn/txn_timestamp.c | 33
-rw-r--r--               src/third_party/wiredtiger/src/utilities/util.h | 2
-rw-r--r--               src/third_party/wiredtiger/src/utilities/util_downgrade.c | 6
-rw-r--r--               src/third_party/wiredtiger/src/utilities/util_main.c | 20
-rw-r--r--               src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c | 23
-rw-r--r--               src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c | 38
-rw-r--r--               src/third_party/wiredtiger/test/format/config.c | 3
-rw-r--r--               src/third_party/wiredtiger/test/suite/test_assert04.py | 3
-rw-r--r--               src/third_party/wiredtiger/test/suite/test_assert05.py | 120
-rw-r--r--               src/third_party/wiredtiger/test/suite/test_assert06.py | 388
-rw-r--r--               src/third_party/wiredtiger/test/suite/test_prepare_cursor02.py | 101
-rwxr-xr-x               src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py | 314
-rwxr-xr-x               src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py | 1
71 files changed, 2329 insertions, 688 deletions
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/read_write_storms.py b/src/third_party/wiredtiger/bench/workgen/runner/read_write_storms.py
index 2f774d0c902..0a4dab510f7 100644
--- a/src/third_party/wiredtiger/bench/workgen/runner/read_write_storms.py
+++ b/src/third_party/wiredtiger/bench/workgen/runner/read_write_storms.py
@@ -1,5 +1,34 @@
-#/usr/bin/env python
-# generated from runner/read_write_heavy.wtperf originally, then hand edited.
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+# Generated from runner/read_write_heavy.wtperf originally, then hand edited.
+# Create waves of extra read and write activity (storms).
from runner import *
from wiredtiger import *
@@ -11,7 +40,7 @@ conn_config += ",cache_size=2GB,eviction=(threads_max=8),log=(enabled=true),sess
conn = wiredtiger_open("WT_TEST", "create," + conn_config)
s = conn.open_session("")
-wtperf_table_config = "key_format=S,value_format=S,type=lsm," +\
+wtperf_table_config = "key_format=S,value_format=S," +\
"exclusive=true,allocation_size=4kb," +\
"internal_page_max=64kb,leaf_page_max=4kb,split_pct=100,"
compress_table_config = "block_compressor=snappy,"
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/read_write_sync_long.py b/src/third_party/wiredtiger/bench/workgen/runner/read_write_sync_long.py
new file mode 100755
index 00000000000..c3374f4d3f2
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/workgen/runner/read_write_sync_long.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+# Generated from runner/read_write_heavy.wtperf originally, then hand edited.
+# A long run demonstrating how read or write threads can be synchronized.
+import sys
+
+from runner import *
+from wiredtiger import *
+from workgen import *
+
+context = Context()
+conn_config = ""
+conn_config += ",cache_size=2GB,eviction=(threads_max=8),log=(enabled=true),session_max=250,statistics=(fast),statistics_log=(wait=1,json)" # explicitly added
+conn = wiredtiger_open("WT_TEST", "create," + conn_config)
+s = conn.open_session("")
+
+wtperf_table_config = "key_format=S,value_format=S," +\
+ "exclusive=true,allocation_size=4kb," +\
+ "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100,"
+compress_table_config = "block_compressor=snappy,"
+table_config = "memory_page_max=10m,leaf_value_max=64MB,checksum=on,split_pct=90,type=file,log=(enabled=false),leaf_page_max=32k,block_compressor=snappy"
+tables = []
+table_count = 100
+for i in range(0, table_count):
+ tname = "table:test" + str(i)
+ table = Table(tname)
+ s.create(tname, wtperf_table_config +\
+ compress_table_config + table_config)
+ table.options.key_size = 20
+ table.options.value_size = 7000
+ tables.append(table)
+
+populate_threads = 4
+icount = 4000000
+# There are multiple tables to be filled during populate,
+# the icount is split between them all.
+pop_ops = Operation(Operation.OP_INSERT, tables[0])
+pop_ops = op_multi_table(pop_ops, tables)
+nops_per_thread = icount / (populate_threads * table_count)
+pop_thread = Thread(pop_ops * nops_per_thread)
+pop_workload = Workload(context, populate_threads * pop_thread)
+pop_workload.run(conn)
+print('populate complete')
+
+# Log like file, requires that logging be enabled in the connection config.
+log_name = "table:log"
+s.create(log_name, wtperf_table_config + "key_format=S,value_format=S," + compress_table_config + table_config + ",log=(enabled=true)")
+log_table = Table(log_name)
+
+ops = Operation(Operation.OP_UPDATE, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+thread0 = Thread(ops)
+# These operations include log_like operations, which will increase the number
+# of insert/update operations by a factor of 2.0. This may cause the
+# actual operations performed to be above the throttle.
+thread0.options.throttle=11
+thread0.options.throttle_burst=0
+
+ops = Operation(Operation.OP_SEARCH, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+thread1 = Thread(ops)
+thread1.options.throttle=60
+thread1.options.throttle_burst=0
+
+ops = Operation(Operation.OP_SLEEP, "60") + \
+ Operation(Operation.OP_CHECKPOINT, "")
+checkpoint_thread = Thread(ops)
+
+ops = Operation(Operation.OP_SLEEP, "0.1") + \
+ Operation(Operation.OP_LOG_FLUSH, "")
+logging_thread = Thread(ops)
+
+############################################################################
+# This part was added to the generated file.
+# Add unthrottled threads that are synchronized so we get two minutes with
+# unthrottled readers + writers, two minutes of unthrottled writers, two minutes
+# of unthrottled readers and two minutes of idle. That is repeated. This is
+# against a background of light reading and writing by the throttled threads.
+
+ops = Operation(Operation.OP_UPDATE, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+ops = timed(240.0, ops) + sleep(240.0)
+thread_big_writer = Thread(ops)
+thread_big_writer.synchronized = True
+
+ops = Operation(Operation.OP_SEARCH, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+ops = timed(120.0, ops) + timed(240.0, ops) + sleep(120.0)
+thread_big_reader = Thread(ops)
+thread_big_reader.synchronized = True
+
+# End of added section.
+# The new threads will also be added to the workload below.
+############################################################################
+
+workload = Workload(context, 20 * thread0 + 20 * thread1 + checkpoint_thread + logging_thread + 50 * thread_big_writer + 50 * thread_big_reader)
+workload.options.report_interval=1
+workload.options.run_time=1800
+workload.options.sample_rate=1
+workload.options.warmup=0
+workload.options.sample_interval_ms = 1000
+workload.run(conn)
+
+latency_filename = "WT_TEST/latency.out"
+latency.workload_latency(workload, latency_filename)
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/read_write_sync_short.py b/src/third_party/wiredtiger/bench/workgen/runner/read_write_sync_short.py
new file mode 100755
index 00000000000..a3121b323d8
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/workgen/runner/read_write_sync_short.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+# Generated from runner/read_write_heavy.wtperf originally, then hand edited.
+# A short run demonstrating how read or write threads can be synchronized.
+import sys
+
+from runner import *
+from wiredtiger import *
+from workgen import *
+
+context = Context()
+conn_config = ""
+conn_config += ",cache_size=2GB,eviction=(threads_max=8),log=(enabled=true),session_max=250,statistics=(fast),statistics_log=(wait=1,json)" # explicitly added
+conn = wiredtiger_open("WT_TEST", "create," + conn_config)
+s = conn.open_session("")
+
+wtperf_table_config = "key_format=S,value_format=S," +\
+ "exclusive=true,allocation_size=4kb," +\
+ "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100,"
+compress_table_config = "block_compressor=snappy,"
+table_config = "memory_page_max=10m,leaf_value_max=64MB,checksum=on,split_pct=90,type=file,log=(enabled=false),leaf_page_max=32k,block_compressor=snappy"
+tables = []
+table_count = 100
+for i in range(0, table_count):
+ tname = "table:test" + str(i)
+ table = Table(tname)
+ s.create(tname, wtperf_table_config +\
+ compress_table_config + table_config)
+ table.options.key_size = 20
+ table.options.value_size = 7000
+ tables.append(table)
+
+populate_threads = 4
+icount = 4000000
+# There are multiple tables to be filled during populate,
+# the icount is split between them all.
+pop_ops = Operation(Operation.OP_INSERT, tables[0])
+pop_ops = op_multi_table(pop_ops, tables)
+nops_per_thread = icount / (populate_threads * table_count)
+pop_thread = Thread(pop_ops * nops_per_thread)
+pop_workload = Workload(context, populate_threads * pop_thread)
+pop_workload.run(conn)
+print('populate complete')
+
+# Log like file, requires that logging be enabled in the connection config.
+log_name = "table:log"
+s.create(log_name, wtperf_table_config + "key_format=S,value_format=S," + compress_table_config + table_config + ",log=(enabled=true)")
+log_table = Table(log_name)
+
+ops = Operation(Operation.OP_UPDATE, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+thread0 = Thread(ops)
+# These operations include log_like operations, which will increase the number
+# of insert/update operations by a factor of 2.0. This may cause the
+# actual operations performed to be above the throttle.
+thread0.options.throttle=11
+thread0.options.throttle_burst=0
+
+ops = Operation(Operation.OP_SEARCH, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+thread1 = Thread(ops)
+thread1.options.throttle=60
+thread1.options.throttle_burst=0
+
+ops = Operation(Operation.OP_SLEEP, "60") + \
+ Operation(Operation.OP_CHECKPOINT, "")
+checkpoint_thread = Thread(ops)
+
+ops = Operation(Operation.OP_SLEEP, "0.1") + \
+ Operation(Operation.OP_LOG_FLUSH, "")
+logging_thread = Thread(ops)
+
+############################################################################
+# This part was added to the generated file.
+# Add threads that do a bunch of operations and sleep, all in a loop.
+# The write threads are in two groups that synchronize with each other
+# every 20 seconds. The read threads are in two groups that synchronize
+# with each other every 16 seconds. There is a collective synchronization
+# every 80 seconds.
+
+ops = Operation(Operation.OP_UPDATE, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+ops = timed(5.0, ops) + sleep(5.0)
+thread_big_10 = Thread(ops)
+thread_big_10.synchronized = True
+
+ops = Operation(Operation.OP_UPDATE, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+ops = timed(5.0, ops) + sleep(15.0)
+thread_big_20 = Thread(ops)
+thread_big_20.synchronized = True
+
+ops = Operation(Operation.OP_SEARCH, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+ops = timed(4.0, ops) + sleep(4.0)
+thread_bigread_8 = Thread(ops)
+thread_bigread_8.synchronized = True
+
+ops = Operation(Operation.OP_SEARCH, tables[0])
+ops = op_multi_table(ops, tables, False)
+ops = op_log_like(ops, log_table, 0)
+ops = timed(4.0, ops) + sleep(12.0)
+thread_bigread_16 = Thread(ops)
+thread_bigread_16.synchronized = True
+
+# End of added section.
+# The new threads will also be added to the workload below.
+############################################################################
+
+workload = Workload(context, 20 * thread0 + 20 * thread1 + checkpoint_thread + logging_thread + 50 * thread_big_10 + 50 * thread_big_20 + 50 * thread_bigread_8 + 50 * thread_bigread_16)
+workload.options.report_interval=1
+workload.options.run_time=900
+workload.options.sample_rate=1
+workload.options.warmup=0
+workload.options.sample_interval_ms = 1000
+workload.run(conn)
+
+latency_filename = "WT_TEST/latency.out"
+latency.workload_latency(workload, latency_filename)
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py b/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py
index 7c352aa6cda..42ffad6b247 100644..100755
--- a/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py
+++ b/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py
@@ -88,5 +88,5 @@ except:
shutil.rmtree('WT_TEST', True)
os.mkdir('WT_TEST')
-from .core import txn, extensions_config, op_append, op_group_transaction, op_log_like, op_multi_table, op_populate_with_range
+from .core import txn, extensions_config, op_append, op_group_transaction, op_log_like, op_multi_table, op_populate_with_range, sleep, timed
from .latency import workload_latency
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py b/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
index a79efc9c547..be660aecb88 100644..100755
--- a/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
+++ b/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
@@ -38,6 +38,23 @@ def txn(op, config=None):
op._transaction = t
return op
+# sleep --
+# Create an operation to sleep a given number of seconds.
+def sleep(seconds):
+ return Operation(Operation.OP_SLEEP, str(seconds))
+
+# timed --
+# Configure the operation (and suboperations) to run until the time elapses.
+def timed(seconds, op):
+ if op._group == None:
+ result = Operation()
+ result._group = OpList([op])
+ result._repeatgroup = 1
+ else:
+ result = op
+ result._timed = seconds
+ return result
+
# Check for a local build that contains the wt utility. First check in
# current working directory, then in build_posix and finally in the disttop
# directory. This isn't ideal - if a user has multiple builds in a tree we
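
The sleep() and timed() helpers exported here are what the new synchronized runner scripts are built from: timed() wraps an operation (or an existing group) so it repeats until the given number of seconds has elapsed, and sleep() yields an OP_SLEEP operation whose duration counts toward the thread's synchronization schedule. A minimal sketch of the usage pattern, assuming workgen and its runner package are importable; the connection config, table name and sizes are illustrative rather than taken from the patch:

    from runner import *       # importing runner also (re)creates WT_TEST
    from wiredtiger import *
    from workgen import *

    context = Context()
    conn = wiredtiger_open('WT_TEST', 'create,cache_size=1GB')
    s = conn.open_session('')

    tname = 'table:sync_example'
    s.create(tname, 'key_format=S,value_format=S')
    table = Table(tname)
    table.options.key_size = 20
    table.options.value_size = 100

    # Insert for 30 seconds, then stay idle for 30 seconds, in a loop.
    ops = timed(30.0, Operation(Operation.OP_INSERT, table)) + sleep(30.0)
    writer = Thread(ops)
    # Synchronized threads step through their timed/sleep phases on a
    # shared schedule instead of free-running (WT-4556); table operations
    # in such a thread must therefore be wrapped in timed().
    writer.synchronized = True

    workload = Workload(context, 8 * writer)
    workload.options.run_time = 120
    workload.options.report_interval = 1
    workload.run(conn)

ThreadRunner::create_all() calls synchronized_check() on the thread's operation tree, so an untimed table operation in a synchronized thread is rejected before the workload starts.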
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.cxx b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
index 9bfa29e3136..9b6a2f76020 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.cxx
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
@@ -450,7 +450,8 @@ ThreadRunner::ThreadRunner() :
_errno(0), _exception(), _thread(NULL), _context(NULL), _icontext(NULL),
_workload(NULL), _wrunner(NULL), _rand_state(NULL),
_throttle(NULL), _throttle_ops(0), _throttle_limit(0),
- _in_transaction(false), _number(0), _stats(false), _table_usage(),
+ _in_transaction(false), _start_time_us(0), _op_time_us(0),
+ _number(0), _stats(false), _table_usage(),
_cursors(NULL), _stop(false), _session(NULL), _keybuf(NULL),
_valuebuf(NULL), _repeat(false) {
}
@@ -464,6 +465,8 @@ int ThreadRunner::create_all(WT_CONNECTION *conn) {
WT_RET(close_all());
ASSERT(_session == NULL);
+ if (_thread->options.synchronized)
+ _thread->_op.synchronized_check();
WT_RET(conn->open_session(conn, NULL, NULL, &_session));
_table_usage.clear();
_stats.track_latency(_workload->options.sample_interval_ms > 0);
@@ -562,6 +565,11 @@ int ThreadRunner::run() {
ThreadOptions *options = &_thread->options;
std::string name = options->name;
+ timespec start_time;
+ workgen_epoch(&start_time);
+ _start_time_us = ts_us(start_time);
+ _op_time_us = _start_time_us;
+
VERBOSE(*this, "thread " << name << " running");
if (options->throttle != 0) {
_throttle = new Throttle(*this, options->throttle,
@@ -855,6 +863,7 @@ int ThreadRunner::op_run(Operation *op) {
// Never retry on an internal op.
retry_op = false;
WT_ERR(op->_internal->run(this, _session));
+ _op_time_us += op->_internal->sync_time_us();
}
}
@@ -865,12 +874,28 @@ int ThreadRunner::op_run(Operation *op) {
} else if (track != NULL)
track->complete();
- if (op->_group != NULL)
- VERBOSE(*this, "GROUP operation " << op->_repeatgroup << " times");
- for (int count = 0; !_stop && count < op->_repeatgroup; count++)
- for (std::vector<Operation>::iterator i = op->_group->begin();
- i != op->_group->end(); i++)
- WT_ERR(op_run(&*i));
+ if (op->_group != NULL) {
+ uint64_t endtime = 0;
+ timespec now;
+
+ if (op->_timed != 0.0)
+ endtime = _op_time_us + secs_us(op->_timed);
+
+ VERBOSE(*this, "GROUP operation " << op->_timed << " secs, "
+ << op->_repeatgroup << "times");
+
+ do {
+ for (int count = 0; !_stop && count < op->_repeatgroup; count++) {
+ for (std::vector<Operation>::iterator i = op->_group->begin();
+ i != op->_group->end(); i++)
+ WT_ERR(op_run(&*i));
+ }
+ workgen_epoch(&now);
+ } while (!_stop && ts_us(now) < endtime);
+
+ if (op->_timed != 0.0)
+ _op_time_us = endtime;
+ }
err:
if (own_cursor)
WT_TRET(cursor->close(cursor));
@@ -995,7 +1020,7 @@ int Throttle::throttle(uint64_t op_count, uint64_t *op_limit) {
}
ThreadOptions::ThreadOptions() : name(), throttle(0.0), throttle_burst(1.0),
- _options() {
+ synchronized(false), _options() {
_options.add_string("name", name, "name of the thread");
_options.add_double("throttle", throttle,
"Limit to this number of operations per second");
@@ -1005,7 +1030,8 @@ ThreadOptions::ThreadOptions() : name(), throttle(0.0), throttle_burst(1.0),
}
ThreadOptions::ThreadOptions(const ThreadOptions &other) :
name(other.name), throttle(other.throttle),
- throttle_burst(other.throttle_burst), _options(other._options) {}
+ throttle_burst(other.throttle_burst), synchronized(other.synchronized),
+ _options(other._options) {}
ThreadOptions::~ThreadOptions() {}
void
@@ -1051,27 +1077,27 @@ void Thread::describe(std::ostream &os) const {
Operation::Operation() :
_optype(OP_NONE), _internal(NULL), _table(), _key(), _value(), _config(),
- _transaction(NULL), _group(NULL), _repeatgroup(0) {
+ _transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
init_internal(NULL);
}
Operation::Operation(OpType optype, Table table, Key key, Value value) :
_optype(optype), _internal(NULL), _table(table), _key(key), _value(value),
- _config(), _transaction(NULL), _group(NULL), _repeatgroup(0) {
+ _config(), _transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
init_internal(NULL);
size_check();
}
Operation::Operation(OpType optype, Table table, Key key) :
_optype(optype), _internal(NULL), _table(table), _key(key), _value(),
- _config(), _transaction(NULL), _group(NULL), _repeatgroup(0) {
+ _config(), _transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
init_internal(NULL);
size_check();
}
Operation::Operation(OpType optype, Table table) :
_optype(optype), _internal(NULL), _table(table), _key(), _value(),
- _config(), _transaction(NULL), _group(NULL), _repeatgroup(0) {
+ _config(), _transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
init_internal(NULL);
size_check();
}
@@ -1080,7 +1106,7 @@ Operation::Operation(const Operation &other) :
_optype(other._optype), _internal(NULL), _table(other._table),
_key(other._key), _value(other._value), _config(other._config),
_transaction(other._transaction), _group(other._group),
- _repeatgroup(other._repeatgroup) {
+ _repeatgroup(other._repeatgroup), _timed(other._timed) {
// Creation and destruction of _group and _transaction is managed
// by Python.
init_internal(other._internal);
@@ -1088,7 +1114,8 @@ Operation::Operation(const Operation &other) :
Operation::Operation(OpType optype, const char *config) :
_optype(optype), _internal(NULL), _table(), _key(), _value(),
- _config(config), _transaction(NULL), _group(NULL), _repeatgroup(0) {
+ _config(config), _transaction(NULL), _group(NULL), _repeatgroup(0),
+ _timed(0.0) {
init_internal(NULL);
}
@@ -1105,6 +1132,7 @@ Operation& Operation::operator=(const Operation &other) {
_transaction = other._transaction;
_group = other._group;
_repeatgroup = other._repeatgroup;
+ _timed = other._timed;
delete _internal;
_internal = NULL;
init_internal(other._internal);
@@ -1154,6 +1182,11 @@ void Operation::init_internal(OperationInternal *other) {
}
}
+bool Operation::combinable() const {
+ return (_group != NULL && _repeatgroup == 1 && _timed == 0.0 &&
+ _transaction == NULL && _config == "");
+}
+
void Operation::create_all() {
size_check();
@@ -1169,14 +1202,19 @@ void Operation::describe(std::ostream &os) const {
os << ", "; _value.describe(os);
}
if (!_config.empty())
- os << ", '" << _config;
+ os << ", '" << _config << "'";
if (_transaction != NULL) {
os << ", [";
_transaction->describe(os);
os << "]";
}
+ if (_timed != 0.0)
+ os << ", [timed " << _timed << " secs]";
if (_group != NULL) {
- os << ", group[" << _repeatgroup << "]: {";
+ os << ", group";
+ if (_repeatgroup != 1)
+ os << "[repeat " << _repeatgroup << "]";
+ os << ": {";
bool first = true;
for (std::vector<Operation>::const_iterator i = _group->begin();
i != _group->end(); i++) {
@@ -1331,6 +1369,19 @@ void Operation::size_check() const {
}
}
+void Operation::synchronized_check() const {
+ if (_timed != 0.0)
+ return;
+ if (_optype != Operation::OP_NONE) {
+ if (is_table_op() || _internal->sync_time_us() == 0)
+ THROW("operation cannot be synchronized, needs to be timed()");
+ } else if (_group != NULL) {
+ for (std::vector<Operation>::iterator i = _group->begin();
+ i != _group->end(); i++)
+ i->synchronized_check();
+ }
+}
+
int CheckpointOperationInternal::run(ThreadRunner *runner, WT_SESSION *session)
{
(void)runner; /* not used */
@@ -1357,12 +1408,40 @@ void SleepOperationInternal::parse_config(const std::string &config)
int SleepOperationInternal::run(ThreadRunner *runner, WT_SESSION *session)
{
+ uint64_t endtime;
+ timespec now;
+ uint64_t now_us;
+
(void)runner; /* not used */
(void)session; /* not used */
- sleep(_sleepvalue);
+
+ workgen_epoch(&now);
+ now_us = ts_us(now);
+ if (runner->_thread->options.synchronized)
+ endtime = runner->_op_time_us + secs_us(_sleepvalue);
+ else
+ endtime = now_us + secs_us(_sleepvalue);
+
+ // Sleep for up to a second at a time, so we'll break out if
+ // we should stop.
+ while (!runner->_stop && now_us < endtime) {
+ uint64_t sleep_us = endtime - now_us;
+ if (sleep_us >= WT_MILLION) // one second
+ sleep(1);
+ else
+ usleep(sleep_us);
+
+ workgen_epoch(&now);
+ now_us = ts_us(now);
+ }
return (0);
}
+uint64_t SleepOperationInternal::sync_time_us() const
+{
+ return (secs_us(_sleepvalue));
+}
+
void TableOperationInternal::parse_config(const std::string &config)
{
if (!config.empty()) {
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.h b/src/third_party/wiredtiger/bench/workgen/workgen.h
index d29b42f17d7..c6c739d54cf 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.h
@@ -295,6 +295,7 @@ struct Operation {
Transaction *_transaction;
std::vector<Operation> *_group;
int _repeatgroup;
+ double _timed;
Operation();
Operation(OpType optype, Table table, Key key, Value value);
@@ -305,6 +306,9 @@ struct Operation {
Operation(const Operation &other);
~Operation();
+ // Check if adding (via Python '+') another operation to this one is
+ // as easy as appending the new operation to the _group.
+ bool combinable() const;
void describe(std::ostream &os) const;
#ifndef SWIG
Operation& operator=(const Operation &other);
@@ -317,6 +321,7 @@ struct Operation {
uint64_t n, char *result) const;
void kv_size_buffer(bool iskey, size_t &size) const;
void size_check() const;
+ void synchronized_check() const;
#endif
};
@@ -327,6 +332,7 @@ struct ThreadOptions {
std::string name;
double throttle;
double throttle_burst;
+ bool synchronized;
ThreadOptions();
ThreadOptions(const ThreadOptions &other);
@@ -334,6 +340,8 @@ struct ThreadOptions {
void describe(std::ostream &os) const {
os << "throttle " << throttle;
+ os << ", throttle_burst " << throttle_burst;
+ os << ", synchronized " << synchronized;
}
std::string help() const { return _options.help(); }
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.swig b/src/third_party/wiredtiger/bench/workgen/workgen.swig
index 05866c1b3e2..5a0d03f6c69 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.swig
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.swig
@@ -158,7 +158,7 @@ WorkgenFrozenClass(WorkloadOptions)
def __add__(self, other):
if not isinstance(other, Operation):
raise Exception('Operation.__sum__ requires an Operation')
- if self._group == None or self._repeatgroup != 1 or self._transaction != None or self._config != '':
+ if not self.combinable():
op = Operation()
op._group = OpList([self, other])
op._repeatgroup = 1
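
combinable() centralizes the test that Operation.__add__ previously open-coded, and with the new _timed field a timed group can no longer be flattened into its neighbour. Roughly, under these rules (the table below is an assumed, pre-existing workgen Table, and timed() comes from the runner package):

    a = Operation(Operation.OP_INSERT, table)
    b = Operation(Operation.OP_SEARCH, table)

    g = a + b           # 'a' is not a group: '+' builds a new group [a, b]
    g = g + b           # 'g' is combinable: 'b' joins the existing group
    t = timed(30.0, g)  # a non-zero _timed makes the group non-combinable,
    g2 = t + b          # so '+' nests it inside a fresh outer group instead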
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen_int.h b/src/third_party/wiredtiger/bench/workgen/workgen_int.h
index 50eeada0e30..e4cc948c7b9 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen_int.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen_int.h
@@ -101,6 +101,8 @@ struct ThreadRunner {
Throttle *_throttle;
uint64_t _throttle_ops;
uint64_t _throttle_limit;
+ uint64_t _start_time_us;
+ uint64_t _op_time_us; // time that current operation starts
bool _in_transaction;
uint32_t _number;
Stats _stats;
@@ -188,6 +190,7 @@ struct OperationInternal {
virtual void parse_config(const std::string &config) { (void)config; }
virtual int run(ThreadRunner *runner, WT_SESSION *session) {
(void)runner; (void)session; return (0); }
+ virtual uint64_t sync_time_us() const { return (0); }
};
struct CheckpointOperationInternal : OperationInternal {
@@ -227,6 +230,7 @@ struct SleepOperationInternal : OperationInternal {
OperationInternal(other),_sleepvalue(other._sleepvalue) {}
virtual void parse_config(const std::string &config);
virtual int run(ThreadRunner *runner, WT_SESSION *session);
+ virtual uint64_t sync_time_us() const;
};
struct TableInternal {
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen_time.h b/src/third_party/wiredtiger/bench/workgen/workgen_time.h
index 9e045087d3a..187a452cb86 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen_time.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen_time.h
@@ -199,3 +199,9 @@ ts_us(const timespec &ts)
{
return (ns_to_us(ts.tv_nsec) + sec_to_us(ts.tv_sec));
}
+
+inline uint64_t
+secs_us(double secs)
+{
+ return (secs * USEC_PER_SEC);
+}
diff --git a/src/third_party/wiredtiger/bench/workgen/wtperf.py b/src/third_party/wiredtiger/bench/workgen/wtperf.py
index e4ce0393276..24e7dd8f733 100755
--- a/src/third_party/wiredtiger/bench/workgen/wtperf.py
+++ b/src/third_party/wiredtiger/bench/workgen/wtperf.py
@@ -393,7 +393,7 @@ class Translator:
def translate_table_create(self):
opts = self.options
s = ''
- s += 'wtperf_table_config = "key_format=S,value_format=S,type=lsm," +\\\n'
+ s += 'wtperf_table_config = "key_format=S,value_format=S," +\\\n'
s += ' "exclusive=true,allocation_size=4kb," +\\\n'
s += ' "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100,"\n'
if opts.compression != '':
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 4db94e19cf3..d585557d968 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -162,12 +162,16 @@ file_runtime_config = common_runtime_config + [
if mixed update use is allowed. If 'key_consistent' is
set then all updates to a specific key must be the same
with respect to timestamp usage or not.''',
- choices=['always','key_consistent', 'never','none']),
+ choices=['always', 'key_consistent', 'never', 'none']),
+ Config('durable_timestamp', 'none', r'''
+ verify that durable timestamps should 'always' or 'never' be used
+ on modifications with this table.''',
+ choices=['always', 'key_consistent', 'never', 'none']),
Config('read_timestamp', 'none', r'''
verify that timestamps should 'always' or 'never' be used
on reads with this table. Verification is 'none'
if mixed read use is allowed.''',
- choices=['always','never','none'])
+ choices=['always', 'never', 'none'])
], undoc=True),
Config('cache_resident', 'false', r'''
do not ever evict the object's pages from cache. Not compatible with
@@ -620,6 +624,7 @@ connection_runtime_config = [
'checkpoint',
'checkpoint_progress',
'compact',
+ 'compact_progress',
'error_returns',
'evict',
'evict_stuck',
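
The new assert.durable_timestamp setting is supplied in a table's creation configuration alongside the existing commit_timestamp and read_timestamp assertions; per the bt_handle.c change later in this patch, 'always' also implies the commit-timestamp assertion. A hedged sketch of how a table might opt in (the URI is illustrative, the session is assumed open, and enforcement is a diagnostic-build feature exercised by the new test_assert06.py):

    # Updates to this table are expected to carry a durable timestamp;
    # 'never' asserts the opposite, 'key_consistent' per-key consistency.
    session.create('table:asserted',
        'key_format=S,value_format=S,assert=(durable_timestamp=always)')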
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 3db38f0ea18..1d49ea15866 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -1257,6 +1257,7 @@ th
tid
timedwait
timestamp
+timestamped
timestamps
tmp
todo
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 85fe66ac667..0842ac032b3 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "37e1570f8205f3e2d32a7abef6d0e8b09421f3ad",
+ "commit": "afdead1093b5c5b41dd54ddddf6f42d92bef1666",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-4.2"
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
index 85fc7303bc2..c6d02e5b514 100644
--- a/src/third_party/wiredtiger/src/block/block_compact.c
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -178,7 +178,8 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
}
__wt_spin_unlock(session, &block->live_lock);
- if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) {
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT) ||
+ WT_VERBOSE_ISSET(session, WT_VERB_COMPACT_PROGRESS)) {
++block->compact_pages_reviewed;
if (*skipp)
++block->compact_pages_skipped;
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index 37ee36634ff..e358e993ec4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -104,6 +104,39 @@ __compact_rewrite_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
}
/*
+ * __compact_progress --
+ * Output a compact progress message.
+ */
+static void
+__compact_progress(WT_SESSION_IMPL *session)
+{
+ struct timespec cur_time;
+ WT_BM *bm;
+ uint64_t time_diff;
+
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_COMPACT_PROGRESS))
+ return;
+
+ bm = S2BT(session)->bm;
+ __wt_epoch(session, &cur_time);
+
+ /* Log one progress message every twenty seconds. */
+ time_diff = WT_TIMEDIFF_SEC(cur_time, session->compact->begin);
+ if (time_diff / WT_PROGRESS_MSG_PERIOD >
+ session->compact->prog_msg_count) {
+ __wt_verbose(session,
+ WT_VERB_COMPACT_PROGRESS, "Compact running"
+ " for %" PRIu64 " seconds; reviewed %"
+ PRIu64 " pages, skipped %" PRIu64 " pages,"
+ " wrote %" PRIu64 " pages", time_diff,
+ bm->block->compact_pages_reviewed,
+ bm->block->compact_pages_skipped,
+ bm->block->compact_pages_written);
+ session->compact->prog_msg_count++;
+ }
+}
+
+/*
* __wt_compact --
* Compact a file.
*/
@@ -137,6 +170,7 @@ __wt_compact(WT_SESSION_IMPL *session)
* Quit if eviction is stuck, we're making the problem worse.
*/
if (++i > 100) {
+ __compact_progress(session);
WT_ERR(__wt_session_compact_check_timeout(session));
if (__wt_cache_stuck(session))
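
Together with the block_compact.c change above, which now maintains the reviewed/skipped/written page counters whenever either compact verbose flag is set, this produces one progress line roughly every twenty seconds of a long compaction. Enabling it only requires the 'compact_progress' verbose category added to api_data.py earlier in this patch; a sketch using the Python API in a build with verbose messages available (home directory and table name are illustrative):

    import os
    from wiredtiger import wiredtiger_open

    if not os.path.isdir('WT_HOME'):
        os.mkdir('WT_HOME')
    conn = wiredtiger_open('WT_HOME', 'create,verbose=[compact_progress]')
    session = conn.open_session('')
    session.create('table:example', 'key_format=S,value_format=S')
    # ... populate, then remove enough data for compact to have work to do ...
    session.compact('table:example', None)   # progress messages ~every 20 seconds
    conn.close()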
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index d12548b008e..f504bdeddf4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -13,13 +13,17 @@
* Return the next entry on the append list.
*/
static inline int
-__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart)
+ goto restart_read;
+
if (newpage) {
if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
return (WT_NOTFOUND);
@@ -58,7 +62,7 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
cbt->v = 0;
cbt->iface.value.data = &cbt->v;
} else {
- WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
+restart_read: WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
if (upd == NULL) {
cbt->v = 0;
cbt->iface.value.data = &cbt->v;
@@ -74,7 +78,7 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
* Move to the next, fixed-length column-store item.
*/
static inline int
-__cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_BTREE *btree;
WT_PAGE *page;
@@ -86,6 +90,10 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
page = cbt->ref->page;
upd = NULL;
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart)
+ goto restart_read;
+
/* Initialize for each new page. */
if (newpage) {
cbt->last_standard_recno = __col_fix_last_recno(cbt->ref);
@@ -108,7 +116,7 @@ new_page:
if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
cbt->ins = NULL;
if (cbt->ins != NULL)
- WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
+restart_read: WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
if (upd == NULL) {
cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
cbt->iface.value.data = &cbt->v;
@@ -123,13 +131,17 @@ new_page:
* Return the next variable-length entry on the append list.
*/
static inline int
-__cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart)
+ goto restart_read;
+
if (newpage) {
cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
goto new_page;
@@ -141,7 +153,7 @@ new_page: if (cbt->ins == NULL)
return (WT_NOTFOUND);
__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
- WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
+restart_read: WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
if (upd == NULL)
continue;
if (upd->type == WT_UPDATE_TOMBSTONE) {
@@ -160,7 +172,7 @@ new_page: if (cbt->ins == NULL)
* Move to the next, variable-length column-store item.
*/
static inline int
-__cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_CELL *cell;
WT_CELL_UNPACK unpack;
@@ -176,6 +188,10 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
rle_start = 0; /* -Werror=maybe-uninitialized */
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart)
+ goto restart_read;
+
/* Initialize for each new page. */
if (newpage) {
/*
@@ -197,7 +213,8 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
return (WT_NOTFOUND);
__cursor_set_recno(cbt, cbt->recno + 1);
-new_page: /* Find the matching WT_COL slot. */
+new_page:
+restart_read: /* Find the matching WT_COL slot. */
if ((cip =
__col_var_search(cbt->ref, cbt->recno, &rle_start)) == NULL)
return (WT_NOTFOUND);
@@ -282,7 +299,7 @@ new_page: /* Find the matching WT_COL slot. */
* Move to the next row-store item.
*/
static inline int
-__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_INSERT *ins;
WT_ITEM *key;
@@ -295,6 +312,15 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
page = cbt->ref->page;
key = &cbt->iface.key;
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart) {
+ if (cbt->iter_retry == WT_CBT_RETRY_INSERT)
+ goto restart_read_insert;
+ if (cbt->iter_retry == WT_CBT_RETRY_PAGE)
+ goto restart_read_page;
+ }
+ cbt->iter_retry = WT_CBT_RETRY_NOTSET;
+
/*
* For row-store pages, we need a single item that tells us the part
* of the page we're walking (otherwise switching from next to prev
@@ -329,7 +355,10 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
if (cbt->ins != NULL)
cbt->ins = WT_SKIP_NEXT(cbt->ins);
-new_insert: if ((ins = cbt->ins) != NULL) {
+new_insert:
+ cbt->iter_retry = WT_CBT_RETRY_INSERT;
+restart_read_insert:
+ if ((ins = cbt->ins) != NULL) {
WT_RET(__wt_txn_read(session, ins->upd, &upd));
if (upd == NULL)
continue;
@@ -362,7 +391,9 @@ new_insert: if ((ins = cbt->ins) != NULL) {
cbt->ins_head = NULL;
cbt->ins = NULL;
+ cbt->iter_retry = WT_CBT_RETRY_PAGE;
cbt->slot = cbt->row_iteration_slot / 2 - 1;
+restart_read_page:
rip = &page->pg_row[cbt->slot];
WT_RET(__wt_txn_read(session, WT_ROW_UPDATE(page, rip), &upd));
if (upd != NULL && upd->type == WT_UPDATE_TOMBSTONE) {
@@ -593,7 +624,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
WT_PAGE *page;
WT_SESSION_IMPL *session;
uint32_t flags;
- bool newpage, visible;
+ bool newpage, restart;
cursor = &cbt->iface;
session = (WT_SESSION_IMPL *)cbt->iface.session;
@@ -601,23 +632,11 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
WT_STAT_CONN_INCR(session, cursor_next);
WT_STAT_DATA_INCR(session, cursor_next);
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ flags = WT_READ_NO_SPLIT | WT_READ_SKIP_INTL; /* tree walk flags */
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE);
- /*
- * If this cursor has returned prepare conflict earlier, check to see
- * whether that prepared update is resolved or not. If not resolved,
- * continue returning prepare conflict. If resolved, return the value
- * based on the visibility rules.
- */
- if (F_ISSET(cbt, WT_CBT_ITERATE_RETRY_NEXT)) {
- WT_ERR(__cursor_check_prepared_update(cbt, &visible));
- if (visible) {
-#ifdef HAVE_DIAGNOSTIC
- WT_ERR(__wt_cursor_key_order_check(session, cbt, true));
-#endif
- return (0);
- }
- }
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
WT_ERR(__cursor_func_init(cbt, false));
@@ -633,19 +652,20 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
* found. Then, move to the next page, until we reach the end of the
* file.
*/
- flags = WT_READ_NO_SPLIT | WT_READ_SKIP_INTL; /* tree walk flags */
- if (truncating)
- LF_SET(WT_READ_TRUNCATE);
- for (newpage = false;; newpage = true) {
+ restart = F_ISSET(cbt, WT_CBT_ITERATE_RETRY_NEXT);
+ F_CLR(cbt, WT_CBT_ITERATE_RETRY_NEXT);
+ for (newpage = false;; newpage = true, restart = false) {
page = cbt->ref == NULL ? NULL : cbt->ref->page;
if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
switch (page->type) {
case WT_PAGE_COL_FIX:
- ret = __cursor_fix_append_next(cbt, newpage);
+ ret = __cursor_fix_append_next(
+ cbt, newpage, restart);
break;
case WT_PAGE_COL_VAR:
- ret = __cursor_var_append_next(cbt, newpage);
+ ret = __cursor_var_append_next(
+ cbt, newpage, restart);
break;
WT_ILLEGAL_VALUE_ERR(session, page->type);
}
@@ -657,13 +677,13 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
} else if (page != NULL) {
switch (page->type) {
case WT_PAGE_COL_FIX:
- ret = __cursor_fix_next(cbt, newpage);
+ ret = __cursor_fix_next(cbt, newpage, restart);
break;
case WT_PAGE_COL_VAR:
- ret = __cursor_var_next(cbt, newpage);
+ ret = __cursor_var_next(cbt, newpage, restart);
break;
case WT_PAGE_ROW_LEAF:
- ret = __cursor_row_next(cbt, newpage);
+ ret = __cursor_row_next(cbt, newpage, restart);
break;
WT_ILLEGAL_VALUE_ERR(session, page->type);
}
@@ -712,7 +732,7 @@ err: switch (ret) {
* at a prepared update, hence current key returned could be
* same as earlier returned key.
*
- * eg: Initial data set : {1,2,3,...10)
+ * eg: Initial data set : (1,2,3,...10)
* insert key 11 in a prepare transaction.
* loop on next will return 1,2,3...10 and subsequent call to
* next will return a prepare conflict. Now if we call prev
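
Before this change a cursor that had returned a prepare conflict cached that state and reported it only once (WT-4617); with the restart labels above, next/prev re-enter the traversal at the conflicting update, so an application-level retry loop keeps seeing the conflict until the preparing transaction resolves and then continues from the correct position. A hedged sketch of such a retry loop in Python, matching on the error string the way the test suite does (the helper name and retry policy are illustrative, not from the patch):

    import time
    import wiredtiger

    def retry_on_prepare_conflict(op, max_retries=1000):
        # Retry a cursor call such as cursor.next or cursor.prev while it
        # keeps reporting a conflict with a prepared update, i.e. until
        # the preparing transaction commits or rolls back.
        for _ in range(max_retries):
            try:
                return op()
            except wiredtiger.WiredTigerError as e:
                if 'conflict with a prepared update' not in str(e):
                    raise
                time.sleep(0.001)
        raise RuntimeError('prepared update was never resolved')

    # e.g.: retry_on_prepare_conflict(cursor.next)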
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 32310b8a341..22effc47553 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -125,13 +125,17 @@ restart:
* Return the previous fixed-length entry on the append list.
*/
static inline int
-__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart)
+ goto restart_read;
+
if (newpage) {
if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
return (WT_NOTFOUND);
@@ -204,7 +208,7 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
cbt->iface.value.data = &cbt->v;
} else {
upd = NULL;
- WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
+restart_read: WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
if (upd == NULL) {
cbt->v = 0;
cbt->iface.value.data = &cbt->v;
@@ -220,7 +224,7 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
* Move to the previous, fixed-length column-store item.
*/
static inline int
-__cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_BTREE *btree;
WT_PAGE *page;
@@ -231,6 +235,10 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
page = cbt->ref->page;
btree = S2BT(session);
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart)
+ goto restart_read;
+
/* Initialize for each new page. */
if (newpage) {
cbt->last_standard_recno = __col_fix_last_recno(cbt->ref);
@@ -254,7 +262,7 @@ new_page:
cbt->ins = NULL;
upd = NULL;
if (cbt->ins != NULL)
- WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
+restart_read: WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
if (upd == NULL) {
cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
cbt->iface.value.data = &cbt->v;
@@ -269,13 +277,17 @@ new_page:
* Return the previous variable-length entry on the append list.
*/
static inline int
-__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart)
+ goto restart_read;
+
if (newpage) {
cbt->ins = WT_SKIP_LAST(cbt->ins_head);
goto new_page;
@@ -287,7 +299,7 @@ new_page: if (cbt->ins == NULL)
return (WT_NOTFOUND);
__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
- WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
+restart_read: WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
if (upd == NULL)
continue;
if (upd->type == WT_UPDATE_TOMBSTONE) {
@@ -306,7 +318,7 @@ new_page: if (cbt->ins == NULL)
* Move to the previous, variable-length column-store item.
*/
static inline int
-__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_CELL *cell;
WT_CELL_UNPACK unpack;
@@ -322,6 +334,10 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
rle_start = 0; /* -Werror=maybe-uninitialized */
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart)
+ goto restart_read;
+
/* Initialize for each new page. */
if (newpage) {
/*
@@ -344,7 +360,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
new_page: if (cbt->recno < cbt->ref->ref_recno)
return (WT_NOTFOUND);
- /* Find the matching WT_COL slot. */
+restart_read: /* Find the matching WT_COL slot. */
if ((cip =
__col_var_search(cbt->ref, cbt->recno, &rle_start)) == NULL)
return (WT_NOTFOUND);
@@ -428,7 +444,7 @@ new_page: if (cbt->recno < cbt->ref->ref_recno)
* Move to the previous row-store item.
*/
static inline int
-__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
+__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
WT_INSERT *ins;
WT_ITEM *key;
@@ -441,6 +457,15 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
page = cbt->ref->page;
key = &cbt->iface.key;
+ /* If restarting after a prepare conflict, jump to the right spot. */
+ if (restart) {
+ if (cbt->iter_retry == WT_CBT_RETRY_INSERT)
+ goto restart_read_insert;
+ if (cbt->iter_retry == WT_CBT_RETRY_PAGE)
+ goto restart_read_page;
+ }
+ cbt->iter_retry = WT_CBT_RETRY_NOTSET;
+
/*
* For row-store pages, we need a single item that tells us the part
* of the page we're walking (otherwise switching from next to prev
@@ -486,7 +511,10 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
if (cbt->ins != NULL)
WT_RET(__cursor_skip_prev(cbt));
-new_insert: if ((ins = cbt->ins) != NULL) {
+new_insert:
+ cbt->iter_retry = WT_CBT_RETRY_INSERT;
+restart_read_insert:
+ if ((ins = cbt->ins) != NULL) {
WT_RET(__wt_txn_read(session, ins->upd, &upd));
if (upd == NULL)
continue;
@@ -521,7 +549,9 @@ new_insert: if ((ins = cbt->ins) != NULL) {
cbt->ins_head = NULL;
cbt->ins = NULL;
+ cbt->iter_retry = WT_CBT_RETRY_PAGE;
cbt->slot = cbt->row_iteration_slot / 2 - 1;
+restart_read_page:
rip = &page->pg_row[cbt->slot];
WT_RET(__wt_txn_read(session, WT_ROW_UPDATE(page, rip), &upd));
if (upd != NULL && upd->type == WT_UPDATE_TOMBSTONE) {
@@ -547,7 +577,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
WT_PAGE *page;
WT_SESSION_IMPL *session;
uint32_t flags;
- bool newpage, visible;
+ bool newpage, restart;
cursor = &cbt->iface;
session = (WT_SESSION_IMPL *)cbt->iface.session;
@@ -555,24 +585,12 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
WT_STAT_CONN_INCR(session, cursor_prev);
WT_STAT_DATA_INCR(session, cursor_prev);
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ flags = /* tree walk flags */
+ WT_READ_NO_SPLIT | WT_READ_PREV | WT_READ_SKIP_INTL;
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE);
- /*
- * If this cursor has returned prepare conflict earlier, check to see
- * whether that prepared update is resolved or not. If not resolved,
- * continue returning prepare conflict. If resolved, return the value
- * based on the visibility rules.
- */
- if (F_ISSET(cbt, WT_CBT_ITERATE_RETRY_PREV)) {
- WT_ERR(__cursor_check_prepared_update(cbt, &visible));
- if (visible) {
-#ifdef HAVE_DIAGNOSTIC
- WT_ERR(
- __wt_cursor_key_order_check(session, cbt, false));
-#endif
- return (0);
- }
- }
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
WT_ERR(__cursor_func_init(cbt, false));
@@ -588,11 +606,9 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
* found. Then, move to the previous page, until we reach the start
* of the file.
*/
- flags = /* tree walk flags */
- WT_READ_NO_SPLIT | WT_READ_PREV | WT_READ_SKIP_INTL;
- if (truncating)
- LF_SET(WT_READ_TRUNCATE);
- for (newpage = false;; newpage = true) {
+ restart = F_ISSET(cbt, WT_CBT_ITERATE_RETRY_PREV);
+ F_CLR(cbt, WT_CBT_ITERATE_RETRY_PREV);
+ for (newpage = false;; newpage = true, restart = false) {
page = cbt->ref == NULL ? NULL : cbt->ref->page;
/*
@@ -607,10 +623,12 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
switch (page->type) {
case WT_PAGE_COL_FIX:
- ret = __cursor_fix_append_prev(cbt, newpage);
+ ret = __cursor_fix_append_prev(
+ cbt, newpage, restart);
break;
case WT_PAGE_COL_VAR:
- ret = __cursor_var_append_prev(cbt, newpage);
+ ret = __cursor_var_append_prev(
+ cbt, newpage, restart);
break;
WT_ILLEGAL_VALUE_ERR(session, page->type);
}
@@ -624,13 +642,13 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
if (page != NULL) {
switch (page->type) {
case WT_PAGE_COL_FIX:
- ret = __cursor_fix_prev(cbt, newpage);
+ ret = __cursor_fix_prev(cbt, newpage, restart);
break;
case WT_PAGE_COL_VAR:
- ret = __cursor_var_prev(cbt, newpage);
+ ret = __cursor_var_prev(cbt, newpage, restart);
break;
case WT_PAGE_ROW_LEAF:
- ret = __cursor_row_prev(cbt, newpage);
+ ret = __cursor_row_prev(cbt, newpage, restart);
break;
WT_ILLEGAL_VALUE_ERR(session, page->type);
}
@@ -668,7 +686,7 @@ err: switch (ret) {
* at a prepared update, hence current key returned could be
* same as earlier returned key.
*
- * eg: Initial data set : {2,3,...10)
+ * eg: Initial data set : (2,3,...10)
* insert key 1 in a prepare transaction.
* loop on prev will return 10,...3,2 and subsequent call to
* prev will return a prepare conflict. Now if we call next
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 1ce403dba7f..29c784c4198 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -1169,6 +1169,7 @@ __debug_modify(WT_DBG *ds, WT_UPDATE *upd)
static int
__debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
{
+ const char *prepare_state;
char ts_string[WT_TS_INT_STRING_SIZE];
for (; upd != NULL; upd = upd->next) {
@@ -1200,16 +1201,41 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
WT_RET(ds->f(ds, "\tvalue {tombstone}\n"));
break;
}
+
if (upd->txnid == WT_TXN_ABORTED)
WT_RET(ds->f(ds, "\t" "txn id aborted"));
else
WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid));
+
__wt_timestamp_to_string(
upd->start_ts, ts_string, sizeof(ts_string));
WT_RET(ds->f(ds, ", start_ts %s", ts_string));
__wt_timestamp_to_string(
upd->stop_ts, ts_string, sizeof(ts_string));
WT_RET(ds->f(ds, ", stop_ts %s", ts_string));
+ if (upd->durable_ts != WT_TS_NONE) {
+ __wt_timestamp_to_string(upd->durable_ts,
+ ts_string, sizeof(ts_string));
+ WT_RET(ds->f(ds, ", durable-ts %s", ts_string));
+ }
+
+ prepare_state = NULL;
+ switch (upd->prepare_state) {
+ case WT_PREPARE_INIT:
+ break;
+ case WT_PREPARE_INPROGRESS:
+ prepare_state = "in-progress";
+ break;
+ case WT_PREPARE_LOCKED:
+ prepare_state = "locked";
+ break;
+ case WT_PREPARE_RESOLVED:
+ prepare_state = "resolved";
+ break;
+ }
+ if (prepare_state != NULL)
+ WT_RET(ds->f(ds, ", prepare %s", prepare_state));
+
WT_RET(ds->f(ds, "\n"));
}
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 6d96c2537b3..aa26a627a94 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -437,6 +437,22 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_KEYS);
else if (WT_STRING_MATCH("never", cval.str, cval.len))
FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER);
+
+ /*
+ * A durable timestamp always implies a commit timestamp. But never
+ * having a durable timestamp does not imply anything about a commit
+ * timestamp.
+ */
+ WT_RET(__wt_config_gets(session,
+ cfg, "assert.durable_timestamp", &cval));
+ if (WT_STRING_MATCH("always", cval.str, cval.len))
+ FLD_SET(btree->assert_flags,
+ WT_ASSERT_COMMIT_TS_ALWAYS | WT_ASSERT_DURABLE_TS_ALWAYS);
+ else if (WT_STRING_MATCH("key_consistent", cval.str, cval.len))
+ FLD_SET(btree->assert_flags, WT_ASSERT_DURABLE_TS_KEYS);
+ else if (WT_STRING_MATCH("never", cval.str, cval.len))
+ FLD_SET(btree->assert_flags, WT_ASSERT_DURABLE_TS_NEVER);
+
WT_RET(__wt_config_gets(session, cfg, "assert.read_timestamp", &cval));
if (WT_STRING_MATCH("always", cval.str, cval.len))
FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS);
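
The hunk above wires the new assert.durable_timestamp choices (always, key_consistent, never, none) into the btree assertion flags, mirroring the existing assert.commit_timestamp configuration. A minimal sketch, not part of this changeset, of how an application might enable the assertion when creating a table; the table name and formats are placeholders:

    #include <wiredtiger.h>

    /*
     * Hypothetical helper: create a table that asserts a durable timestamp
     * is always provided. The table name and formats are placeholders.
     */
    static int
    create_with_durable_assert(WT_SESSION *session)
    {
            return (session->create(session, "table:example",
                "key_format=S,value_format=S,assert=(durable_timestamp=always)"));
    }
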
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
index 5896852c1bf..82045edea31 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -8,8 +8,9 @@
#include "wt_internal.h"
-static int __err_cell_corrupt(WT_SESSION_IMPL *, uint32_t, const char *);
-static int __err_cell_corrupt_or_eof(WT_SESSION_IMPL *, uint32_t, const char *);
+static int __err_cell_corrupt(WT_SESSION_IMPL *, int, uint32_t, const char *);
+static int __err_cell_corrupt_or_eof(
+ WT_SESSION_IMPL *, int, uint32_t, const char *);
static int __err_cell_type(
WT_SESSION_IMPL *, uint32_t, const char *, uint8_t, uint8_t);
static int __verify_dsk_chunk(
@@ -31,12 +32,19 @@ static int __verify_dsk_row(
goto err; \
} while (0)
-#define WT_RET_VRFY(session, ...) do { \
- if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \
- __wt_errx(session, __VA_ARGS__); \
- return (WT_ERROR); \
+#define WT_RET_VRFY_RETVAL(session, ret, ...) do { \
+ if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) { \
+ if ((ret) == 0) \
+ __wt_errx(session, __VA_ARGS__); \
+ else \
+ __wt_err(session, ret, __VA_ARGS__); \
+ } \
+ return ((ret) == 0 ? ret : WT_ERROR); \
} while (0)
+#define WT_RET_VRFY(session, ...) \
+ WT_RET_VRFY_RETVAL(session, 0, __VA_ARGS__);
+
/*
* WT_CELL_FOREACH_VRFY --
* Iterate through each cell on a page. Verify-specific version of the
@@ -394,9 +402,9 @@ __verify_dsk_row(WT_SESSION_IMPL *session,
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(
- session, dsk, cell, unpack, end) != 0) {
- ret = __err_cell_corrupt(session, cell_num, tag);
+ ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end);
+ if (ret != 0) {
+ (void)__err_cell_corrupt(session, ret, cell_num, tag);
goto err;
}
@@ -470,8 +478,8 @@ __verify_dsk_row(WT_SESSION_IMPL *session,
case WT_CELL_VALUE_OVFL:
if ((ret = bm->addr_invalid(
bm, session, unpack->data, unpack->size)) == EINVAL)
- ret = __err_cell_corrupt_or_eof(
- session, cell_num, tag);
+ (void)__err_cell_corrupt_or_eof(
+ session, ret, cell_num, tag);
WT_ERR(ret);
break;
}
@@ -615,8 +623,7 @@ key_compare: /*
__wt_page_type_string(dsk->type),
tag, key_cnt, dsk->u.entries);
if (dsk->type == WT_PAGE_ROW_LEAF &&
- F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) &&
- key_cnt != dsk->u.entries)
+ F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) && key_cnt != dsk->u.entries)
WT_ERR_VRFY(session,
"%s page at %s with the 'all empty values' flag set has a "
"key count of %" PRIu32 " and a physical entry count of %"
@@ -624,8 +631,7 @@ key_compare: /*
__wt_page_type_string(dsk->type),
tag, key_cnt, dsk->u.entries);
if (dsk->type == WT_PAGE_ROW_LEAF &&
- F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) &&
- key_cnt * 2 != dsk->u.entries)
+ F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) && key_cnt * 2 != dsk->u.entries)
WT_ERR_VRFY(session,
"%s page at %s with the 'no empty values' flag set has a "
"key count of %" PRIu32 " and a physical entry count of %"
@@ -671,8 +677,10 @@ __verify_dsk_col_int(WT_SESSION_IMPL *session,
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(session, dsk, cell, unpack, end) != 0)
- return (__err_cell_corrupt(session, cell_num, tag));
+ ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end);
+ if (ret != 0)
+ return (
+ __err_cell_corrupt(session, ret, cell_num, tag));
/* Check the raw and collapsed cell types. */
WT_RET(__err_cell_type(
@@ -687,8 +695,8 @@ __verify_dsk_col_int(WT_SESSION_IMPL *session,
ret = bm->addr_invalid(bm, session, unpack->data, unpack->size);
WT_RET_ERROR_OK(ret, EINVAL);
if (ret == EINVAL)
- return (
- __err_cell_corrupt_or_eof(session, cell_num, tag));
+ return (__err_cell_corrupt_or_eof(
+ session, ret, cell_num, tag));
}
WT_RET(__verify_dsk_memsize(session, tag, dsk, cell));
@@ -749,8 +757,10 @@ __verify_dsk_col_var(WT_SESSION_IMPL *session,
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(session, dsk, cell, unpack, end) != 0)
- return (__err_cell_corrupt(session, cell_num, tag));
+ ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end);
+ if (ret != 0)
+ return (__err_cell_corrupt(
+ session, ret, cell_num, tag));
/* Check the raw and collapsed cell types. */
WT_RET(__err_cell_type(
@@ -769,7 +779,7 @@ __verify_dsk_col_var(WT_SESSION_IMPL *session,
WT_RET_ERROR_OK(ret, EINVAL);
if (ret == EINVAL)
return (__err_cell_corrupt_or_eof(
- session, cell_num, tag));
+ session, ret, cell_num, tag));
}
/*
@@ -881,9 +891,9 @@ __verify_dsk_chunk(WT_SESSION_IMPL *session,
*/
static int
__err_cell_corrupt(
- WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag)
+ WT_SESSION_IMPL *session, int retval, uint32_t entry_num, const char *tag)
{
- WT_RET_VRFY(session,
+ WT_RET_VRFY_RETVAL(session, retval,
"item %" PRIu32 " on page at %s is a corrupted cell",
entry_num, tag);
}
@@ -894,9 +904,9 @@ __err_cell_corrupt(
*/
static int
__err_cell_corrupt_or_eof(
- WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag)
+ WT_SESSION_IMPL *session, int retval, uint32_t entry_num, const char *tag)
{
- WT_RET_VRFY(session,
+ WT_RET_VRFY_RETVAL(session, retval,
"item %" PRIu32 " on page at %s is a corrupted cell or references "
"non-existent file pages",
entry_num, tag);
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 90b1dc023ec..1b2f6ce5ed6 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -206,14 +206,14 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
- "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
- "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
- "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
- "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
- "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"compact_progress\","
+ "\"error_returns\",\"evict\",\"evict_stuck\",\"evictserver\","
+ "\"fileops\",\"handleops\",\"log\",\"lookaside\","
+ "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -237,6 +237,10 @@ static const WT_CONFIG_CHECK confchk_assert_subconfigs[] = {
NULL, "choices=[\"always\",\"key_consistent\",\"never\","
"\"none\"]",
NULL, 0 },
+ { "durable_timestamp", "string",
+ NULL, "choices=[\"always\",\"key_consistent\",\"never\","
+ "\"none\"]",
+ NULL, 0 },
{ "read_timestamp", "string",
NULL, "choices=[\"always\",\"never\",\"none\"]",
NULL, 0 },
@@ -256,7 +260,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_alter[] = {
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
{ "assert", "category",
NULL, NULL,
- confchk_assert_subconfigs, 2 },
+ confchk_assert_subconfigs, 3 },
{ "cache_resident", "boolean", NULL, NULL, NULL, 0 },
{ "exclusive_refreshed", "boolean", NULL, NULL, NULL, 0 },
{ "log", "category",
@@ -349,7 +353,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = {
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
{ "assert", "category",
NULL, NULL,
- confchk_assert_subconfigs, 2 },
+ confchk_assert_subconfigs, 3 },
{ "block_allocation", "string",
NULL, "choices=[\"first\",\"best\"]",
NULL, 0 },
@@ -556,7 +560,7 @@ static const WT_CONFIG_CHECK confchk_file_config[] = {
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
{ "assert", "category",
NULL, NULL,
- confchk_assert_subconfigs, 2 },
+ confchk_assert_subconfigs, 3 },
{ "block_allocation", "string",
NULL, "choices=[\"first\",\"best\"]",
NULL, 0 },
@@ -621,7 +625,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = {
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
{ "assert", "category",
NULL, NULL,
- confchk_assert_subconfigs, 2 },
+ confchk_assert_subconfigs, 3 },
{ "block_allocation", "string",
NULL, "choices=[\"first\",\"best\"]",
NULL, 0 },
@@ -706,7 +710,7 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = {
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
{ "assert", "category",
NULL, NULL,
- confchk_assert_subconfigs, 2 },
+ confchk_assert_subconfigs, 3 },
{ "block_allocation", "string",
NULL, "choices=[\"first\",\"best\"]",
NULL, 0 },
@@ -927,14 +931,14 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
- "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
- "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
- "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
- "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
- "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"compact_progress\","
+ "\"error_returns\",\"evict\",\"evict_stuck\",\"evictserver\","
+ "\"fileops\",\"handleops\",\"log\",\"lookaside\","
+ "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -1036,14 +1040,14 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
- "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
- "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
- "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
- "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
- "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"compact_progress\","
+ "\"error_returns\",\"evict\",\"evict_stuck\",\"evictserver\","
+ "\"fileops\",\"handleops\",\"log\",\"lookaside\","
+ "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -1140,14 +1144,14 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
- "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
- "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
- "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
- "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
- "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"compact_progress\","
+ "\"error_returns\",\"evict\",\"evict_stuck\",\"evictserver\","
+ "\"fileops\",\"handleops\",\"log\",\"lookaside\","
+ "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -1244,14 +1248,14 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\","
- "\"checkpoint_progress\",\"compact\",\"error_returns\",\"evict\","
- "\"evict_stuck\",\"evictserver\",\"fileops\",\"handleops\","
- "\"log\",\"lookaside\",\"lookaside_activity\",\"lsm\","
- "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\","
- "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
- "\"thread_group\",\"timestamp\",\"transaction\",\"verify\","
- "\"version\",\"write\"]",
+ "\"checkpoint_progress\",\"compact\",\"compact_progress\","
+ "\"error_returns\",\"evict\",\"evict_stuck\",\"evictserver\","
+ "\"fileops\",\"handleops\",\"log\",\"lookaside\","
+ "\"lookaside_activity\",\"lsm\",\"lsm_manager\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\","
+ "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\","
+ "\"split\",\"temporary\",\"thread_group\",\"timestamp\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -1348,9 +1352,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "WT_SESSION.alter",
"access_pattern_hint=none,app_metadata=,"
- "assert=(commit_timestamp=none,read_timestamp=none),"
- "cache_resident=false,exclusive_refreshed=true,log=(enabled=true)"
- ",os_cache_dirty_max=0,os_cache_max=0",
+ "assert=(commit_timestamp=none,durable_timestamp=none,"
+ "read_timestamp=none),cache_resident=false,"
+ "exclusive_refreshed=true,log=(enabled=true),os_cache_dirty_max=0"
+ ",os_cache_max=0",
confchk_WT_SESSION_alter, 8
},
{ "WT_SESSION.begin_transaction",
@@ -1376,11 +1381,11 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "WT_SESSION.create",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
- "assert=(commit_timestamp=none,read_timestamp=none),"
- "block_allocation=best,block_compressor=,cache_resident=false,"
- "checksum=uncompressed,colgroups=,collator=,columns=,dictionary=0"
- ",encryption=(keyid=,name=),exclusive=false,extractor=,"
- "format=btree,huffman_key=,huffman_value=,"
+ "assert=(commit_timestamp=none,durable_timestamp=none,"
+ "read_timestamp=none),block_allocation=best,block_compressor=,"
+ "cache_resident=false,checksum=uncompressed,colgroups=,collator=,"
+ "columns=,dictionary=0,encryption=(keyid=,name=),exclusive=false,"
+ "extractor=,format=btree,huffman_key=,huffman_value=,"
"ignore_in_memory_cache_size=false,immutable=false,"
"internal_item_max=0,internal_key_max=0,"
"internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
@@ -1491,11 +1496,11 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "file.config",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
- "assert=(commit_timestamp=none,read_timestamp=none),"
- "block_allocation=best,block_compressor=,cache_resident=false,"
- "checksum=uncompressed,collator=,columns=,dictionary=0,"
- "encryption=(keyid=,name=),format=btree,huffman_key=,"
- "huffman_value=,ignore_in_memory_cache_size=false,"
+ "assert=(commit_timestamp=none,durable_timestamp=none,"
+ "read_timestamp=none),block_allocation=best,block_compressor=,"
+ "cache_resident=false,checksum=uncompressed,collator=,columns=,"
+ "dictionary=0,encryption=(keyid=,name=),format=btree,huffman_key="
+ ",huffman_value=,ignore_in_memory_cache_size=false,"
"internal_item_max=0,internal_key_max=0,"
"internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
"key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,"
@@ -1508,20 +1513,20 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "file.meta",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
- "assert=(commit_timestamp=none,read_timestamp=none),"
- "block_allocation=best,block_compressor=,cache_resident=false,"
- "checkpoint=,checkpoint_lsn=,checksum=uncompressed,collator=,"
- "columns=,dictionary=0,encryption=(keyid=,name=),format=btree,"
- "huffman_key=,huffman_value=,id=,"
- "ignore_in_memory_cache_size=false,internal_item_max=0,"
- "internal_key_max=0,internal_key_truncate=true,"
- "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0,"
- "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0,"
- "log=(enabled=true),memory_page_image_max=0,memory_page_max=5MB,"
- "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
- "prefix_compression_min=4,split_deepen_min_child=0,"
- "split_deepen_per_child=0,split_pct=90,value_format=u,"
- "version=(major=0,minor=0)",
+ "assert=(commit_timestamp=none,durable_timestamp=none,"
+ "read_timestamp=none),block_allocation=best,block_compressor=,"
+ "cache_resident=false,checkpoint=,checkpoint_lsn=,"
+ "checksum=uncompressed,collator=,columns=,dictionary=0,"
+ "encryption=(keyid=,name=),format=btree,huffman_key=,"
+ "huffman_value=,id=,ignore_in_memory_cache_size=false,"
+ "internal_item_max=0,internal_key_max=0,"
+ "internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,"
+ "leaf_value_max=0,log=(enabled=true),memory_page_image_max=0,"
+ "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
+ "prefix_compression=false,prefix_compression_min=4,"
+ "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,"
+ "value_format=u,version=(major=0,minor=0)",
confchk_file_meta, 41
},
{ "index.meta",
@@ -1531,11 +1536,11 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "lsm.meta",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
- "assert=(commit_timestamp=none,read_timestamp=none),"
- "block_allocation=best,block_compressor=,cache_resident=false,"
- "checksum=uncompressed,chunks=,collator=,columns=,dictionary=0,"
- "encryption=(keyid=,name=),format=btree,huffman_key=,"
- "huffman_value=,ignore_in_memory_cache_size=false,"
+ "assert=(commit_timestamp=none,durable_timestamp=none,"
+ "read_timestamp=none),block_allocation=best,block_compressor=,"
+ "cache_resident=false,checksum=uncompressed,chunks=,collator=,"
+ "columns=,dictionary=0,encryption=(keyid=,name=),format=btree,"
+ "huffman_key=,huffman_value=,ignore_in_memory_cache_size=false,"
"internal_item_max=0,internal_key_max=0,"
"internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
"key_gap=10,last=,leaf_item_max=0,leaf_key_max=0,"
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 0630bdb3711..392cbad87d3 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1835,6 +1835,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "checkpoint", WT_VERB_CHECKPOINT },
{ "checkpoint_progress",WT_VERB_CHECKPOINT_PROGRESS },
{ "compact", WT_VERB_COMPACT },
+ { "compact_progress", WT_VERB_COMPACT_PROGRESS },
{ "error_returns", WT_VERB_ERROR_RETURNS },
{ "evict", WT_VERB_EVICT },
{ "evict_stuck", WT_VERB_EVICT_STUCK },
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index 315a822cc13..04882e527ce 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -359,18 +359,25 @@ __backup_start(WT_SESSION_IMPL *session,
}
err: /* Close the hot backup file. */
- if (cb->bfs != NULL)
- WT_TRET(__wt_fclose(session, &cb->bfs));
if (srcfs != NULL)
WT_TRET(__wt_fclose(session, &srcfs));
+ /*
+ * Sync and rename the temp file into place.
+ */
+ if (ret == 0)
+ ret = __wt_sync_and_rename(session,
+ &cb->bfs, WT_BACKUP_TMP, dest);
if (ret == 0) {
- WT_ASSERT(session, dest != NULL);
- WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest, false));
__wt_writelock(session, &conn->hot_backup_lock);
conn->hot_backup_list = cb->list;
__wt_writeunlock(session, &conn->hot_backup_lock);
F_SET(session, WT_SESSION_BACKUP_CURSOR);
}
+ /*
+ * If the file hasn't been closed, do it now.
+ */
+ if (cb->bfs != NULL)
+ WT_TRET(__wt_fclose(session, &cb->bfs));
done:
return (ret);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
index a5d77df7b47..baabcd0182c 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_index.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -382,7 +382,7 @@ err:
if (cindex->child != NULL)
WT_TRET(cindex->child->close(cindex->child));
- WT_TRET(__wt_schema_release_table(session, cindex->table));
+ WT_TRET(__wt_schema_release_table(session, &cindex->table));
/* The URI is owned by the index. */
cursor->internal_uri = NULL;
__wt_cursor_close(cursor);
@@ -489,7 +489,7 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
if ((ret = __wt_schema_open_index(
session, table, idxname, namesize, &idx)) != 0) {
- WT_TRET(__wt_schema_release_table(session, table));
+ WT_TRET(__wt_schema_release_table(session, &table));
return (ret);
}
WT_RET(__wt_calloc_one(session, &cindex));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c
index a10e6cc1053..1a8f6ab28af 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_join.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_join.c
@@ -327,7 +327,7 @@ __curjoin_close(WT_CURSOR *cursor)
JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
err:
- WT_TRET(__wt_schema_release_table(session, cjoin->table));
+ WT_TRET(__wt_schema_release_table(session, &cjoin->table));
/* This is owned by the table */
cursor->key_format = NULL;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
index 0118f342903..f4aee9a8c90 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_stat.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -357,6 +357,7 @@ __curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
* Optionally clear the connection statistics.
*/
__wt_conn_stat_init(session);
+ __wt_stat_connection_init_single(&cst->u.conn_stats);
__wt_stat_connection_aggregate(conn->stats, &cst->u.conn_stats);
if (F_ISSET(cst, WT_STAT_CLEAR))
__wt_stat_connection_clear_all(conn->stats);
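
The added call zeroes the cursor's aggregation buffer before the connection statistics are aggregated, avoiding the doubled-up values described in WT-4558. For context, a minimal sketch, not part of this changeset, of the statistics-cursor read path an application uses; error handling is mostly omitted:

    #include <inttypes.h>
    #include <stdio.h>
    #include <wiredtiger.h>

    /*
     * Hypothetical helper: print all connection statistics through a
     * statistics cursor. Assumes an already-open session.
     */
    static void
    print_connection_stats(WT_SESSION *session)
    {
            WT_CURSOR *cursor;
            const char *desc, *pvalue;
            int64_t value;

            if (session->open_cursor(
                session, "statistics:", NULL, NULL, &cursor) != 0)
                    return;
            while (cursor->next(cursor) == 0) {
                    (void)cursor->get_value(cursor, &desc, &pvalue, &value);
                    printf("%s=%" PRId64 "\n", desc, value);
            }
            (void)cursor->close(cursor);
    }
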
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
index 807bd668096..77c6018778c 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_table.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -839,7 +839,7 @@ err:
__wt_free(session, ctable->cg_valcopy);
__wt_free(session, ctable->idx_cursors);
- WT_TRET(__wt_schema_release_table(session, ctable->table));
+ WT_TRET(__wt_schema_release_table(session, &ctable->table));
/* The URI is owned by the table. */
cursor->internal_uri = NULL;
__wt_cursor_close(cursor);
@@ -999,7 +999,7 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
ret = __wt_open_cursor(session,
table->cgroups[0]->source, NULL, cfg, cursorp);
- WT_TRET(__wt_schema_release_table(session, table));
+ WT_TRET(__wt_schema_release_table(session, &table));
if (ret == 0) {
/* Fix up the public URI to match what was passed in. */
cursor = *cursorp;
diff --git a/src/third_party/wiredtiger/src/docs/devdoc-dhandle-lifecycle.dox b/src/third_party/wiredtiger/src/docs/devdoc-dhandle-lifecycle.dox
new file mode 100644
index 00000000000..8f79a0da22e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/devdoc-dhandle-lifecycle.dox
@@ -0,0 +1,92 @@
+/*! @page devdoc-dhandle-lifecycle Data Handle Lifecycle
+
+A WiredTiger Data Handle (dhandle) is a generic representation of any named
+data source. This representation contains information such as its name,
+how many references there are to it, individual data source statistics and
+what type of underlying data object it is.
+
+WiredTiger maintains all dhandles in a global dhandle list accessed
+from the connection. Multiple sessions access this list, which is
+protected by an R/W lock. Each session also maintains a session
+dhandle cache, which is a cache of dhandles the session has operated
+upon. The entries in the cache are references into the global dhandle
+list.
+
+@section dhandle-creation dhandle creation
+
+When a cursor in a session attempts to access a WiredTiger table
+that has not been accessed before, the dhandle for the table will
+neither be in the session's dhandle cache nor in the connection's
+global dhandle list. A new dhandle will have to be created for this
+table. The cursor first attempts to find the dhandle in the session
+dhandle cache. If the dhandle is not in the session cache, the
+session searches the global dhandle list while holding the read
+lock. If the dhandle is not found there either, the session creates
+a dhandle for the table and inserts it into the global dhandle list
+while holding the write lock on that list. The cursor operation then
+puts a reference to the dhandle in the session's dhandle cache.
+
+There are two relevant reference counters in the dhandle structure,
+\c session_ref and \c session_inuse. \c session_ref counts the
+number of session dhandle cache lists that contain this dhandle.
+\c session_inuse is a count of cursors opened and operating on this
+dhandle. Both these counters are incremented by this session as the
+cursor attempts to use this dhandle. When the operation completes
+and the cursor is closed, the \c session_inuse count is decremented.
+The dhandle reference is not removed from the session dhandle cache
+immediately: keeping the reference cached is a performance
+optimization in case this session accesses the same table again in
+the near future.
+
+@section session-cache-sweep dhandle session cache sweep
+
+The dhandle cache sweep is the only way cleanup is performed on a
+session's dhandle cache list. References to dhandles that have not
+been accessed by this session in a long time are removed from the
+cache. Since a session is single-threaded, a session's dhandle cache
+can only be altered by that session alone.
+
+Each time a session accesses a dhandle, it checks if enough
+time has elapsed to do a session cache sweep for that session.
+As it walks the session dhandle cache list, it notices if any dhandle
+on its list has been marked dead (idle too long). If it has, the
+session removes that dhandle from its list and decrements the
+\c session_ref count.
+
+Since accessing a dhandle involves walking the session dhandle cache
+list anyway, cache cleanup is piggy-backed on this operation.
+
+@section dhandle-sweep sweep-server dhandle sweep
+
+WiredTiger maintains a background sweep server to clean up the global dhandle
+list. The sweep server periodically (\c close_scan_interval) revisits the
+dhandles in the global list and, for any dhandle that is not in use, i.e.,
+whose \c session_inuse count is 0, assigns the current time as the dhandle's
+time of death, if one has not already been assigned.
+
+If a dhandle already has a time of death set from a previous iteration
+of the dhandle sweep and has remained unused since, the sweep server
+compares the time of death with the current time to check if the dhandle has
+remained idle for the configured idle time (\c close_idle_time). If the dhandle
+has remained idle, the sweep server closes the associated btree contained in the
+dhandle and releases some of the resources for that dhandle. It also marks
+the dhandle as dead so that the next time a session with a reference walks its
+own cache list, it will see the handle marked dead and remove it from the
+session's dhandle cache list (see above).
+
+The sweep server then checks whether any session still references
+this dhandle, i.e., whether a session's dhandle cache still contains
+a reference to it. If at least one session still references it, i.e., the
+\c session_ref count is not 0, the dhandle can't be removed from the global
+list. If this dhandle is not referenced by any session, i.e., \c session_ref
+count is 0, the sweep server removes the dhandle from the global dhandle list
+and frees any remaining resources associated with it. Removing the dhandle
+from the global list thus completes the dhandle's lifecycle. Any future access
+of the associated table must start by creating the dhandle again.
+
+Note: The sweep server's scan interval and a dhandle's close idle time can be
+configured using \c file_manager configuration settings of the connection
+handle.
+
+*/
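
As the note above says, both sweep settings are tunable through the connection's \c file_manager configuration. A minimal sketch, not part of this changeset, with illustrative values:

    #include <wiredtiger.h>

    /*
     * Hypothetical helper: open a connection with the dhandle sweep tuned
     * to scan every 10 seconds and close handles idle for 60 seconds. The
     * values are illustrative only.
     */
    static int
    open_with_sweep_tuning(const char *home, WT_CONNECTION **connp)
    {
            return (wiredtiger_open(home, NULL,
                "create,file_manager=(close_scan_interval=10,close_idle_time=60)",
                connp));
    }
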
diff --git a/src/third_party/wiredtiger/src/docs/devdoc-index.dox b/src/third_party/wiredtiger/src/docs/devdoc-index.dox
index ba809d7af43..7ada556aa1a 100644
--- a/src/third_party/wiredtiger/src/docs/devdoc-index.dox
+++ b/src/third_party/wiredtiger/src/docs/devdoc-index.dox
@@ -9,4 +9,14 @@ see:
- @subpage devdoc-schema
+An internal structure called Data Handle (dhandle) is used to represent and
+access a table in WiredTiger. A dhandle is created when a table is accessed
+for the first time; it is kept in a global list and shared across sessions.
+When a dhandle is no longer needed and has been idle for a while, it is
+closed and destroyed, releasing all the resources associated with it.
+
+For more information on the lifecycle of a dhandle, see:
+
+- @subpage devdoc-dhandle-lifecycle
+
*/
diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok
index b336b5ab17a..78741bc0ba8 100644
--- a/src/third_party/wiredtiger/src/docs/spell.ok
+++ b/src/third_party/wiredtiger/src/docs/spell.ok
@@ -181,6 +181,7 @@ dbformat
dbm
dbt
decl
+decrementing
decrypt
decrypted
del
@@ -190,6 +191,8 @@ destructors
dev
devdoc
dhandle
+dhandles
+dhandle's
disjunction
disjunctions
distclean
@@ -272,6 +275,7 @@ intl
inuse
io
ip
+ie
je
jemalloc
jitter
@@ -296,6 +300,7 @@ libhe
libkvs
libtool
libwiredtiger
+lifecycle
lmin
ln
loadtext
diff --git a/src/third_party/wiredtiger/src/docs/transactions.dox b/src/third_party/wiredtiger/src/docs/transactions.dox
index 255cba4cb52..25845ce0209 100644
--- a/src/third_party/wiredtiger/src/docs/transactions.dox
+++ b/src/third_party/wiredtiger/src/docs/transactions.dox
@@ -202,6 +202,13 @@ WT_PREPARE_CONFLICT error will be returned indicating that it is not possible
to choose a version of data to return until a prepared transaction is resolved,
it is reasonable to retry such operations.
+Durability of the data updates performed by a prepared transaction, on tables
+configured with log=(enabled=false), can be controlled by specifying a durable
+timestamp during WT_SESSION::commit_transaction. Checkpoint will use the
+durable timestamp, instead of the commit timestamp, when persisting the data
+updates. If no durable timestamp is specified, the commit timestamp is used as
+the durable timestamp.
+
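
A minimal sketch, not part of this changeset, of the flow described in the paragraph above. It assumes the durable timestamp is passed as a durable_timestamp setting in the WT_SESSION::commit_transaction configuration; the hex timestamps are illustrative and the updates themselves are omitted:

    #include <wiredtiger.h>

    /*
     * Hypothetical helper: prepare a transaction and assign a durable
     * timestamp at commit. Timestamp values are illustrative hex strings.
     */
    static int
    commit_prepared_with_durable_ts(WT_SESSION *session)
    {
            int ret;

            if ((ret = session->begin_transaction(session, NULL)) != 0)
                    return (ret);
            /* ... updates on a table created with log=(enabled=false) ... */
            if ((ret = session->prepare_transaction(
                session, "prepare_timestamp=2a")) != 0)
                    return (ret);
            return (session->commit_transaction(session,
                "commit_timestamp=2b,durable_timestamp=2c"));
    }
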
@subsection timestamp_connection Managing global timestamp state
Applications that use timestamps need to manage some global state in order
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index f7ff274cfb8..27e26af0e18 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -102,8 +102,11 @@ struct __wt_btree {
#define WT_ASSERT_COMMIT_TS_ALWAYS 0x01u
#define WT_ASSERT_COMMIT_TS_KEYS 0x02u
#define WT_ASSERT_COMMIT_TS_NEVER 0x04u
-#define WT_ASSERT_READ_TS_ALWAYS 0x08u
-#define WT_ASSERT_READ_TS_NEVER 0x10u
+#define WT_ASSERT_DURABLE_TS_ALWAYS 0x08u
+#define WT_ASSERT_DURABLE_TS_KEYS 0x10u
+#define WT_ASSERT_DURABLE_TS_NEVER 0x20u
+#define WT_ASSERT_READ_TS_ALWAYS 0x40u
+#define WT_ASSERT_READ_TS_NEVER 0x80u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t assert_flags; /* Debugging assertion information */
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
index 3b45426f797..1812bd1a816 100644
--- a/src/third_party/wiredtiger/src/include/cell.i
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -17,8 +17,8 @@
* There are 4 basic cell types: keys and data (each of which has an overflow
* form), deleted cells and off-page references. The cell is usually followed
* by additional data, varying by type: keys are followed by a chunk of data,
- * data is followed by a pair of timestamps and a chunk of data, overflow and
- * off-page cells are followed by an address cookie.
+ * data is followed by optional timestamps and a chunk of data, overflow and
+ * off-page cells are followed by optional timestamps and an address cookie.
*
* Deleted cells are place-holders for column-store files, where entries cannot
* be removed in order to preserve the record count.
@@ -58,7 +58,8 @@
* Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
* (A run-length counter or a record number for variable-length column store.)
*
- * Bit 4 is unused.
+ * Bit 4 marks a value with associated timestamps (globally visible values don't
+ * require timestamps).
*
* Bits 5-8 are cell "types".
*/
@@ -71,13 +72,7 @@
#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */
#define WT_CELL_64V 0x04 /* Associated value */
-
-/*
- * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a
- * backward compatible way by adding bit 4 to the type mask and adding new types
- * that incorporate it.
- */
-#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */
+#define WT_CELL_TIMESTAMPS 0x08 /* Associated timestamps */
/*
* WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
@@ -110,13 +105,13 @@
#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK)
/*
- * When we aren't able to create a short key or value (and, in the case of a
- * value, there's no associated RLE), the key or value is at least 64B, else
- * we'd have been able to store it as a short cell. Decrement/Increment the
- * size before storing it, in the hopes that relatively small key/value sizes
- * will pack into a single byte instead of two bytes.
+ * When unable to create a short key or value (and where it wasn't an associated
+ * RLE or timestamps that prevented creating a short value), the data must be at
+ * least 64B, else we'd have used a short cell. When packing/unpacking the size,
+ * decrement/increment the size, in the hopes that a smaller size will pack into
+ * a single byte instead of two.
*/
-#define WT_CELL_SIZE_ADJUST 64
+#define WT_CELL_SIZE_ADJUST (WT_CELL_SHORT_MAX + 1)
/*
* WT_CELL --
@@ -193,12 +188,40 @@ __wt_timestamp_value_check(
*/
static inline void
__cell_pack_timestamp_value(WT_SESSION_IMPL *session,
- uint8_t **pp, wt_timestamp_t start_ts, wt_timestamp_t stop_ts)
+ uint8_t **pp, wt_timestamp_t *start_tsp, wt_timestamp_t *stop_tsp)
{
- __wt_timestamp_value_check(session, start_ts, stop_ts);
+ wt_timestamp_t start_ts, stop_ts;
- if (__wt_process.page_version_ts) {
- /* Start timestamp, stop timestamp difference. */
+ __wt_timestamp_value_check(session, *start_tsp, *stop_tsp);
+
+ /*
+ * Finalize the timestamps, checking if they're globally visible and
+ * won't need to be written.
+ *
+ * TIMESTAMP-FIXME
+ * Values (presumably) have associated transaction IDs, but we haven't
+ * yet decided how to handle them. For now, ignore them in determining
+ * value durability.
+ */
+ if (*start_tsp != WT_TS_NONE &&
+ __wt_txn_visible_all(session, WT_TXN_NONE, *start_tsp))
+ *start_tsp = WT_TS_NONE;
+
+ start_ts = *start_tsp;
+ stop_ts = *stop_tsp;
+
+ /*
+ * Historic versions and globally visible values don't have associated
+ * timestamps, else set a flag bit and store the packed timestamp pair.
+ */
+ if (!__wt_process.page_version_ts ||
+ (start_ts == WT_TS_NONE && stop_ts == WT_TS_MAX))
+ ++*pp;
+ else {
+ **pp |= WT_CELL_TIMESTAMPS;
+ ++*pp;
+
+ /* Store differences, not absolutes. */
(void)__wt_vpack_uint(pp, 0, start_ts);
(void)__wt_vpack_uint(pp, 0, stop_ts - start_ts);
}
@@ -235,11 +258,11 @@ __cell_pack_timestamp_addr(WT_SESSION_IMPL *session,
__wt_timestamp_addr_check(session,
oldest_start_ts, newest_start_ts, newest_stop_ts);
+ ++*pp;
if (__wt_process.page_version_ts) {
+ /* Store differences, not absolutes. */
(void)__wt_vpack_uint(pp, 0, oldest_start_ts);
-
- /* Newest start timestamp, stop timestamp difference. */
- (void)__wt_vpack_uint(pp, 0, newest_start_ts);
+ (void)__wt_vpack_uint(pp, 0, newest_start_ts - oldest_start_ts);
(void)__wt_vpack_uint(pp, 0, newest_stop_ts - newest_start_ts);
}
}
@@ -256,7 +279,10 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session,
{
uint8_t *p;
- p = cell->__chunk + 1;
+ /* Start building a cell: the descriptor byte starts zero. */
+ p = cell->__chunk;
+ *p = '\0';
+
__cell_pack_timestamp_addr(session,
&p, oldest_start_ts, newest_start_ts, newest_stop_ts);
@@ -271,32 +297,43 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session,
}
/*
- * __wt_cell_pack_data --
- * Set a data item's WT_CELL contents.
+ * __wt_cell_pack_value --
+ * Set a value item's WT_CELL contents.
*/
static inline size_t
-__wt_cell_pack_data(WT_SESSION_IMPL *session, WT_CELL *cell,
- wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle, size_t size)
+__wt_cell_pack_value(WT_SESSION_IMPL *session, WT_CELL *cell,
+ wt_timestamp_t *start_tsp, wt_timestamp_t *stop_tsp,
+ uint64_t rle, size_t size)
{
uint8_t byte, *p;
+ bool ts;
- p = cell->__chunk + 1;
- __cell_pack_timestamp_value(session, &p, start_ts, stop_ts);
+ /* Start building a cell: the descriptor byte starts zero. */
+ p = cell->__chunk;
+ *p = '\0';
+
+ __cell_pack_timestamp_value(session, &p, start_tsp, stop_tsp);
/*
- * Short data cells without run-length encoding have 6 bits of data
- * length in the descriptor byte.
+ * Short data cells without timestamps or run-length encoding have 6
+ * bits of data length in the descriptor byte.
*/
- if (rle < 2 && size <= WT_CELL_SHORT_MAX) {
+ ts = (cell->__chunk[0] & WT_CELL_TIMESTAMPS) != 0;
+ if (!ts && rle < 2 && size <= WT_CELL_SHORT_MAX) {
byte = (uint8_t)size; /* Type + length */
cell->__chunk[0] = (uint8_t)
((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT);
} else {
- if (rle < 2) {
+ /*
+ * If the size was what prevented us from using a short cell,
+ * it's larger than the adjustment size. Decrement/increment
+ * it when packing/unpacking so it takes up less room.
+ */
+ if (!ts && rle < 2) {
size -= WT_CELL_SIZE_ADJUST;
- cell->__chunk[0] = WT_CELL_VALUE; /* Type */
+ cell->__chunk[0] |= WT_CELL_VALUE; /* Type */
} else {
- cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_64V;
+ cell->__chunk[0] |= WT_CELL_VALUE | WT_CELL_64V;
(void)__wt_vpack_uint(&p, 0, rle); /* RLE */
}
(void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
@@ -305,17 +342,17 @@ __wt_cell_pack_data(WT_SESSION_IMPL *session, WT_CELL *cell,
}
/*
- * __wt_cell_pack_data_match --
- * Return if two items would have identical WT_CELLs (except for timestamps
- * and any RLE).
+ * __wt_cell_pack_value_match --
+ * Return if two value items would have identical WT_CELLs (except for
+ * timestamps and any RLE).
*/
static inline int
-__wt_cell_pack_data_match(WT_CELL *page_cell,
+__wt_cell_pack_value_match(WT_CELL *page_cell,
WT_CELL *val_cell, const uint8_t *val_data, bool *matchp)
{
uint64_t alen, blen, v;
const uint8_t *a, *b;
- bool rle;
+ bool rle, ts;
*matchp = false; /* Default to no-match */
@@ -323,8 +360,8 @@ __wt_cell_pack_data_match(WT_CELL *page_cell,
* This is a special-purpose function used by reconciliation to support
* dictionary lookups. We're passed an on-page cell and a created cell
* plus a chunk of data we're about to write on the page, and we return
- * if they would match on the page. The column-store comparison ignores
- * the timestamps and the RLE because the copied cell will have its own.
+ * if they would match on the page. Ignore timestamps and column-store
+ * RLE because the copied cell will have its own.
*/
a = (uint8_t *)page_cell;
b = (uint8_t *)val_cell;
@@ -332,14 +369,11 @@ __wt_cell_pack_data_match(WT_CELL *page_cell,
if (WT_CELL_SHORT_TYPE(a[0]) == WT_CELL_VALUE_SHORT) {
alen = a[0] >> WT_CELL_SHORT_SHIFT;
++a;
- if (__wt_process.page_version_ts) {
- WT_RET(__wt_vunpack_uint(&a, 0, &v)); /* Skip TS */
- WT_RET(__wt_vunpack_uint(&a, 0, &v));
- }
} else if (WT_CELL_TYPE(a[0]) == WT_CELL_VALUE) {
rle = (a[0] & WT_CELL_64V) != 0;
+ ts = (a[0] & WT_CELL_TIMESTAMPS) != 0;
++a;
- if (__wt_process.page_version_ts) {
+ if (ts) {
WT_RET(__wt_vunpack_uint(&a, 0, &v)); /* Skip TS */
WT_RET(__wt_vunpack_uint(&a, 0, &v));
}
@@ -352,14 +386,11 @@ __wt_cell_pack_data_match(WT_CELL *page_cell,
if (WT_CELL_SHORT_TYPE(b[0]) == WT_CELL_VALUE_SHORT) {
blen = b[0] >> WT_CELL_SHORT_SHIFT;
++b;
- if (__wt_process.page_version_ts) {
- WT_RET(__wt_vunpack_uint(&b, 0, &v)); /* Skip TS */
- WT_RET(__wt_vunpack_uint(&b, 0, &v));
- }
} else if (WT_CELL_TYPE(b[0]) == WT_CELL_VALUE) {
rle = (b[0] & WT_CELL_64V) != 0;
+ ts = (b[0] & WT_CELL_TIMESTAMPS) != 0;
++b;
- if (__wt_process.page_version_ts) {
+ if (ts) {
WT_RET(__wt_vunpack_uint(&b, 0, &v)); /* Skip TS */
WT_RET(__wt_vunpack_uint(&b, 0, &v));
}
@@ -380,17 +411,21 @@ __wt_cell_pack_data_match(WT_CELL *page_cell,
*/
static inline size_t
__wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell,
- wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle, uint64_t v)
+ wt_timestamp_t *start_tsp, wt_timestamp_t *stop_tsp,
+ uint64_t rle, uint64_t v)
{
uint8_t *p;
- p = cell->__chunk + 1;
- __cell_pack_timestamp_value(session, &p, start_ts, stop_ts);
+ /* Start building a cell: the descriptor byte starts zero. */
+ p = cell->__chunk;
+ *p = '\0';
+
+ __cell_pack_timestamp_value(session, &p, start_tsp, stop_tsp);
if (rle < 2)
- cell->__chunk[0] = WT_CELL_VALUE_COPY; /* Type */
+ cell->__chunk[0] |= WT_CELL_VALUE_COPY; /* Type */
else {
- cell->__chunk[0] = /* Type */
+ cell->__chunk[0] |= /* Type */
WT_CELL_VALUE_COPY | WT_CELL_64V;
(void)__wt_vpack_uint(&p, 0, rle); /* RLE */
}
@@ -404,17 +439,20 @@ __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell,
*/
static inline size_t
__wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell,
- wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle)
+ wt_timestamp_t *start_tsp, wt_timestamp_t *stop_tsp, uint64_t rle)
{
uint8_t *p;
- p = cell->__chunk + 1;
- __cell_pack_timestamp_value(session, &p, start_ts, stop_ts);
+ /* Start building a cell: the descriptor byte starts zero. */
+ p = cell->__chunk;
+ *p = '\0';
+
+ __cell_pack_timestamp_value(session, &p, start_tsp, stop_tsp);
if (rle < 2)
- cell->__chunk[0] = WT_CELL_DEL; /* Type */
+ cell->__chunk[0] |= WT_CELL_DEL; /* Type */
else {
- cell->__chunk[0] = /* Type */
+ cell->__chunk[0] |= /* Type */
WT_CELL_DEL | WT_CELL_64V;
(void)__wt_vpack_uint(&p, 0, rle); /* RLE */
}
@@ -441,6 +479,11 @@ __wt_cell_pack_int_key(WT_CELL *cell, size_t size)
cell->__chunk[0] = WT_CELL_KEY; /* Type */
p = cell->__chunk + 1;
+ /*
+ * If the size prevented us from using a short cell, it's larger than
+ * the adjustment size. Decrement/increment it when packing/unpacking
+ * so it takes up less room.
+ */
size -= WT_CELL_SIZE_ADJUST;
(void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
return (WT_PTRDIFF(p, cell));
@@ -463,10 +506,10 @@ __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT);
return (1);
}
- byte = (uint8_t)size; /* Type + length */
+ byte = (uint8_t)size; /* Type + length */
cell->__chunk[0] = (uint8_t)
((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT_PFX);
- cell->__chunk[1] = prefix; /* Prefix */
+ cell->__chunk[1] = prefix; /* Prefix */
return (2);
}
@@ -479,6 +522,11 @@ __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
p = cell->__chunk + 2;
}
+ /*
+ * If the size prevented us from using a short cell, it's larger than
+ * the adjustment size. Decrement/increment it when packing/unpacking
+ * so it takes up less room.
+ */
size -= WT_CELL_SIZE_ADJUST;
(void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
return (WT_PTRDIFF(p, cell));
@@ -490,25 +538,30 @@ __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
*/
static inline size_t
__wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type,
- wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle, size_t size)
+ wt_timestamp_t *start_tsp, wt_timestamp_t *stop_tsp,
+ uint64_t rle, size_t size)
{
uint8_t *p;
- p = cell->__chunk + 1;
+ /* Start building a cell: the descriptor byte starts zero. */
+ p = cell->__chunk;
+ *p = '\0';
+
switch (type) {
case WT_CELL_KEY_OVFL:
case WT_CELL_KEY_OVFL_RM:
+ ++p;
break;
case WT_CELL_VALUE_OVFL:
case WT_CELL_VALUE_OVFL_RM:
- __cell_pack_timestamp_value(session, &p, start_ts, stop_ts);
+ __cell_pack_timestamp_value(session, &p, start_tsp, stop_tsp);
break;
}
if (rle < 2)
- cell->__chunk[0] = type; /* Type */
+ cell->__chunk[0] |= type; /* Type */
else {
- cell->__chunk[0] = type | WT_CELL_64V; /* Type */
+ cell->__chunk[0] |= type | WT_CELL_64V; /* Type */
(void)__wt_vpack_uint(&p, 0, rle); /* RLE */
}
(void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
@@ -690,7 +743,7 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk,
* NB: when unpacking a WT_CELL_VALUE_COPY cell, unpack.cell is returned
* as the original cell, not the copied cell (in other words, data from
* the copied cell must be available from unpack after we return, as our
- * caller has no way to find the copied cell.
+ * caller has no way to find the copied cell).
*/
WT_CELL_LEN_CHK(cell, 0);
unpack->cell = cell;
@@ -714,8 +767,8 @@ restart:
/*
* Handle cells with neither RLE counts, timestamps or a data length:
- * short key cells have 6 bits of data length in the descriptor byte
- * and nothing else.
+ * short key/data cells have 6 bits of data length in the descriptor
+ * byte and nothing else.
*/
switch (unpack->raw) {
case WT_CELL_KEY_SHORT_PFX:
@@ -726,6 +779,7 @@ restart:
unpack->__len = 2 + unpack->size;
goto done;
case WT_CELL_KEY_SHORT:
+ case WT_CELL_VALUE_SHORT:
unpack->prefix = 0;
unpack->data = cell->__chunk + 1;
unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
@@ -742,55 +796,52 @@ restart:
/*
* Check for a prefix byte that optionally follows the cell descriptor
- * byte on row-store leaf pages.
+ * byte in keys on row-store leaf pages.
*/
if (unpack->raw == WT_CELL_KEY_PFX) {
- ++p; /* skip prefix */
+ unpack->prefix = *p++; /* skip prefix */
WT_CELL_LEN_CHK(p, 0);
- unpack->prefix = cell->__chunk[1];
}
/* Check for timestamps. */
- if (dsk->version >= WT_PAGE_VERSION_TS)
- switch (unpack->raw) {
- case WT_CELL_ADDR_DEL:
- case WT_CELL_ADDR_INT:
- case WT_CELL_ADDR_LEAF:
- case WT_CELL_ADDR_LEAF_NO:
- WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 :
- WT_PTRDIFF(end, p), &unpack->oldest_start_ts));
- WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 :
- WT_PTRDIFF(end, p), &unpack->newest_start_ts));
- WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 :
- WT_PTRDIFF(end, p), &unpack->newest_stop_ts));
- unpack->newest_stop_ts += unpack->newest_start_ts;
-
- __wt_timestamp_addr_check(session,
- unpack->oldest_start_ts,
- unpack->newest_start_ts, unpack->newest_stop_ts);
+ switch (unpack->raw) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ if (!__wt_process.page_version_ts)
break;
- case WT_CELL_DEL:
- case WT_CELL_VALUE:
- case WT_CELL_VALUE_COPY:
- case WT_CELL_VALUE_OVFL:
- case WT_CELL_VALUE_OVFL_RM:
- case WT_CELL_VALUE_SHORT:
- WT_RET(__wt_vunpack_uint(&p, end == NULL ?
- 0 : WT_PTRDIFF(end, p), &unpack->start_ts));
- WT_RET(__wt_vunpack_uint(&p, end == NULL ?
- 0 : WT_PTRDIFF(end, p), &unpack->stop_ts));
- unpack->stop_ts += unpack->start_ts;
-
- __wt_timestamp_value_check(
- session, unpack->start_ts, unpack->stop_ts);
+
+ WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 :
+ WT_PTRDIFF(end, p), &unpack->oldest_start_ts));
+ WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 :
+ WT_PTRDIFF(end, p), &unpack->newest_start_ts));
+ unpack->newest_start_ts += unpack->oldest_start_ts;
+ WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 :
+ WT_PTRDIFF(end, p), &unpack->newest_stop_ts));
+ unpack->newest_stop_ts += unpack->newest_start_ts;
+
+ __wt_timestamp_addr_check(session,
+ unpack->oldest_start_ts,
+ unpack->newest_start_ts, unpack->newest_stop_ts);
+ break;
+ case WT_CELL_DEL:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_COPY:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ if ((cell->__chunk[0] & WT_CELL_TIMESTAMPS) == 0)
break;
- }
- if (unpack->raw == WT_CELL_VALUE_SHORT) {
- unpack->data = p;
- unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
- unpack->__len = WT_PTRDIFF32(p, cell) + unpack->size;
- goto done;
+ WT_RET(__wt_vunpack_uint(&p, end == NULL ?
+ 0 : WT_PTRDIFF(end, p), &unpack->start_ts));
+ WT_RET(__wt_vunpack_uint(&p, end == NULL ?
+ 0 : WT_PTRDIFF(end, p), &unpack->stop_ts));
+ unpack->stop_ts += unpack->start_ts;
+
+ __wt_timestamp_value_check(
+ session, unpack->start_ts, unpack->stop_ts);
+ break;
}
/*
@@ -848,14 +899,21 @@ restart:
WT_RET(__wt_vunpack_uint(
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
+ /*
+ * If the size was what prevented us from using a short cell,
+ * it's larger than the adjustment size. Decrement/increment
+ * it when packing/unpacking so it takes up less room.
+ */
if (unpack->raw == WT_CELL_KEY ||
unpack->raw == WT_CELL_KEY_PFX ||
- (unpack->raw == WT_CELL_VALUE && unpack->v == 0))
+ (unpack->raw == WT_CELL_VALUE &&
+ unpack->v == 0 &&
+ (cell->__chunk[0] & WT_CELL_TIMESTAMPS) == 0))
v += WT_CELL_SIZE_ADJUST;
unpack->data = p;
unpack->size = (uint32_t)v;
- unpack->__len = WT_PTRDIFF32(p + unpack->size, cell);
+ unpack->__len = WT_PTRDIFF32(p, cell) + unpack->size;
break;
case WT_CELL_DEL:
diff --git a/src/third_party/wiredtiger/src/include/compact.h b/src/third_party/wiredtiger/src/include/compact.h
index df14ed6c5e4..3a1f54ca294 100644
--- a/src/third_party/wiredtiger/src/include/compact.h
+++ b/src/third_party/wiredtiger/src/include/compact.h
@@ -10,6 +10,7 @@ struct __wt_compact_state {
uint32_t lsm_count; /* Number of LSM trees seen */
uint32_t file_count; /* Number of files seen */
uint64_t max_time; /* Configured timeout */
+ uint64_t prog_msg_count; /* Progress message count */
struct timespec begin; /* Starting time */
};
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 280d7e32f7d..b6100ae134d 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -425,35 +425,36 @@ struct __wt_connection_impl {
#define WT_VERB_CHECKPOINT 0x000000004u
#define WT_VERB_CHECKPOINT_PROGRESS 0x000000008u
#define WT_VERB_COMPACT 0x000000010u
-#define WT_VERB_ERROR_RETURNS 0x000000020u
-#define WT_VERB_EVICT 0x000000040u
-#define WT_VERB_EVICTSERVER 0x000000080u
-#define WT_VERB_EVICT_STUCK 0x000000100u
-#define WT_VERB_FILEOPS 0x000000200u
-#define WT_VERB_HANDLEOPS 0x000000400u
-#define WT_VERB_LOG 0x000000800u
-#define WT_VERB_LOOKASIDE 0x000001000u
-#define WT_VERB_LOOKASIDE_ACTIVITY 0x000002000u
-#define WT_VERB_LSM 0x000004000u
-#define WT_VERB_LSM_MANAGER 0x000008000u
-#define WT_VERB_METADATA 0x000010000u
-#define WT_VERB_MUTEX 0x000020000u
-#define WT_VERB_OVERFLOW 0x000040000u
-#define WT_VERB_READ 0x000080000u
-#define WT_VERB_REBALANCE 0x000100000u
-#define WT_VERB_RECONCILE 0x000200000u
-#define WT_VERB_RECOVERY 0x000400000u
-#define WT_VERB_RECOVERY_PROGRESS 0x000800000u
-#define WT_VERB_SALVAGE 0x001000000u
-#define WT_VERB_SHARED_CACHE 0x002000000u
-#define WT_VERB_SPLIT 0x004000000u
-#define WT_VERB_TEMPORARY 0x008000000u
-#define WT_VERB_THREAD_GROUP 0x010000000u
-#define WT_VERB_TIMESTAMP 0x020000000u
-#define WT_VERB_TRANSACTION 0x040000000u
-#define WT_VERB_VERIFY 0x080000000u
-#define WT_VERB_VERSION 0x100000000u
-#define WT_VERB_WRITE 0x200000000u
+#define WT_VERB_COMPACT_PROGRESS 0x000000020u
+#define WT_VERB_ERROR_RETURNS 0x000000040u
+#define WT_VERB_EVICT 0x000000080u
+#define WT_VERB_EVICTSERVER 0x000000100u
+#define WT_VERB_EVICT_STUCK 0x000000200u
+#define WT_VERB_FILEOPS 0x000000400u
+#define WT_VERB_HANDLEOPS 0x000000800u
+#define WT_VERB_LOG 0x000001000u
+#define WT_VERB_LOOKASIDE 0x000002000u
+#define WT_VERB_LOOKASIDE_ACTIVITY 0x000004000u
+#define WT_VERB_LSM 0x000008000u
+#define WT_VERB_LSM_MANAGER 0x000010000u
+#define WT_VERB_METADATA 0x000020000u
+#define WT_VERB_MUTEX 0x000040000u
+#define WT_VERB_OVERFLOW 0x000080000u
+#define WT_VERB_READ 0x000100000u
+#define WT_VERB_REBALANCE 0x000200000u
+#define WT_VERB_RECONCILE 0x000400000u
+#define WT_VERB_RECOVERY 0x000800000u
+#define WT_VERB_RECOVERY_PROGRESS 0x001000000u
+#define WT_VERB_SALVAGE 0x002000000u
+#define WT_VERB_SHARED_CACHE 0x004000000u
+#define WT_VERB_SPLIT 0x008000000u
+#define WT_VERB_TEMPORARY 0x010000000u
+#define WT_VERB_THREAD_GROUP 0x020000000u
+#define WT_VERB_TIMESTAMP 0x040000000u
+#define WT_VERB_TRANSACTION 0x080000000u
+#define WT_VERB_VERIFY 0x100000000u
+#define WT_VERB_VERSION 0x200000000u
+#define WT_VERB_WRITE 0x400000000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint64_t verbose;
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index 0e5905f491e..33d6660e687 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -217,6 +217,14 @@ struct __wt_cursor_btree {
uint8_t append_tree; /* Cursor appended to the tree */
+ /*
+ * We have to restart cursor next/prev after a prepare conflict. Keep
+ * the state of the cursor separately so we can restart at exactly the
+ * right point.
+ */
+ enum { WT_CBT_RETRY_NOTSET=0,
+ WT_CBT_RETRY_INSERT, WT_CBT_RETRY_PAGE } iter_retry;
+
#ifdef HAVE_DIAGNOSTIC
/* Check that cursor next/prev never returns keys out-of-order. */
WT_ITEM *lastkey, _lastkey;
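
The iter_retry state above is what lets a cursor next/prev call that failed with WT_PREPARE_CONFLICT resume at exactly the same update once the prepared transaction resolves (WT-4537, WT-4617). A hedged application-side sketch of the retry loop this enables; the helper name and the usleep() back-off are assumptions, while WT_PREPARE_CONFLICT and WT_CURSOR::next are public API:

#include <unistd.h>
#include <wiredtiger.h>

/*
 * Illustrative only: retry cursor traversal when a prepared, unresolved
 * update is encountered.  The retried call resumes at the same position.
 */
static int
next_with_prepare_retry(WT_CURSOR *cursor)
{
	int ret;

	while ((ret = cursor->next(cursor)) == WT_PREPARE_CONFLICT)
		(void)usleep(100);	/* Back off until the prepare resolves. */
	return (ret);
}
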
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
index 351a5cd7abe..3c08f808c62 100644
--- a/src/third_party/wiredtiger/src/include/cursor.i
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -473,45 +473,3 @@ value:
__wt_row_leaf_value_cell(session, page, rip, kpack, vpack);
return (__wt_page_cell_data_ref(session, cbt->ref->page, vpack, vb));
}
-/*
- * __cursor_check_prepared_update --
- * Return whether prepared update at current position is visible or not.
- */
-static inline int
-__cursor_check_prepared_update(WT_CURSOR_BTREE *cbt, bool *visiblep)
-{
- WT_SESSION_IMPL *session;
- WT_UPDATE *upd;
-
- session = (WT_SESSION_IMPL *)cbt->iface.session;
- /*
- * When retrying an operation due to a prepared conflict, the cursor is
- * at an update list which resulted in conflict. So, when retrying we
- * should examine the same update again instead of iterating to the next
- * object. We'll eventually find a valid update, else return
- * prepare-conflict until resolved.
- */
- WT_RET(__wt_cursor_valid(cbt, &upd, visiblep));
-
- /* The update that returned prepared conflict is now visible. */
- F_CLR(cbt, WT_CBT_ITERATE_RETRY_NEXT | WT_CBT_ITERATE_RETRY_PREV);
- if (*visiblep) {
- /*
- * The underlying key-return function uses a comparison value
- * of 0 to indicate the search function has pre-built the key
- * we want to return. That's not the case, don't take that path.
- */
- cbt->compare = 1;
- /*
- * If a prepared delete operation is resolved, it will be
- * visible, but key is not valid. The update will be null in
- * that case and we continue with cursor navigation.
- */
- if (upd != NULL)
- WT_RET(__cursor_kv_return(session, cbt, upd));
- else
- *visiblep = false;
- }
-
- return (0);
-}
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 4fbb1d8065c..2bc60a1a85d 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -621,7 +621,7 @@ extern int __wt_schema_create(WT_SESSION_IMPL *session, const char *uri, const c
extern int __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_schema_get_table_uri(WT_SESSION_IMPL *session, const char *uri, bool ok_incomplete, uint32_t flags, WT_TABLE **tablep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_schema_get_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, bool ok_incomplete, uint32_t flags, WT_TABLE **tablep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE **tablep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP **colgroupp);
extern int __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX **idxp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_schema_close_table(WT_SESSION_IMPL *session, WT_TABLE *table) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index c9de286c34c..32d2433a9af 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -24,18 +24,20 @@
#define WT_DIVIDER "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
/* Basic constants. */
-#define WT_THOUSAND (1000)
-#define WT_MILLION (1000000)
-#define WT_BILLION (1000000000)
-
-#define WT_MINUTE (60)
-
-#define WT_KILOBYTE (1024)
-#define WT_MEGABYTE (1048576)
-#define WT_GIGABYTE (1073741824)
-#define WT_TERABYTE ((uint64_t)1099511627776)
-#define WT_PETABYTE ((uint64_t)1125899906842624)
-#define WT_EXABYTE ((uint64_t)1152921504606846976)
+#define WT_THOUSAND (1000)
+#define WT_MILLION (1000000)
+#define WT_BILLION (1000000000)
+
+#define WT_MINUTE (60)
+
+#define WT_PROGRESS_MSG_PERIOD (20)
+
+#define WT_KILOBYTE (1024)
+#define WT_MEGABYTE (1048576)
+#define WT_GIGABYTE (1073741824)
+#define WT_TERABYTE ((uint64_t)1099511627776)
+#define WT_PETABYTE ((uint64_t)1125899906842624)
+#define WT_EXABYTE ((uint64_t)1152921504606846976)
/*
* Sizes that cannot be larger than 2**32 are stored in uint32_t fields in
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index a7b252b0bef..f7b41bc7784 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -40,7 +40,7 @@ struct __wt_hazard {
typedef TAILQ_HEAD(__wt_cursor_list, __wt_cursor) WT_CURSOR_LIST;
/* Number of cursors cached to trigger cursor sweep. */
-#define WT_SESSION_CURSOR_SWEEP_COUNTDOWN 20
+#define WT_SESSION_CURSOR_SWEEP_COUNTDOWN 40
/* Minimum number of buckets to visit during cursor sweep. */
#define WT_SESSION_CURSOR_SWEEP_MIN 5
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 37b0421962b..cce5bd4cbc7 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -319,24 +319,28 @@ struct __wt_txn {
const char *rollback_reason; /* If rollback, the reason */
/* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_TXN_AUTOCOMMIT 0x00001u
-#define WT_TXN_ERROR 0x00002u
-#define WT_TXN_HAS_ID 0x00004u
-#define WT_TXN_HAS_SNAPSHOT 0x00008u
-#define WT_TXN_HAS_TS_COMMIT 0x00010u
-#define WT_TXN_HAS_TS_READ 0x00020u
-#define WT_TXN_IGNORE_PREPARE 0x00040u
-#define WT_TXN_NAMED_SNAPSHOT 0x00080u
-#define WT_TXN_PREPARE 0x00100u
-#define WT_TXN_PUBLIC_TS_COMMIT 0x00200u
-#define WT_TXN_PUBLIC_TS_READ 0x00400u
-#define WT_TXN_READONLY 0x00800u
-#define WT_TXN_RUNNING 0x01000u
-#define WT_TXN_SYNC_SET 0x02000u
-#define WT_TXN_TS_COMMIT_ALWAYS 0x04000u
-#define WT_TXN_TS_COMMIT_KEYS 0x08000u
-#define WT_TXN_TS_COMMIT_NEVER 0x10000u
-#define WT_TXN_UPDATE 0x20000u
+#define WT_TXN_AUTOCOMMIT 0x000001u
+#define WT_TXN_ERROR 0x000002u
+#define WT_TXN_HAS_ID 0x000004u
+#define WT_TXN_HAS_SNAPSHOT 0x000008u
+#define WT_TXN_HAS_TS_COMMIT 0x000010u
+#define WT_TXN_HAS_TS_DURABLE 0x000020u
+#define WT_TXN_HAS_TS_READ 0x000040u
+#define WT_TXN_IGNORE_PREPARE 0x000080u
+#define WT_TXN_NAMED_SNAPSHOT 0x000100u
+#define WT_TXN_PREPARE 0x000200u
+#define WT_TXN_PUBLIC_TS_COMMIT 0x000400u
+#define WT_TXN_PUBLIC_TS_READ 0x000800u
+#define WT_TXN_READONLY 0x001000u
+#define WT_TXN_RUNNING 0x002000u
+#define WT_TXN_SYNC_SET 0x004000u
+#define WT_TXN_TS_COMMIT_ALWAYS 0x008000u
+#define WT_TXN_TS_COMMIT_KEYS 0x010000u
+#define WT_TXN_TS_COMMIT_NEVER 0x020000u
+#define WT_TXN_TS_DURABLE_ALWAYS 0x040000u
+#define WT_TXN_TS_DURABLE_KEYS 0x080000u
+#define WT_TXN_TS_DURABLE_NEVER 0x100000u
+#define WT_TXN_UPDATE 0x200000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 7ba90887513..6ce224ec65a 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -35,6 +35,12 @@ __wt_txn_timestamp_flags(WT_SESSION_IMPL *session)
F_SET(&session->txn, WT_TXN_TS_COMMIT_KEYS);
if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER))
F_SET(&session->txn, WT_TXN_TS_COMMIT_NEVER);
+ if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_ALWAYS))
+ F_SET(&session->txn, WT_TXN_TS_DURABLE_ALWAYS);
+ if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_KEYS))
+ F_SET(&session->txn, WT_TXN_TS_DURABLE_KEYS);
+ if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_NEVER))
+ F_SET(&session->txn, WT_TXN_TS_DURABLE_NEVER);
}
/*
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 332bd58f7e6..c2fcb720575 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -2410,14 +2410,15 @@ struct __wt_connection {
* <code>"verbose=[evictserver\,read]"</code>., a list\, with values
* chosen from the following options: \c "api"\, \c "block"\, \c
* "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, \c
- * "error_returns"\, \c "evict"\, \c "evict_stuck"\, \c "evictserver"\,
- * \c "fileops"\, \c "handleops"\, \c "log"\, \c "lookaside"\, \c
- * "lookaside_activity"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\,
- * \c "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c
- * "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c "salvage"\,
- * \c "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\,
- * \c "timestamp"\, \c "transaction"\, \c "verify"\, \c "version"\, \c
- * "write"; default empty.}
+ * "compact_progress"\, \c "error_returns"\, \c "evict"\, \c
+ * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c
+ * "log"\, \c "lookaside"\, \c "lookaside_activity"\, \c "lsm"\, \c
+ * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
+ * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
+ * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\,
+ * \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c
+ * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default
+ * empty.}
* @configend
* @errors
*/
@@ -3120,14 +3121,14 @@ struct __wt_connection {
* @config{verbose, enable messages for various events. Options are given as a
* list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with
* values chosen from the following options: \c "api"\, \c "block"\, \c
- * "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, \c "error_returns"\,
- * \c "evict"\, \c "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c
- * "handleops"\, \c "log"\, \c "lookaside"\, \c "lookaside_activity"\, \c
- * "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
- * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
- * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c
- * "temporary"\, \c "thread_group"\, \c "timestamp"\, \c "transaction"\, \c
- * "verify"\, \c "version"\, \c "write"; default empty.}
+ * "checkpoint"\, \c "checkpoint_progress"\, \c "compact"\, \c
+ * "compact_progress"\, \c "error_returns"\, \c "evict"\, \c "evict_stuck"\, \c
+ * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lookaside"\,
+ * \c "lookaside_activity"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c
+ * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c
+ * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c
+ * "split"\, \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c
+ * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default empty.}
* @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to
* files. Ignored on non-Windows systems. Options are given as a list\, such
* as <code>"write_through=[data]"</code>. Configuring \c write_through requires
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index d93f6a3be7f..87437cd9ed3 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -393,9 +393,9 @@ typedef uint64_t wt_timestamp_t;
#include "buf.i" /* required by cell.i */
#include "cache.i" /* required by txn.i */
+#include "mutex.i" /* required by txn.i */
+#include "txn.i" /* required by cell.i */
#include "cell.i" /* required by btree.i */
-#include "mutex.i" /* required by btree.i */
-#include "txn.i" /* required by btree.i */
#include "bitstring.i"
#include "btree.i" /* required by cursor.i */
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
index 3bd717b35cf..3531440e76f 100644
--- a/src/third_party/wiredtiger/src/meta/meta_turtle.c
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -189,11 +189,12 @@ int
__wt_turtle_init(WT_SESSION_IMPL *session)
{
WT_DECL_RET;
- char *metaconf;
- bool exist_backup, exist_incr, exist_isrc, exist_turtle, load;
+ char *metaconf, *unused_value;
+ bool exist_backup, exist_incr, exist_isrc, exist_turtle;
+ bool load, loadTurtle;
metaconf = NULL;
- load = false;
+ load = loadTurtle = false;
/*
* Discard any turtle setup file left-over from previous runs. This
@@ -202,6 +203,7 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
WT_RET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET, false));
/*
+	 * If we find a corrupted turtle file, delete it and create a new one.
* We could die after creating the turtle file and before creating the
* metadata file, or worse, the metadata file might be in some random
* state. Make sure that doesn't happen: if we don't find the turtle
@@ -221,6 +223,21 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
WT_RET(__wt_fs_exist(session, WT_METADATA_TURTLE, &exist_turtle));
if (exist_turtle) {
/*
+ * Failure to read means a bad turtle file. Remove it and create
+ * a new turtle file.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_SALVAGE))
+ WT_WITH_TURTLE_LOCK(session,
+ ret = __wt_turtle_read(session,
+ WT_METAFILE_URI, &unused_value));
+
+ if (ret != 0) {
+ WT_RET(__wt_remove_if_exists(
+ session, WT_METADATA_TURTLE, false));
+ loadTurtle = true;
+ }
+
+ /*
* We need to detect the difference between a source database
* that may have crashed with an incremental backup file
* and a destination database that incorrectly ran recovery.
@@ -258,7 +275,9 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
/* Create any bulk-loaded file stubs. */
WT_RET(__metadata_load_bulk(session));
+ }
+ if (load || loadTurtle) {
/* Create the turtle file. */
WT_RET(__metadata_config(session, &metaconf));
WT_WITH_TURTLE_LOCK(session, ret =
@@ -329,8 +348,11 @@ err: WT_TRET(__wt_fclose(session, &fs));
* A file error or a missing key/value pair in the turtle file means
* something has gone horribly wrong, except for the compatibility
* setting which is optional.
+ * Failure to read the turtle file when salvaging means it can't be
+ * used for salvage.
*/
- if (ret == 0 || strcmp(key, WT_METADATA_COMPAT) == 0)
+ if (ret == 0 || strcmp(key, WT_METADATA_COMPAT) == 0 ||
+ F_ISSET(S2C(session), WT_CONN_SALVAGE))
return (ret);
WT_PANIC_RET(session, ret,
"%s: fatal turtle file read error", WT_METADATA_TURTLE);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 866eab349b7..a63d3eab361 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -295,9 +295,10 @@ static int __rec_cell_build_int_key(WT_SESSION_IMPL *,
static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *,
WT_RECONCILE *, const void *, size_t, bool *);
static int __rec_cell_build_ovfl(WT_SESSION_IMPL *, WT_RECONCILE *,
- WT_KV *, uint8_t, wt_timestamp_t, wt_timestamp_t, uint64_t);
+ WT_KV *, uint8_t, wt_timestamp_t *, wt_timestamp_t *, uint64_t);
static int __rec_cell_build_val(WT_SESSION_IMPL *, WT_RECONCILE *,
- const void *, size_t, wt_timestamp_t, wt_timestamp_t, uint64_t);
+ const void *, size_t, wt_timestamp_t *, wt_timestamp_t *,
+ uint64_t);
static void __rec_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *);
static int __rec_col_fix_slvg(WT_SESSION_IMPL *,
@@ -1933,7 +1934,8 @@ __rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv)
*/
static int
__rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle, WT_KV *val)
+ wt_timestamp_t *start_tsp, wt_timestamp_t *stop_tsp,
+ uint64_t rle, WT_KV *val)
{
WT_DICTIONARY *dp;
uint64_t offset;
@@ -1976,7 +1978,7 @@ __rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r,
offset = (uint64_t)WT_PTRDIFF(r->first_free,
(uint8_t *)r->cur_ptr->image.mem + dp->offset);
val->len = val->cell_len = __wt_cell_pack_copy(
- session, &val->cell, start_ts, stop_ts, rle, offset);
+ session, &val->cell, start_tsp, stop_tsp, rle, offset);
val->buf.data = NULL;
val->buf.size = 0;
}
@@ -3473,18 +3475,21 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_CURSOR *cursor;
WT_KV *key, *val;
WT_RECONCILE *r;
+ wt_timestamp_t start_ts, stop_ts;
bool ovfl_key;
r = cbulk->reconcile;
btree = S2BT(session);
cursor = &cbulk->cbt.iface;
+ start_ts = WT_TS_NONE;
+ stop_ts = WT_TS_MAX;
key = &r->k;
val = &r->v;
WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */
cursor->key.data, cursor->key.size, &ovfl_key));
WT_RET(__rec_cell_build_val(session, r, /* Build value cell */
- cursor->value.data, cursor->value.size, WT_TS_NONE, WT_TS_MAX, 0));
+ cursor->value.data, cursor->value.size, &start_ts, &stop_ts, 0));
/* Boundary: split or write the page. */
if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) {
@@ -3511,10 +3516,10 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
r->all_empty_value = false;
if (btree->dictionary)
WT_RET(__rec_dict_replace(
- session, r, WT_TS_NONE, WT_TS_MAX, 0, val));
+ session, r, &start_ts, &stop_ts, 0, val));
__rec_image_copy(session, r, val);
}
- __rec_addr_ts_update(r, WT_TS_NONE, WT_TS_NONE, WT_TS_MAX);
+ __rec_addr_ts_update(r, start_ts, start_ts, stop_ts);
/* Update compression state. */
__rec_key_state_update(r, ovfl_key);
@@ -3632,14 +3637,17 @@ __wt_bulk_insert_var(
WT_BTREE *btree;
WT_KV *val;
WT_RECONCILE *r;
+ wt_timestamp_t start_ts, stop_ts;
r = cbulk->reconcile;
btree = S2BT(session);
+ start_ts = WT_TS_NONE;
+ stop_ts = WT_TS_MAX;
val = &r->v;
if (deleted) {
val->cell_len = __wt_cell_pack_del(
- session, &val->cell, WT_TS_NONE, WT_TS_MAX, cbulk->rle);
+ session, &val->cell, &start_ts, &stop_ts, cbulk->rle);
val->buf.data = NULL;
val->buf.size = 0;
val->len = val->cell_len;
@@ -3651,7 +3659,7 @@ __wt_bulk_insert_var(
*/
WT_RET(__rec_cell_build_val(session,
r, cbulk->last.data, cbulk->last.size,
- WT_TS_NONE, WT_TS_MAX, cbulk->rle));
+ &start_ts, &stop_ts, cbulk->rle));
/* Boundary: split or write the page. */
if (WT_CROSSING_SPLIT_BND(r, val->len))
@@ -3660,9 +3668,9 @@ __wt_bulk_insert_var(
/* Copy the value onto the page. */
if (btree->dictionary)
WT_RET(__rec_dict_replace(
- session, r, WT_TS_NONE, WT_TS_MAX, cbulk->rle, val));
+ session, r, &start_ts, &stop_ts, cbulk->rle, val));
__rec_image_copy(session, r, val);
- __rec_addr_ts_update(r, WT_TS_NONE, WT_TS_NONE, WT_TS_MAX);
+ __rec_addr_ts_update(r, start_ts, start_ts, stop_ts);
/* Update the starting record number in case we split. */
r->recno += cbulk->rle;
@@ -4090,19 +4098,19 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (deleted) {
val->cell_len = __wt_cell_pack_del(
- session, &val->cell, start_ts, stop_ts, rle);
+ session, &val->cell, &start_ts, &stop_ts, rle);
val->buf.data = NULL;
val->buf.size = 0;
val->len = val->cell_len;
} else if (overflow_type) {
val->cell_len = __wt_cell_pack_ovfl(session, &val->cell,
- WT_CELL_VALUE_OVFL, start_ts, stop_ts, rle, value->size);
+ WT_CELL_VALUE_OVFL, &start_ts, &stop_ts, rle, value->size);
val->buf.data = value->data;
val->buf.size = value->size;
val->len = val->cell_len + value->size;
} else
WT_RET(__rec_cell_build_val(session,
- r, value->data, value->size, start_ts, stop_ts, rle));
+ r, value->data, value->size, &start_ts, &stop_ts, rle));
/* Boundary: split or write the page. */
if (__rec_need_split(r, val->len))
@@ -4111,7 +4119,7 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
/* Copy the value onto the page. */
if (!deleted && !overflow_type && btree->dictionary)
WT_RET(__rec_dict_replace(
- session, r, start_ts, stop_ts, rle, val));
+ session, r, &start_ts, &stop_ts, rle, val));
__rec_image_copy(session, r, val);
__rec_addr_ts_update(r, start_ts, start_ts, stop_ts);
@@ -4986,6 +4994,27 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
}
/*
+ * __rec_row_zero_len --
+ * Return if a zero-length item can be written.
+ */
+static bool
+__rec_row_zero_len(WT_SESSION_IMPL *session,
+ uint64_t txnid, wt_timestamp_t start_ts, wt_timestamp_t stop_ts)
+{
+ /* Before timestamps were stored on pages, it was always possible. */
+ if (!__wt_process.page_version_ts)
+ return (true);
+
+ /*
+ * The item must be globally visible because we're not writing anything
+ * on the page.
+ */
+ return ((start_ts == WT_TS_NONE ||
+ __wt_txn_visible_all(session, txnid, start_ts)) &&
+ stop_ts == WT_TS_MAX);
+}
+
+/*
* __rec_row_leaf --
* Reconcile a row-store leaf page.
*/
@@ -5110,8 +5139,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
p = tmpval->data;
size = tmpval->size;
}
- WT_ERR(__rec_cell_build_val(
- session, r, p, size, start_ts, stop_ts, 0));
+ WT_ERR(__rec_cell_build_val(session,
+ r, p, size, &start_ts, &stop_ts, 0));
dictionary = true;
} else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
/*
@@ -5157,7 +5186,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
*/
WT_ERR(__rec_cell_build_val(session, r,
"ovfl-unused", strlen("ovfl-unused"),
- start_ts, stop_ts, 0));
+ &start_ts, &stop_ts, 0));
} else {
val->buf.data = vpack->cell;
val->buf.size = __wt_cell_total_len(vpack);
@@ -5185,14 +5214,14 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
WT_ERR(__rec_cell_build_val(session, r,
cbt->iface.value.data,
cbt->iface.value.size,
- start_ts, stop_ts, 0));
+ &start_ts, &stop_ts, 0));
dictionary = true;
break;
case WT_UPDATE_STANDARD:
/* Take the value from the update. */
WT_ERR(__rec_cell_build_val(session, r,
upd->data, upd->size,
- start_ts, stop_ts, 0));
+ &start_ts, &stop_ts, 0));
dictionary = true;
break;
case WT_UPDATE_TOMBSTONE:
@@ -5338,20 +5367,16 @@ build:
session, r, key->len + val->len));
}
- /*
- * Copy the key/value pair onto the page. Zero-length items must
- * be globally visible as we're writing nothing to the page.
- */
+ /* Copy the key/value pair onto the page. */
__rec_image_copy(session, r, key);
if (val->len == 0 &&
- (!__wt_process.page_version_ts ||
- __wt_txn_visible_all(session, txnid, stop_ts)))
+ __rec_row_zero_len(session, txnid, start_ts, stop_ts))
r->any_empty_value = true;
else {
r->all_empty_value = false;
if (dictionary && btree->dictionary)
WT_ERR(__rec_dict_replace(
- session, r, start_ts, stop_ts, 0, val));
+ session, r, &start_ts, &stop_ts, 0, val));
__rec_image_copy(session, r, val);
}
__rec_addr_ts_update(r, start_ts, start_ts, stop_ts);
@@ -5438,12 +5463,12 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
session, cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL)));
WT_RET(__rec_cell_build_val(session, r,
cbt->iface.value.data, cbt->iface.value.size,
- start_ts, stop_ts, 0));
+ &start_ts, &stop_ts, 0));
break;
case WT_UPDATE_STANDARD:
/* Take the value from the update. */
WT_RET(__rec_cell_build_val(session, r,
- upd->data, upd->size, start_ts, stop_ts, 0));
+ upd->data, upd->size, &start_ts, &stop_ts, 0));
break;
case WT_UPDATE_TOMBSTONE:
continue;
@@ -5472,20 +5497,16 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
session, r, key->len + val->len));
}
- /*
- * Copy the key/value pair onto the page. Zero-length items must
- * be globally visible as we're writing nothing to the page.
- */
+ /* Copy the key/value pair onto the page. */
__rec_image_copy(session, r, key);
if (val->len == 0 &&
- (!__wt_process.page_version_ts &&
- __wt_txn_visible_all(session, txnid, stop_ts)))
+ __rec_row_zero_len(session, txnid, start_ts, stop_ts))
r->any_empty_value = true;
else {
r->all_empty_value = false;
if (btree->dictionary)
WT_RET(__rec_dict_replace(
- session, r, start_ts, stop_ts, 0, val));
+ session, r, &start_ts, &stop_ts, 0, val));
__rec_image_copy(session, r, val);
}
__rec_addr_ts_update(r, start_ts, start_ts, stop_ts);
@@ -5909,12 +5930,13 @@ __rec_cell_build_int_key(WT_SESSION_IMPL *session,
{
WT_BTREE *btree;
WT_KV *key;
+ wt_timestamp_t start_ts, stop_ts;
*is_ovflp = false;
btree = S2BT(session);
-
key = &r->k;
+ start_ts = stop_ts = WT_TS_NONE; /* Keys aren't timestamped. */
/* Copy the bytes into the "current" and key buffers. */
WT_RET(__wt_buf_set(session, r->cur, data, size));
@@ -5926,7 +5948,7 @@ __rec_cell_build_int_key(WT_SESSION_IMPL *session,
*is_ovflp = true;
return (__rec_cell_build_ovfl(session, r,
- key, WT_CELL_KEY_OVFL, WT_TS_NONE, WT_TS_NONE, 0));
+ key, WT_CELL_KEY_OVFL, &start_ts, &stop_ts, 0));
}
key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size);
@@ -5946,6 +5968,7 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
{
WT_BTREE *btree;
WT_KV *key;
+ wt_timestamp_t start_ts, stop_ts;
size_t pfx_max;
const uint8_t *a, *b;
uint8_t pfx;
@@ -5953,7 +5976,7 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
*is_ovflp = false;
btree = S2BT(session);
-
+ start_ts = stop_ts = WT_TS_NONE; /* Keys aren't timestamped. */
key = &r->k;
pfx = 0;
@@ -6026,7 +6049,7 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
*is_ovflp = true;
return (__rec_cell_build_ovfl(session, r, key,
- WT_CELL_KEY_OVFL, WT_TS_NONE, WT_TS_NONE, 0));
+ WT_CELL_KEY_OVFL, &start_ts, &stop_ts, 0));
}
return (
__rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
@@ -6104,7 +6127,7 @@ __rec_cell_build_addr(WT_SESSION_IMPL *session,
static int
__rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r,
const void *data, size_t size,
- wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle)
+ wt_timestamp_t *start_tsp, wt_timestamp_t *stop_tsp, uint64_t rle)
{
WT_BTREE *btree;
WT_KV *val;
@@ -6133,11 +6156,11 @@ __rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_STAT_DATA_INCR(session, rec_overflow_value);
return (__rec_cell_build_ovfl(session, r,
- val, WT_CELL_VALUE_OVFL, start_ts, stop_ts, rle));
+ val, WT_CELL_VALUE_OVFL, start_tsp, stop_tsp, rle));
}
}
- val->cell_len = __wt_cell_pack_data(
- session, &val->cell, start_ts, stop_ts, rle, val->buf.size);
+ val->cell_len = __wt_cell_pack_value(
+ session, &val->cell, start_tsp, stop_tsp, rle, val->buf.size);
val->len = val->cell_len + val->buf.size;
return (0);
@@ -6150,7 +6173,7 @@ __rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r,
static int
__rec_cell_build_ovfl(WT_SESSION_IMPL *session,
WT_RECONCILE *r, WT_KV *kv, uint8_t type,
- wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle)
+ wt_timestamp_t *start_tsp, wt_timestamp_t *stop_tsp, uint64_t rle)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -6210,7 +6233,7 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session,
/* Build the cell and return. */
kv->cell_len = __wt_cell_pack_ovfl(
- session, &kv->cell, type, start_ts, stop_ts, rle, kv->buf.size);
+ session, &kv->cell, type, start_tsp, stop_tsp, rle, kv->buf.size);
kv->len = kv->cell_len + kv->buf.size;
err: __wt_scr_free(session, &tmp);
@@ -6373,7 +6396,7 @@ __rec_dictionary_lookup(
hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
dp != NULL && dp->hash == hash; dp = dp->next[0]) {
- WT_RET(__wt_cell_pack_data_match(
+ WT_RET(__wt_cell_pack_value_match(
(WT_CELL *)((uint8_t *)r->cur_ptr->image.mem + dp->offset),
&val->cell, val->buf.data, &match));
if (match) {
diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c
index f96f053470d..21b7e6c305c 100644
--- a/src/third_party/wiredtiger/src/schema/schema_create.c
+++ b/src/third_party/wiredtiger/src/schema/schema_create.c
@@ -278,7 +278,7 @@ err: __wt_free(session, cgconf);
__wt_buf_free(session, &namebuf);
if (!tracked)
- WT_TRET(__wt_schema_release_table(session, table));
+ WT_TRET(__wt_schema_release_table(session, &table));
return (ret);
}
@@ -415,7 +415,7 @@ __create_index(WT_SESSION_IMPL *session,
WT_RET_MSG(session, ret,
"Can't create an index for table: %.*s",
(int)tlen, tablename);
- WT_RET(__wt_schema_release_table(session, table));
+ WT_RET(__wt_schema_release_table(session, &table));
if ((ret = __wt_schema_get_table(
session, tablename, tlen, true, 0, &table)) != 0)
@@ -565,7 +565,7 @@ err: __wt_free(session, idxconf);
__wt_buf_free(session, &fmt);
__wt_buf_free(session, &namebuf);
- WT_TRET(__wt_schema_release_table(session, table));
+ WT_TRET(__wt_schema_release_table(session, &table));
return (ret);
}
@@ -636,8 +636,7 @@ __create_table(WT_SESSION_IMPL *session,
table = NULL;
}
-err: if (table != NULL)
- WT_TRET(__wt_schema_release_table(session, table));
+err: WT_TRET(__wt_schema_release_table(session, &table));
__wt_free(session, cgname);
__wt_free(session, tableconf);
return (ret);
diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c
index 6b43e93643b..f52caff4a22 100644
--- a/src/third_party/wiredtiger/src/schema/schema_drop.c
+++ b/src/third_party/wiredtiger/src/schema/schema_drop.c
@@ -131,7 +131,7 @@ __drop_table(
*/
WT_ERR(__wt_schema_get_table_uri(session, uri, true,
WT_DHANDLE_EXCLUSIVE, &table));
- WT_ERR(__wt_schema_release_table(session, table));
+ WT_ERR(__wt_schema_release_table(session, &table));
WT_ERR(__wt_schema_get_table_uri(session, uri, true, 0, &table));
/* Drop the column groups. */
@@ -162,7 +162,7 @@ __drop_table(
}
/* Make sure the table data handle is closed. */
- WT_TRET(__wt_schema_release_table(session, table));
+ WT_ERR(__wt_schema_release_table(session, &table));
WT_ERR(__wt_schema_get_table_uri(
session, uri, true, WT_DHANDLE_EXCLUSIVE, &table));
F_SET(&table->iface, WT_DHANDLE_DISCARD);
@@ -176,8 +176,8 @@ __drop_table(
/* Remove the metadata entry (ignore missing items). */
WT_ERR(__wt_metadata_remove(session, uri));
-err: if (table != NULL && !tracked)
- WT_TRET(__wt_schema_release_table(session, table));
+err: if (!tracked)
+ WT_TRET(__wt_schema_release_table(session, &table));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_list.c b/src/third_party/wiredtiger/src/schema/schema_list.c
index 9d0aae8fb1d..bda188d0ff4 100644
--- a/src/third_party/wiredtiger/src/schema/schema_list.c
+++ b/src/third_party/wiredtiger/src/schema/schema_list.c
@@ -14,22 +14,21 @@
*/
int
__wt_schema_get_table_uri(WT_SESSION_IMPL *session,
- const char *uri, bool ok_incomplete, uint32_t flags,
- WT_TABLE **tablep)
+ const char *uri, bool ok_incomplete, uint32_t flags, WT_TABLE **tablep)
{
WT_DATA_HANDLE *saved_dhandle;
WT_DECL_RET;
WT_TABLE *table;
- saved_dhandle = session->dhandle;
-
*tablep = NULL;
+ saved_dhandle = session->dhandle;
+
WT_ERR(__wt_session_get_dhandle(session, uri, NULL, NULL, flags));
table = (WT_TABLE *)session->dhandle;
if (!ok_incomplete && !table->cg_complete) {
+ WT_ERR(__wt_session_release_dhandle(session));
ret = __wt_set_return(session, EINVAL);
- WT_TRET(__wt_session_release_dhandle(session));
WT_ERR_MSG(session, ret, "'%s' cannot be used "
"until all column groups are created",
table->iface.name);
@@ -68,9 +67,14 @@ err: __wt_scr_free(session, &namebuf);
* Release a table handle.
*/
int
-__wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table)
+__wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE **tablep)
{
WT_DECL_RET;
+ WT_TABLE *table;
+
+ if ((table = *tablep) == NULL)
+ return (0);
+ *tablep = NULL;
WT_WITH_DHANDLE(session, &table->iface,
ret = __wt_session_release_dhandle(session));
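
With the new signature, __wt_schema_release_table clears the caller's table pointer and is a no-op on NULL, so error paths can release unconditionally. A minimal caller-side sketch, assuming the usual WT_DECL_RET/WT_ERR/WT_TRET error-handling macros; the function name and URI are illustrative:

/*
 * Illustrative only: the table pointer starts NULL and is cleared on
 * release, so the error path may call release unconditionally.
 */
static int
with_table(WT_SESSION_IMPL *session)
{
	WT_DECL_RET;
	WT_TABLE *table;

	table = NULL;
	WT_ERR(__wt_schema_get_table_uri(
	    session, "table:example", true, 0, &table));
	/* ... operate on the table ... */

err:	WT_TRET(__wt_schema_release_table(session, &table));
	return (ret);
}
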
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
index 89d2232da64..3796b1502b3 100644
--- a/src/third_party/wiredtiger/src/schema/schema_open.c
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -525,12 +525,12 @@ __wt_schema_get_colgroup(WT_SESSION_IMPL *session,
*tablep = table;
else
WT_RET(
- __wt_schema_release_table(session, table));
+ __wt_schema_release_table(session, &table));
return (0);
}
}
- WT_RET(__wt_schema_release_table(session, table));
+ WT_RET(__wt_schema_release_table(session, &table));
if (quiet)
WT_RET(ENOENT);
WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
@@ -576,7 +576,7 @@ __wt_schema_get_index(WT_SESSION_IMPL *session,
done: if (invalidate)
table->idx_complete = false;
-err: WT_TRET(__wt_schema_release_table(session, table));
+err: WT_TRET(__wt_schema_release_table(session, &table));
WT_RET(ret);
if (*indexp != NULL)
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
index 89d2756bdc2..9dbc1e0fce9 100644
--- a/src/third_party/wiredtiger/src/schema/schema_rename.c
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -236,7 +236,7 @@ __rename_table(WT_SESSION_IMPL *session,
table->indices[i]->name, cfg));
/* Make sure the table data handle is closed. */
- WT_TRET(__wt_schema_release_table(session, table));
+ WT_ERR(__wt_schema_release_table(session, &table));
WT_ERR(__wt_schema_get_table_uri(
session, uri, true, WT_DHANDLE_EXCLUSIVE, &table));
F_SET(&table->iface, WT_DHANDLE_DISCARD);
@@ -251,7 +251,7 @@ __rename_table(WT_SESSION_IMPL *session,
ret = __metadata_rename(session, uri, newuri);
err: if (!tracked)
- WT_TRET(__wt_schema_release_table(session, table));
+ WT_TRET(__wt_schema_release_table(session, &table));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c
index 2999f92d7b6..e8535513d01 100644
--- a/src/third_party/wiredtiger/src/schema/schema_stat.c
+++ b/src/third_party/wiredtiger/src/schema/schema_stat.c
@@ -184,7 +184,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
__wt_curstat_dsrc_final(cst);
-err: WT_TRET(__wt_schema_release_table(session, table));
+err: WT_TRET(__wt_schema_release_table(session, &table));
__wt_scr_free(session, &buf);
return (ret);
diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c
index 40511b0572a..0c60a327d9c 100644
--- a/src/third_party/wiredtiger/src/schema/schema_truncate.c
+++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c
@@ -34,7 +34,7 @@ __truncate_table(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
WT_ERR(__wt_schema_truncate(
session, table->indices[i]->source, cfg));
-err: WT_TRET(__wt_schema_release_table(session, table));
+err: WT_TRET(__wt_schema_release_table(session, &table));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c
index eff8e7a83e1..a9362c228fe 100644
--- a/src/third_party/wiredtiger/src/schema/schema_worker.c
+++ b/src/third_party/wiredtiger/src/schema/schema_worker.c
@@ -152,7 +152,6 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
} else
WT_ERR(__wt_bad_object_type(session, uri));
-err: if (table != NULL)
- WT_TRET(__wt_schema_release_table(session, table));
+err: WT_TRET(__wt_schema_release_table(session, &table));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index e90d37ce6cb..379ec69c77d 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -1016,7 +1016,11 @@ __session_reset(WT_SESSION *wt_session)
WT_TRET(__wt_session_reset_cursors(session, true));
- WT_TRET(__wt_session_cursor_cache_sweep(session));
+ if (--session->cursor_sweep_countdown == 0) {
+ session->cursor_sweep_countdown =
+ WT_SESSION_CURSOR_SWEEP_COUNTDOWN;
+ WT_TRET(__wt_session_cursor_cache_sweep(session));
+ }
/* Release common session resources. */
WT_TRET(__wt_session_release_resources(session));
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 13fd1ee1233..f3d9f762750 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -576,17 +576,21 @@ __wt_txn_release(WT_SESSION_IMPL *session)
}
/*
- * __txn_commit_timestamp_validate --
- * Validate that timestamp provided to commit is legal.
+ * __txn_commit_timestamps_validate --
+ * Validate that timestamps provided to commit are legal.
*/
static inline int
-__txn_commit_timestamp_validate(WT_SESSION_IMPL *session)
+__txn_commit_timestamps_validate(WT_SESSION_IMPL *session)
{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
WT_TXN *txn;
WT_TXN_OP *op;
WT_UPDATE *upd;
wt_timestamp_t op_timestamp;
u_int i;
+ const char *open_cursor_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
bool op_zero_ts, upd_zero_ts;
txn = &session->txn;
@@ -604,11 +608,21 @@ __txn_commit_timestamp_validate(WT_SESSION_IMPL *session)
txn->mod_count != 0)
WT_RET_MSG(session, EINVAL, "no commit_timestamp required and "
"timestamp set on this transaction");
+ if (F_ISSET(txn, WT_TXN_TS_DURABLE_ALWAYS) &&
+ !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) &&
+ txn->mod_count != 0)
+ WT_RET_MSG(session, EINVAL, "durable_timestamp required and "
+ "none set on this transaction");
+ if (F_ISSET(txn, WT_TXN_TS_DURABLE_NEVER) &&
+ F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) &&
+ txn->mod_count != 0)
+ WT_RET_MSG(session, EINVAL, "no durable_timestamp required and "
+ "durable timestamp set on this transaction");
/*
* If we're not doing any key consistency checking, we're done.
*/
- if (!F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS))
+ if (!F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS | WT_TXN_TS_DURABLE_KEYS))
return (0);
/*
@@ -619,10 +633,35 @@ __txn_commit_timestamp_validate(WT_SESSION_IMPL *session)
if (op->type == WT_TXN_OP_BASIC_COL ||
op->type == WT_TXN_OP_BASIC_ROW) {
/*
+			 * Search for prepared updates so that they are
+			 * restored if they were moved to lookaside.
+ */
+ if (F_ISSET(txn, WT_TXN_PREPARE)) {
+ WT_RET(__wt_open_cursor(session,
+ op->btree->dhandle->name, NULL,
+ open_cursor_cfg, &cursor));
+ F_CLR(txn, WT_TXN_PREPARE);
+ if (op->type == WT_TXN_OP_BASIC_ROW)
+ __wt_cursor_set_raw_key(
+ cursor, &op->u.op_row.key);
+ else
+ ((WT_CURSOR_BTREE*)cursor)->iface.recno
+ = op->u.op_col.recno;
+ F_SET(txn, WT_TXN_PREPARE);
+ WT_WITH_BTREE(session, op->btree,
+ ret = __wt_btcur_search_uncommitted(
+ (WT_CURSOR_BTREE *)cursor, &upd));
+ WT_TRET(cursor->close(cursor));
+ if (ret != 0)
+ WT_RET_MSG(session, EINVAL,
+ "prepared update restore failed");
+ op->u.op_upd = upd;
+ } else
+ upd = op->u.op_upd->next;
+ /*
* Skip over any aborted update structures or ones
* from our own transaction.
*/
- upd = op->u.op_upd->next;
while (upd != NULL && (upd->txnid == WT_TXN_ABORTED ||
upd->txnid == txn->id))
upd = upd->next;
@@ -662,9 +701,14 @@ __txn_commit_timestamp_validate(WT_SESSION_IMPL *session)
*/
if (op_timestamp == WT_TS_NONE)
op_timestamp = txn->commit_timestamp;
- if (op_timestamp < upd->start_ts)
+ if (F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS) &&
+ op_timestamp < upd->start_ts)
WT_RET_MSG(session, EINVAL,
- "out of order timestamps");
+ "out of order commit timestamps");
+ if (F_ISSET(txn, WT_TXN_TS_DURABLE_KEYS) &&
+ txn->durable_timestamp < upd->durable_ts)
+ WT_RET_MSG(session, EINVAL,
+ "out of order durable timestamps");
}
return (0);
}
@@ -699,6 +743,8 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
txn->mod_count == 0);
readonly = txn->mod_count == 0;
+
+ prepare = F_ISSET(txn, WT_TXN_PREPARE);
/* Look for a commit timestamp. */
WT_ERR(
__wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
@@ -709,45 +755,43 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
* than stable timestamp.
*/
WT_ERR(__wt_timestamp_validate(
- session, "commit", ts, &cval, false));
+ session, "commit", ts, &cval, !prepare));
txn->commit_timestamp = ts;
__wt_txn_set_commit_timestamp(session);
+ if (!prepare)
+ txn->durable_timestamp = txn->commit_timestamp;
}
- prepare = F_ISSET(txn, WT_TXN_PREPARE);
if (prepare && !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
WT_ERR_MSG(session, EINVAL,
"commit_timestamp is required for a prepared transaction");
- /* Durable timestamp is required for a prepared transaction. */
- if (prepare) {
- WT_ERR(__wt_config_gets_def(
- session, cfg, "durable_timestamp", 0, &cval));
- if (cval.len != 0) {
- WT_ERR(__wt_txn_parse_timestamp(
- session, "durable", &ts, &cval));
- WT_ERR(__wt_timestamp_validate(
- session, "durable", ts, &cval, true));
- txn->durable_timestamp = ts;
- } else
- /*
- * If durable timestamp is not given, commit timestamp
- * will be considered as durable timestamp.
- * TODO : error if durable timestamp is not given.
- */
- txn->durable_timestamp = txn->commit_timestamp;
-
- } else
- txn->durable_timestamp = txn->commit_timestamp;
-
- /* Durable timestamp should be later than stable timestamp. */
- if (cval.len != 0)
+ /*
+	 * A durable timestamp is required for a prepared transaction: if
+	 * none is given, the commit timestamp is used as the durable
+	 * timestamp. We don't flag an error when the durable timestamp is
+	 * omitted for a prepared transaction, but we do flag an error if a
+	 * durable timestamp is supplied for a non-prepared
+	 * transaction.
+ */
+ WT_ERR(__wt_config_gets_def(
+ session, cfg, "durable_timestamp", 0, &cval));
+ if (cval.len != 0) {
+ if (!prepare)
+ WT_ERR_MSG(session, EINVAL,
+ "durable_timestamp should not be given for "
+ "non-prepared transaction");
+
+ WT_ERR(__wt_txn_parse_timestamp(
+ session, "durable", &ts, &cval));
+ /* Durable timestamp should be later than stable timestamp. */
+ F_SET(txn, WT_TXN_HAS_TS_DURABLE);
+ txn->durable_timestamp = ts;
WT_ERR(__wt_timestamp_validate(
- session, "durable", txn->durable_timestamp, &cval, true));
-
- WT_ERR(__txn_commit_timestamp_validate(session));
+ session, "durable", ts, &cval, true));
+ }
- /* TODO : assert durable_timestamp. */
+ WT_ERR(__txn_commit_timestamps_validate(session));
/*
* The default sync setting is inherited from the connection, but can
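
A hedged application-level sketch of the durable-timestamp rules enforced above: a durable timestamp may be supplied only for a prepared transaction and must not precede the commit timestamp. The WT_SESSION transaction calls are public API; the timestamp values are arbitrary hex examples and error handling is compressed:

/*
 * Illustrative only: prepare, then commit with commit and durable
 * timestamps (0x2a <= 0x2b <= 0x2c).
 */
static int
prepared_commit(WT_SESSION *session)
{
	int ret;

	if ((ret = session->begin_transaction(session, NULL)) != 0)
		return (ret);
	/* ... perform updates ... */
	if ((ret = session->prepare_transaction(
	    session, "prepare_timestamp=2a")) != 0)
		return (ret);
	return (session->commit_transaction(session,
	    "commit_timestamp=2b,durable_timestamp=2c"));
}
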
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 3a111d271ca..a1c700661ce 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -467,7 +467,8 @@ __wt_checkpoint_progress(WT_SESSION_IMPL *session, bool closing)
time_diff = WT_TIMEDIFF_SEC(cur_time,
conn->ckpt_timer_start);
- if (closing || (time_diff / 20) > conn->ckpt_progress_msg_count) {
+ if (closing || (time_diff / WT_PROGRESS_MSG_PERIOD) >
+ conn->ckpt_progress_msg_count) {
__wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS,
"Checkpoint %s for %" PRIu64
" seconds and wrote: %" PRIu64 " pages (%" PRIu64 " MB)",
@@ -1537,7 +1538,7 @@ __checkpoint_mark_skip(
/*
* __wt_checkpoint_tree_reconcile_update --
- * Update a checkpoint based on reconciliation's results.
+ * Update a checkpoint based on reconciliation results.
*/
void
__wt_checkpoint_tree_reconcile_update(
@@ -1554,7 +1555,7 @@ __wt_checkpoint_tree_reconcile_update(
/*
* Reconciliation just wrote a checkpoint, everything has been written.
- * Update the checkpoint with reconciliation's information. The reason
+ * Update the checkpoint with reconciliation information. The reason
* for this function is the reconciliation code just passes through the
* btree structure's checkpoint array, it doesn't know any more.
*/
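
The checkpoint-progress hunk earlier in this file divides the elapsed time by WT_PROGRESS_MSG_PERIOD (20 seconds, defined in misc.h above) and compares against the number of messages already printed, so at most one message is emitted per period. A standalone sketch of the same throttling pattern, with illustrative names:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors WT_PROGRESS_MSG_PERIOD: seconds between progress messages. */
#define PROGRESS_MSG_PERIOD	20

/* Emit at most one message per PROGRESS_MSG_PERIOD seconds of elapsed time. */
static void
report_progress(uint64_t elapsed_sec, uint64_t *msg_countp)
{
	if (elapsed_sec / PROGRESS_MSG_PERIOD > *msg_countp) {
		++*msg_countp;
		(void)printf("operation running for %" PRIu64 " seconds\n",
		    elapsed_sec);
	}
}
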
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index 66ec0536d2b..de474dba222 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -591,7 +591,7 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name,
WT_TXN *txn = &session->txn;
WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
wt_timestamp_t oldest_ts, stable_ts;
- char ts_string[WT_TS_INT_STRING_SIZE];
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
bool has_oldest_ts, has_stable_ts;
/*
@@ -611,17 +611,17 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name,
if (has_oldest_ts && ts < oldest_ts) {
__wt_timestamp_to_string(
- oldest_ts, ts_string, sizeof(ts_string));
+ oldest_ts, ts_string[0], sizeof(ts_string[0]));
WT_RET_MSG(session, EINVAL,
"%s timestamp %.*s older than oldest timestamp %s",
- name, (int)cval->len, cval->str, ts_string);
+ name, (int)cval->len, cval->str, ts_string[0]);
}
if (compare_stable && has_stable_ts && ts < stable_ts) {
__wt_timestamp_to_string(
- stable_ts, ts_string, sizeof(ts_string));
+ stable_ts, ts_string[0], sizeof(ts_string[0]));
WT_RET_MSG(session, EINVAL,
"%s timestamp %.*s older than stable timestamp %s",
- name, (int)cval->len, cval->str, ts_string);
+ name, (int)cval->len, cval->str, ts_string[0]);
}
/*
@@ -631,12 +631,12 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name,
*/
if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
ts < txn->first_commit_timestamp) {
- __wt_timestamp_to_string(
- txn->first_commit_timestamp, ts_string, sizeof(ts_string));
+ __wt_timestamp_to_string(txn->first_commit_timestamp,
+ ts_string[0], sizeof(ts_string[0]));
WT_RET_MSG(session, EINVAL,
"%s timestamp %.*s older than the first "
"commit timestamp %s for this transaction",
- name, (int)cval->len, cval->str, ts_string);
+ name, (int)cval->len, cval->str, ts_string[0]);
}
/*
@@ -646,11 +646,23 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name,
*/
if (F_ISSET(txn, WT_TXN_PREPARE) && ts < txn->prepare_timestamp) {
__wt_timestamp_to_string(
- txn->prepare_timestamp, ts_string, sizeof(ts_string));
+ txn->prepare_timestamp, ts_string[0], sizeof(ts_string[0]));
WT_RET_MSG(session, EINVAL,
"%s timestamp %.*s older than the prepare timestamp %s "
"for this transaction",
- name, (int)cval->len, cval->str, ts_string);
+ name, (int)cval->len, cval->str, ts_string[0]);
+ }
+
+ if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) &&
+ txn->durable_timestamp < txn->commit_timestamp) {
+ __wt_timestamp_to_string(
+ txn->durable_timestamp, ts_string[0], sizeof(ts_string[0]));
+ __wt_timestamp_to_string(
+ txn->commit_timestamp, ts_string[1], sizeof(ts_string[1]));
+ WT_RET_MSG(session, EINVAL,
+ "%s timestamp %s older than the commit timestamp %s "
+ "for this transaction",
+ name, ts_string[0], ts_string[1]);
}
return (0);
@@ -705,6 +717,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_txn_parse_timestamp(
session, "durable", &ts, &cval));
txn->durable_timestamp = ts;
+ F_SET(txn, WT_TXN_HAS_TS_DURABLE);
prepare_allowed = true;
}
}
diff --git a/src/third_party/wiredtiger/src/utilities/util.h b/src/third_party/wiredtiger/src/utilities/util.h
index 05d65440958..be58f8eae0e 100644
--- a/src/third_party/wiredtiger/src/utilities/util.h
+++ b/src/third_party/wiredtiger/src/utilities/util.h
@@ -32,7 +32,7 @@ int util_cerr(WT_CURSOR *, const char *, int);
int util_compact(WT_SESSION *, int, char *[]);
void util_copyright(void);
int util_create(WT_SESSION *, int, char *[]);
-int util_downgrade(WT_SESSION *, WT_CONNECTION *, int, char *[]);
+int util_downgrade(WT_SESSION *, int, char *[]);
int util_drop(WT_SESSION *, int, char *[]);
int util_dump(WT_SESSION *, int, char *[]);
int util_err(WT_SESSION *, int, const char *, ...)
diff --git a/src/third_party/wiredtiger/src/utilities/util_downgrade.c b/src/third_party/wiredtiger/src/utilities/util_downgrade.c
index e03544f8422..ce780c8614c 100644
--- a/src/third_party/wiredtiger/src/utilities/util_downgrade.c
+++ b/src/third_party/wiredtiger/src/utilities/util_downgrade.c
@@ -11,8 +11,9 @@
static int usage(void);
int
-util_downgrade(WT_SESSION *session, WT_CONNECTION *conn, int argc, char *argv[])
+util_downgrade(WT_SESSION *session, int argc, char *argv[])
{
+ WT_CONNECTION *conn;
WT_DECL_RET;
int ch;
char config_str[128], *release;
@@ -39,8 +40,9 @@ util_downgrade(WT_SESSION *session, WT_CONNECTION *conn, int argc, char *argv[])
if ((ret = __wt_snprintf(config_str, sizeof(config_str),
"compatibility=(release=%s)", release)) != 0)
return (util_err(session, ret, NULL));
+ conn = session->connection;
if ((ret = conn->reconfigure(conn, config_str)) != 0)
- return (util_err(session, ret, "conn.downgrade"));
+ return (util_err(session, ret, "WT_CONNECTION.downgrade"));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c
index de4f31fcf23..9b94acdc9ed 100644
--- a/src/third_party/wiredtiger/src/utilities/util_main.c
+++ b/src/third_party/wiredtiger/src/utilities/util_main.c
@@ -67,11 +67,10 @@ main(int argc, char *argv[])
WT_DECL_RET;
WT_SESSION *session;
size_t len;
- int (*cfunc)(WT_SESSION *, WT_CONNECTION *, int, char *[]);
int ch, major_v, minor_v, tret, (*func)(WT_SESSION *, int, char *[]);
const char *cmd_config, *config, *p1, *p2, *p3, *rec_config;
char *p, *secretkey;
- bool logoff, needconn, recover, salvage;
+ bool logoff, recover, salvage;
conn = NULL;
p = NULL;
@@ -83,8 +82,6 @@ main(int argc, char *argv[])
++progname;
command = "";
- needconn = false;
-
/* Check the version against the library build. */
(void)wiredtiger_version(&major_v, & minor_v, NULL);
if (major_v != WIREDTIGER_VERSION_MAJOR ||
@@ -166,7 +163,6 @@ main(int argc, char *argv[])
__wt_optreset = __wt_optind = 1;
func = NULL;
- cfunc = NULL;
switch (command[0]) {
case 'a':
if (strcmp(command, "alter") == 0)
@@ -188,10 +184,9 @@ main(int argc, char *argv[])
}
break;
case 'd':
- if (strcmp(command, "downgrade") == 0) {
- cfunc = util_downgrade;
- needconn = true;
- } else if (strcmp(command, "drop") == 0)
+ if (strcmp(command, "downgrade") == 0)
+ func = util_downgrade;
+ else if (strcmp(command, "drop") == 0)
func = util_drop;
else if (strcmp(command, "dump") == 0)
func = util_dump;
@@ -248,7 +243,7 @@ main(int argc, char *argv[])
default:
break;
}
- if (func == NULL && cfunc == NULL) {
+ if (func == NULL) {
usage();
goto err;
}
@@ -293,10 +288,7 @@ main(int argc, char *argv[])
}
/* Call the function. */
- if (needconn)
- ret = cfunc(session, conn, argc, argv);
- else
- ret = func(session, argc, argv);
+ ret = func(session, argc, argv);
if (0) {
err: ret = 1;
diff --git a/src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c b/src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c
index 614ee01486a..fd734b1a4a2 100644
--- a/src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt4156_metadata_salvage/main.c
@@ -188,10 +188,10 @@ create_data(TABLE_INFO *t)
/*
* corrupt_metadata --
- * Corrupt the metadata by scribbling on the "corrupt" URI string.
+ * Corrupt the file by scribbling on the provided URI string.
*/
static void
-corrupt_metadata(void)
+corrupt_file(const char *file_name, const char *uri)
{
struct stat sb;
FILE *fp;
@@ -207,7 +207,7 @@ corrupt_metadata(void)
* when WiredTiger next reads it.
*/
testutil_check(__wt_snprintf(
- path, sizeof(path), "%s/%s", home, WT_METAFILE));
+ path, sizeof(path), "%s/%s", home, file_name));
if ((fp = fopen(path, "r+")) == NULL)
testutil_die(errno, "fopen: %s", path);
testutil_check(fstat(fileno(fp), &sb));
@@ -219,7 +219,7 @@ corrupt_metadata(void)
/*
* Corrupt all occurrences of the string in the file.
*/
- while ((corrupt = byte_str(buf, meta_size, CORRUPT)) != NULL) {
+ while ((corrupt = byte_str(buf, meta_size, uri)) != NULL) {
corrupted = true;
testutil_assert(*(char *)corrupt != 'X');
*(char *)corrupt = 'X';
@@ -715,7 +715,7 @@ main(int argc, char *argv[])
* Damage/corrupt WiredTiger.wt.
*/
printf("corrupt metadata\n");
- corrupt_metadata();
+ corrupt_file(WT_METAFILE, CORRUPT);
testutil_check(__wt_snprintf(buf, sizeof(buf),
"cp -p %s/WiredTiger.wt ./%s.SAVE/WiredTiger.wt.CORRUPT",
home, home));
@@ -724,6 +724,19 @@ main(int argc, char *argv[])
testutil_die(ret, "system: %s", buf);
run_all_verification(NULL, &table_data[0]);
+ /*
+ * Damage/corrupt WiredTiger.turtle.
+ */
+ printf("corrupt turtle\n");
+ corrupt_file(WT_METADATA_TURTLE, WT_METAFILE_URI);
+ testutil_check(__wt_snprintf(buf, sizeof(buf),
+ "cp -p %s/WiredTiger.turtle ./%s.SAVE/WiredTiger.turtle.CORRUPT",
+ home, home));
+ printf("copy: %s\n", buf);
+ if ((ret = system(buf)) < 0)
+ testutil_die(ret, "system: %s", buf);
+ run_all_verification(NULL, &table_data[0]);
+
out_of_sync(&table_data[0]);
/*
diff --git a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
index 5c8b965ceb5..a551cbf29de 100644
--- a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
@@ -252,7 +252,7 @@ sweep_stats(void)
}
static void
-run(bool config_cache)
+runone(bool config_cache)
{
pthread_t idlist[1000];
u_int i, j;
@@ -309,8 +309,8 @@ run(bool config_cache)
testutil_check(conn->close(conn, NULL));
}
-int
-main(int argc, char *argv[])
+static int
+run(int argc, char *argv[])
{
static const struct {
u_int workers;
@@ -338,12 +338,6 @@ main(int argc, char *argv[])
u_int i, n;
int ch;
- /*
- * Bypass this test for valgrind. It has a fairly low thread limit.
- */
- if (testutil_is_flag_set("TESTUTIL_BYPASS_VALGRIND"))
- return (EXIT_SUCCESS);
-
(void)testutil_set_progname(argv);
__wt_random_init_seed(NULL, &rnd);
@@ -365,10 +359,34 @@ main(int argc, char *argv[])
n = __wt_random(&rnd) % WT_ELEMENTS(runs);
workers = runs[n].workers;
uris = runs[n].uris;
- run(runs[n].cache_cursors);
+ runone(runs[n].cache_cursors);
}
uri_teardown();
return (EXIT_SUCCESS);
}
+
+int
+main(int argc, char *argv[])
+{
+ bool skip;
+
+ skip = false;
+
+ /*
+ * Bypass this test for valgrind. It has a fairly low thread limit.
+ */
+ if (testutil_is_flag_set("TESTUTIL_BYPASS_VALGRIND"))
+ skip = true;
+
+ /*
+ * Bypass this test for OS X. We periodically see it hang without error,
+ * leaving a zombie process that never exits (WT-4613, BUILD-7616).
+ */
+#if defined(__APPLE__)
+ skip = true;
+#endif
+
+ return (skip ? EXIT_SUCCESS : run(argc, argv));
+}
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index f439c5bd2bb..d19963d75a5 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -778,9 +778,6 @@ config_pct(void)
static void
config_prepare(void)
{
- /* WT-4537: REMOVE when that merges */
- config_single("prepare=off", 0);
-
/*
* We cannot prepare a transaction if logging is configured, or if
* timestamps are not configured.
diff --git a/src/third_party/wiredtiger/test/suite/test_assert04.py b/src/third_party/wiredtiger/test/suite/test_assert04.py
index 57ee267ab01..59aafd2d4cc 100644
--- a/src/third_party/wiredtiger/test/suite/test_assert04.py
+++ b/src/third_party/wiredtiger/test/suite/test_assert04.py
@@ -124,6 +124,9 @@ class test_assert04(wttest.WiredTigerTestCase, suite_subprocess):
c.close()
# Detect using a timestamp on the non-timestamp key.
+        # We must first use a non-timestamped operation on the key
+        # to violate the key consistency condition in the
+        # following transaction.
c = self.session.open_cursor(uri)
self.session.begin_transaction()
c['key_nots'] = 'value_nots3'
diff --git a/src/third_party/wiredtiger/test/suite/test_assert05.py b/src/third_party/wiredtiger/test/suite/test_assert05.py
new file mode 100644
index 00000000000..b709e8df8dd
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_assert05.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_assert05.py
+# Timestamps: assert durable timestamp settings
+#
+
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+
+def timestamp_str(t):
+ return '%x' % t
+
+class test_assert05(wttest.WiredTigerTestCase, suite_subprocess):
+ base = 'assert05'
+ base_uri = 'file:' + base
+ uri_always = base_uri + '.always.wt'
+ uri_def = base_uri + '.def.wt'
+ uri_never = base_uri + '.never.wt'
+ uri_none = base_uri + '.none.wt'
+ cfg = 'key_format=S,value_format=S,'
+ cfg_always = 'assert=(durable_timestamp=always)'
+ cfg_def = ''
+ cfg_never = 'assert=(durable_timestamp=never)'
+ cfg_none = 'assert=(durable_timestamp=none)'
+
+ count = 1
+ #
+    # Commit a k/v pair, making sure an error is detected when expected,
+    # both with and without a durable timestamp.
+ #
+ def insert_check(self, uri, use_ts):
+ c = self.session.open_cursor(uri)
+ key = 'key' + str(self.count)
+ val = 'value' + str(self.count)
+
+ # Commit with a timestamp
+ self.session.begin_transaction()
+ c[key] = val
+ self.session.prepare_transaction(
+ 'prepare_timestamp=' + timestamp_str(self.count))
+ self.session.timestamp_transaction(
+ 'commit_timestamp=' + timestamp_str(self.count))
+ self.session.timestamp_transaction(
+ 'durable_timestamp=' + timestamp_str(self.count))
+ # All settings other than never should commit successfully
+ if (use_ts != 'never'):
+ self.session.commit_transaction()
+ else:
+ msg = "/timestamp set on this transaction/"
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:self.assertEquals(self.session.commit_transaction(),
+ 0), msg)
+ c.close()
+ self.count += 1
+
+ # Commit without a timestamp
+ key = 'key' + str(self.count)
+ val = 'value' + str(self.count)
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c[key] = val
+ self.session.prepare_transaction(
+ 'prepare_timestamp=' + timestamp_str(self.count))
+ self.session.timestamp_transaction(
+ 'commit_timestamp=' + timestamp_str(self.count))
+ # All settings other than always should commit successfully
+ if (use_ts != 'always'):
+ self.session.commit_transaction()
+ else:
+ msg = "/none set on this transaction/"
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:self.assertEquals(self.session.commit_transaction(),
+ 0), msg)
+ self.count += 1
+ c.close()
+
+ def test_durable_timestamp(self):
+ #if not wiredtiger.diagnostic_build():
+ # self.skipTest('requires a diagnostic build')
+
+ # Create a data item at a timestamp
+ self.session.create(self.uri_always, self.cfg + self.cfg_always)
+ self.session.create(self.uri_def, self.cfg + self.cfg_def)
+ self.session.create(self.uri_never, self.cfg + self.cfg_never)
+ self.session.create(self.uri_none, self.cfg + self.cfg_none)
+
+ # Check inserting into each table
+ self.insert_check(self.uri_always, 'always')
+ self.insert_check(self.uri_def, 'none')
+ self.insert_check(self.uri_never, 'never')
+ self.insert_check(self.uri_none, 'none')
+
+if __name__ == '__main__':
+ wttest.run()
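For readers unfamiliar with the option exercised above, a minimal standalone sketch of the new assert=(durable_timestamp=...) setting follows. This is an illustration only, not part of the imported change; it assumes a local build of the wiredtiger Python bindings and an existing empty directory named WT_HOME, and the table and key names are hypothetical.

    # Hypothetical sketch: a table created with durable_timestamp=always
    # refuses to commit a prepared transaction unless a durable timestamp
    # is supplied, mirroring the checks exercised by test_assert05.
    import wiredtiger

    conn = wiredtiger.wiredtiger_open('WT_HOME', 'create')
    session = conn.open_session()
    uri = 'file:durable_demo.wt'
    session.create(uri, 'key_format=S,value_format=S,'
                        'assert=(durable_timestamp=always)')

    c = session.open_cursor(uri)
    session.begin_transaction()
    c['key1'] = 'value1'
    session.prepare_transaction('prepare_timestamp=' + '%x' % 10)
    session.timestamp_transaction('commit_timestamp=' + '%x' % 10)
    session.timestamp_transaction('durable_timestamp=' + '%x' % 10)
    # Omitting the durable timestamp above would make this commit raise
    # wiredtiger.WiredTigerError under the 'always' setting.
    session.commit_transaction()
    c.close()
    conn.close()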
diff --git a/src/third_party/wiredtiger/test/suite/test_assert06.py b/src/third_party/wiredtiger/test/suite/test_assert06.py
new file mode 100644
index 00000000000..828968441a9
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_assert06.py
@@ -0,0 +1,388 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_assert06.py
+# Timestamps: verify key consistent setting for durable timestamps
+#
+
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+
+def timestamp_str(t):
+ return '%x' % t
+
+class test_assert06(wttest.WiredTigerTestCase, suite_subprocess):
+ def apply_timestamps(self, timestamp):
+ self.session.prepare_transaction(
+ 'prepare_timestamp=' + timestamp_str(timestamp))
+ self.session.timestamp_transaction(
+ 'commit_timestamp=' + timestamp_str(timestamp))
+ self.session.timestamp_transaction(
+ 'durable_timestamp=' + timestamp_str(timestamp))
+
+ def test_timestamp_alter(self):
+ base = 'assert06'
+ uri = 'file:' + base
+ cfg_on = 'assert=(durable_timestamp=key_consistent)'
+ cfg_off = 'assert=(durable_timestamp=none)'
+ msg_ooo='/out of order/'
+ msg_usage='/used inconsistently/'
+
+ # Create the table without the key consistency checking turned on.
+ # Create a few items breaking the rules. Then alter the setting and
+ # verify the inconsistent usage is detected.
+ self.session.create(uri, 'key_format=S,value_format=S')
+ # Insert a data item at timestamp 2.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts1'] = 'value2'
+ self.apply_timestamps(2)
+ self.session.commit_transaction()
+ c.close()
+
+ # Modify the data item at timestamp 1.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts1'] = 'value1'
+ self.apply_timestamps(1)
+ self.session.commit_transaction()
+ c.close()
+
+ # Insert a non-timestamped item. Then modify with a timestamp. And
+ # again modify without a timestamp.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value_nots'
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value2'
+ self.apply_timestamps(2)
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value_nots2'
+ self.session.commit_transaction()
+ c.close()
+
+ # We must move the oldest timestamp forward in order to alter.
+        # Otherwise the alter, which must close the file, will fail with EBUSY.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(2))
+
+ # Now alter the setting and make sure we detect incorrect usage.
+ self.session.alter(uri, cfg_on)
+
+ # Detect decreasing timestamp.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts1'] = 'value5'
+ self.apply_timestamps(5)
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts1'] = 'value4'
+ self.apply_timestamps(4)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_ooo)
+ c.close()
+
+ # Detect not using a timestamp.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts1'] = 'value_nots3'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_usage)
+ c.close()
+
+ # Detect using a timestamp on the non-timestamp key.
+        # We must first use a non-timestamped operation on the key
+ # in order to violate the key consistency condition in the
+ # following transaction.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value_nots3'
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value3'
+ self.apply_timestamps(3)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_usage)
+ c.close()
+ self.session.checkpoint()
+
+ c = self.session.open_cursor(uri)
+ self.assertEquals(c['key_ts1'], 'value5')
+ self.assertEquals(c['key_nots'], 'value_nots3')
+ c.close()
+
+ # Test to make sure that key consistency can be turned off
+ # after turning it on.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(5))
+ self.session.alter(uri, cfg_off)
+
+        # Detection is off, so we can successfully change the same key with
+        # and without a timestamp.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value_nots4'
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value6'
+ self.apply_timestamps(6)
+ self.session.commit_transaction()
+ c.close()
+
+ def test_timestamp_usage(self):
+ base = 'assert06'
+ uri = 'file:' + base
+ msg_ooo='/out of order/'
+ msg_usage='/used inconsistently/'
+
+ # Create the table with the key consistency checking turned on.
+        # That checking will verify that any individual key is always or
+        # never used with a timestamp, and that if it is used with a
+        # timestamp, the timestamps are in increasing order for that key.
+ self.session.create(uri, 'key_format=S,value_format=S,assert=(durable_timestamp=key_consistent)')
+
+ # Insert a data item at timestamp 2.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts1'] = 'value2'
+ self.apply_timestamps(2)
+ self.session.commit_transaction()
+ c.close()
+
+ # Modify the data item at timestamp 1. We should detect it is wrong.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts1'] = 'value1'
+ self.apply_timestamps(1)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_ooo)
+ c.close()
+
+ # Make sure we can successfully add a different key at timestamp 1.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts2'] = 'value1'
+ self.apply_timestamps(1)
+ self.session.commit_transaction()
+ c.close()
+
+ #
+ # Insert key_ts3 at timestamp 10 and key_ts4 at 15.
+ # Then modify both keys in one transaction at timestamp 13.
+ # We should not be allowed to modify the one from 15.
+ # So the whole transaction should fail.
+ #
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts3'] = 'value10'
+ self.apply_timestamps(10)
+ self.session.commit_transaction()
+ self.session.begin_transaction()
+ c['key_ts4'] = 'value15'
+ self.apply_timestamps(15)
+ self.session.commit_transaction()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts3'] = 'value13'
+ c['key_ts4'] = 'value13'
+ self.apply_timestamps(13)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_ooo)
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.assertEquals(c['key_ts3'], 'value10')
+ self.assertEquals(c['key_ts4'], 'value15')
+ c.close()
+
+ #
+        # Separately, at timestamp 13 we should be able to update key_ts3
+        # (inserted at timestamp 10) but not key_ts4 (inserted at timestamp 15).
+ #
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts3'] = 'value13'
+ self.apply_timestamps(13)
+ self.session.commit_transaction()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts4'] = 'value13'
+ self.apply_timestamps(13)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_ooo)
+ c.close()
+
+ # Make sure multiple update attempts still fail and eventually
+ # succeed with a later timestamp. This tests that aborted entries
+ # in the update chain are not considered for the timestamp check.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts4'] = 'value14'
+ self.apply_timestamps(14)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_ooo)
+ c.close()
+ c = self.session.open_cursor(uri)
+ self.assertEquals(c['key_ts4'], 'value15')
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts4'] = 'value16'
+ self.apply_timestamps(16)
+ self.session.commit_transaction()
+ c.close()
+ c = self.session.open_cursor(uri)
+ self.assertEquals(c['key_ts4'], 'value16')
+ c.close()
+
+ # Now try to modify a key previously used with timestamps without
+ # one. We should get the inconsistent usage message.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts4'] = 'value_nots'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_usage)
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts4'] = 'value_nots'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_usage)
+ c.close()
+ c = self.session.open_cursor(uri)
+ self.assertEquals(c['key_ts4'], 'value16')
+ c.close()
+
+ # Now confirm the other way. Create a key without a timestamp and then
+ # attempt to modify it with a timestamp. The only error checking that
+ # makes sense here is the inconsistent usage.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value_nots'
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value16'
+ self.apply_timestamps(16)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_usage)
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value_nots1'
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value17'
+ self.apply_timestamps(17)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(), msg_usage)
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.assertEquals(c['key_nots'], 'value_nots1')
+ c.close()
+
+ # Confirm it is okay to set the timestamp in the middle or end of the
+ # transaction. That should set the timestamp for the whole thing.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts5'] = 'value_notsyet'
+ c['key_ts5'] = 'value20'
+ self.apply_timestamps(20)
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.assertEquals(c['key_ts5'], 'value20')
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts6'] = 'value_notsyet'
+ c['key_ts6'] = 'value21_after'
+ self.apply_timestamps(21)
+ self.session.commit_transaction()
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.assertEquals(c['key_ts6'], 'value21_after')
+ c.close()
+
+ # Confirm it is okay to set the durable timestamp on the commit call.
+ # That should set the timestamp for the whole thing.
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_ts6'] = 'value_committs1'
+ c['key_ts6'] = 'value22'
+ self.session.prepare_transaction(
+ 'prepare_timestamp=' + timestamp_str(22))
+ self.session.timestamp_transaction(
+ 'commit_timestamp=' + timestamp_str(22))
+ self.session.commit_transaction('durable_timestamp=' +
+ timestamp_str(22))
+ c.close()
+
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c['key_nots'] = 'value23'
+ self.session.prepare_transaction(
+ 'prepare_timestamp=' + timestamp_str(23))
+ self.session.timestamp_transaction(
+ 'commit_timestamp=' + timestamp_str(23))
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(
+ 'durable_timestamp=' + timestamp_str(23)), msg_usage)
+ c.close()
+
+if __name__ == '__main__':
+ wttest.run()
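Similarly, the alter-based path exercised by test_assert06 above can be sketched on its own; as the test comments note, the oldest timestamp has to be advanced past existing updates before alter can obtain exclusive access to the file. Again this is a hypothetical sketch, reusing the assumed WT_HOME setup, and not part of the imported change.

    # Hypothetical sketch: enable key-consistency checking on an existing
    # table after the fact via WT_SESSION::alter.
    import wiredtiger

    conn = wiredtiger.wiredtiger_open('WT_HOME', 'create')
    session = conn.open_session()
    uri = 'file:key_consistent_demo.wt'
    session.create(uri, 'key_format=S,value_format=S')

    # ... populate the table, possibly mixing timestamped and
    # non-timestamped updates on different keys ...

    # Move the oldest timestamp forward so the alter (which must close and
    # reopen the file) does not fail with EBUSY, then turn the check on.
    conn.set_timestamp('oldest_timestamp=' + '%x' % 2)
    session.alter(uri, 'assert=(durable_timestamp=key_consistent)')
    conn.close()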
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare_cursor02.py b/src/third_party/wiredtiger/test/suite/test_prepare_cursor02.py
new file mode 100644
index 00000000000..0aeb3f62dbf
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_prepare_cursor02.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtdataset import SimpleDataSet, SimpleIndexDataSet
+from wtdataset import SimpleLSMDataSet, ComplexDataSet, ComplexLSMDataSet
+from wtscenario import make_scenarios
+
+def timestamp_str(t):
+ return '%x' %t
+
+# test_prepare_cursor02.py
+# WT_CURSOR navigation (next/prev) tests with prepared transactions
+class test_prepare_cursor02(wttest.WiredTigerTestCase):
+
+ keyfmt = [
+ ('row-store', dict(keyfmt='i')),
+ ('column-store', dict(keyfmt='r')),
+ ]
+ types = [
+ ('table-simple', dict(uri='table', ds=SimpleDataSet)),
+ ]
+
+ iso_types = [
+ ('isolation_read_committed', dict(isolation='read-committed')),
+ ('isolation_snapshot', dict(isolation='snapshot'))
+ ]
+ scenarios = make_scenarios(types, keyfmt, iso_types)
+
+ def skip(self):
+ return self.keyfmt == 'r' and \
+ (self.ds.is_lsm() or self.uri == 'lsm')
+
+ # Test cursor navigate (next/prev) with prepared transactions.
+ def test_cursor_navigate_prepare_transaction(self):
+ if self.skip():
+ return
+
+ # Build an object.
+ uri = self.uri + ':test_prepare_cursor02'
+ ds = self.ds(self, uri, 0, key_format=self.keyfmt)
+ ds.populate()
+
+ session = self.conn.open_session()
+ cursor = session.open_cursor(uri, None)
+ session.begin_transaction()
+ cursor.set_key(ds.key(1))
+ cursor.set_value(ds.value(1))
+ cursor.insert()
+ session.prepare_transaction('prepare_timestamp=' + timestamp_str(100))
+
+ prep_session = self.conn.open_session()
+ prep_cursor = prep_session.open_cursor(uri, None)
+
+        # Check cursor navigation with an insert in a prepared transaction.
+        # The data set is empty.
+        # Insert key 1 in the prepared state.
+ prep_session.begin_transaction()
+ # Check next operation.
+ prep_cursor.set_key(ds.key(1))
+ self.assertRaisesException(wiredtiger.WiredTigerError, lambda: prep_cursor.search())
+ self.assertRaisesException(wiredtiger.WiredTigerError, lambda: prep_cursor.next())
+ self.assertRaisesException(wiredtiger.WiredTigerError, lambda: prep_cursor.next())
+
+ # Check prev operation.
+ prep_cursor.set_key(ds.key(1))
+ self.assertRaisesException(wiredtiger.WiredTigerError, lambda: prep_cursor.search())
+ self.assertRaisesException(wiredtiger.WiredTigerError, lambda: prep_cursor.prev())
+ self.assertRaisesException(wiredtiger.WiredTigerError, lambda: prep_cursor.prev())
+ prep_cursor.close()
+ prep_session.commit_transaction()
+
+ session.rollback_transaction()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py b/src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py
index 43331f7d169..8a0dfb00e88 100755
--- a/src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py
+++ b/src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py
@@ -36,11 +36,15 @@ from bokeh.models.annotations import Label
from bokeh.plotting import figure, output_file, reset_output, save, show
from bokeh.resources import CDN
import matplotlib
+from multiprocessing import Process, Queue, Array
+import multiprocessing
import numpy as np
import os
import pandas as pd
import sys
+import statvfs
import traceback
+import time
# A directory where we store cross-file plots for each bucket of the outlier
# histogram.
@@ -78,8 +82,8 @@ lastTimeStamp = 0;
# us when the function is to be considered an outlier. These values
# would be read from a config file, if supplied by the user.
#
-outlierThresholdDict = {};
-outlierPrettyNames = {};
+userDefinedLatencyThresholds = {};
+userDefinedThresholdNames = {};
# A dictionary that holds a reference to the raw dataframe for each file.
#
@@ -98,16 +102,19 @@ pixelsForTitle = 30;
pixelsPerHeightUnit = 30;
pixelsPerWidthUnit = 5;
+# How many work units to perform in parallel.
+targetParallelism = 0;
+
# The name of the time units that were used when recording timestamps.
# We assume that it's nanoseconds by default. Alternative units can be
# set in the configuration file.
#
timeUnitString = "nanoseconds";
-# The coefficient by which we multiply the standard deviation when
-# setting the outlier threshold, in case it is not specified by the user.
+# The percentile threshold. A function duration above that percentile
+# is deemed an outlier.
#
-STDEV_MULT = 2;
+PERCENTILE = 0.999;
def initColorList():
@@ -119,6 +126,12 @@ def initColorList():
# Some browsers break if you try to give them 'sage'
if (color == "sage"):
colorList.remove(color);
+ # We reserve red to highlight occurrences of functions
+ # that exceeded the user-defined latency threshold. Do
+ # not use red for regular function colors.
+ #
+ elif (color == "red"):
+ colorList.remove(color);
#
# Each unique function name gets a unique color.
@@ -180,7 +193,9 @@ def getIntervalData(intervalBeginningsStack, intervalEnd, logfile):
return intervalBegin[0], intervalEnd[0], intervalEnd[2], errorOccurred;
-def plotOutlierHistogram(dataframe, maxOutliers, func, durationThreshold,
+def plotOutlierHistogram(dataframe, maxOutliers, func,
+ statisticalOutlierThreshold,
+ userLatencyThreshold,
averageDuration, maxDuration):
global pixelsForTitle;
@@ -191,7 +206,7 @@ def plotOutlierHistogram(dataframe, maxOutliers, func, durationThreshold,
cds = ColumnDataSource(dataframe);
figureTitle = "Occurrences of " + func + " that took longer than " \
- + durationThreshold + ".";
+ + statisticalOutlierThreshold + ".";
hover = HoverTool(tooltips = [
("interval start", "@lowerbound{0,0}"),
@@ -209,7 +224,7 @@ def plotOutlierHistogram(dataframe, maxOutliers, func, durationThreshold,
y_ticker_max = p.plot_height / pixelsPerHeightUnit;
y_ticker_step = max(1, (maxOutliers + 1)/y_ticker_max);
- y_upper_bound = (maxOutliers / y_ticker_step + 1) * y_ticker_step;
+ y_upper_bound = (maxOutliers / y_ticker_step + 2) * y_ticker_step;
p.yaxis.ticker = FixedTicker(ticks =
range(0, y_upper_bound, y_ticker_step));
@@ -221,18 +236,28 @@ def plotOutlierHistogram(dataframe, maxOutliers, func, durationThreshold,
p.quad(left = 'lowerbound', right = 'upperbound', bottom = 'bottom',
top = 'height', color = funcToColor[func], source = cds,
- nonselection_fill_color=funcToColor[func],
+ nonselection_fill_color= funcToColor[func],
nonselection_fill_alpha = 1.0,
line_color = "lightgrey",
selection_fill_color = funcToColor[func],
selection_line_color="grey"
);
+ p.x(x='markerX', y='markerY', size='markersize', color = 'navy',
+ line_width = 1, source = cds);
+
# Add an annotation to the chart
#
y_max = dataframe['height'].max();
- text = "Average duration: " + '{0:,.0f}'.format(averageDuration) + \
- ". Maximum duration: " + '{0:,.0f}'.format(maxDuration) + ".";
+ text = "Average duration: " + '{0:,.0f}'.format(averageDuration) + " " + \
+ timeUnitString + \
+ ". Maximum duration: " + '{0:,.0f}'.format(maxDuration) + " " + \
+ timeUnitString + ". ";
+ if (userLatencyThreshold is not None):
+ text = text + \
+ "An \'x\' shows intervals with operations exceeding " + \
+ "a user-defined threshold of " + \
+ userLatencyThreshold + ".";
mytext = Label(x=0, y=y_upper_bound-y_ticker_step, text=text,
text_color = "grey", text_font = "helvetica",
text_font_size = "10pt",
@@ -280,13 +305,13 @@ def assignStackDepths(dataframe):
for i in range(len(df.index)):
- myStartTime = df.at[i, 'start'];
+ myEndTime = df.at[i, 'end'];
# Pop all items off stack whose end time is earlier than my
- # start time. They are not part of my stack, so I don't want to
+ # end time. They are not the callers on my stack, so I don't want to
# count them.
#
- while (len(stack) > 0 and stack[-1] < myStartTime):
+ while (len(stack) > 0 and stack[-1] < myEndTime):
stack.pop();
df.at[i, 'stackdepth'] = len(stack);
@@ -469,11 +494,12 @@ def createLegendFigure(legendDict):
def generateBucketChartForFile(figureName, dataframe, y_max, x_min, x_max):
- global colorAlreadyUsedInLegend;
global funcToColor;
global plotWidth;
global timeUnitString;
+ colorAlreadyUsedInLegend = {};
+
MAX_ITEMS_PER_LEGEND = 10;
numLegends = 0;
legendItems = {};
@@ -562,10 +588,10 @@ def generateEmptyDataset():
# across the timelines for all files. We call it a bucket, because it
# corresponds to a bucket in the outlier histogram.
#
-def generateCrossFilePlotsForBucket(i, lowerBound, upperBound, navigatorDF):
+def generateCrossFilePlotsForBucket(i, lowerBound, upperBound, navigatorDF,
+ retFilename):
global bucketDir;
- global colorAlreadyUsedInLegend;
global timeUnitString;
aggregateLegendDict = {};
@@ -584,12 +610,6 @@ def generateCrossFilePlotsForBucket(i, lowerBound, upperBound, navigatorDF):
navigatorFigure = generateNavigatorFigure(navigatorDF, i, intervalTitle);
figuresForAllFiles.append(navigatorFigure);
- # The following dictionary keeps track of legends. We need
- # a legend for each new HTML file. So we reset the dictionary
- # before generating a new file.
- #
- colorAlreadyUsedInLegend = {};
-
# Select from the dataframe for this file the records whose 'start'
# and 'end' timestamps fall within the lower and upper bound.
#
@@ -653,7 +673,8 @@ def generateCrossFilePlotsForBucket(i, lowerBound, upperBound, navigatorDF):
save(column(figuresForAllFiles), filename = fileName,
title=intervalTitle, resources=CDN);
- return fileName;
+ retFilename.value = fileName;
+
# Generate a plot that shows a view of the entire timeline in a form of
# intervals. By clicking on an interval we can navigate to that interval.
@@ -752,6 +773,31 @@ def createIntervalNavigatorDF(numBuckets, timeUnitsPerBucket):
dataframe['intervalnumbernext'] = dataframe['intervalnumber'] + 1;
return dataframe;
+
+def waitOnOneProcess(runningProcesses):
+
+ success = False;
+ for i, p in runningProcesses.items():
+ if (not p.is_alive()):
+ del runningProcesses[i];
+ success = True;
+
+ # If we have not found a terminated process, sleep for a while
+ if (not success):
+ time.sleep(1);
+
+# Update the UI message showing what percentage of the work assigned to
+# the parallel processes has completed.
+#
+def updatePercentComplete(runnableProcesses, runningProcesses,
+ totalWorkItems, workName):
+
+ percentComplete = float(totalWorkItems - len(runnableProcesses) \
+ - len(runningProcesses)) / float(totalWorkItems) * 100;
+ print(color.BLUE + color.BOLD + " " + workName),
+ sys.stdout.write(" %d%% complete \r" % (percentComplete) );
+ sys.stdout.flush();
+
# Generate plots of time series slices across all files for each bucket
# in the outlier histogram. Save each cross-file slice to an HTML file.
#
@@ -761,32 +807,99 @@ def generateTSSlicesForBuckets():
global lastTimeStamp;
global plotWidth;
global pixelsPerWidthUnit;
+ global targetParallelism;
bucketFilenames = [];
+ runnableProcesses = {};
+ returnValues = {};
+ spawnedProcesses = {};
numBuckets = plotWidth / pixelsPerWidthUnit;
timeUnitsPerBucket = (lastTimeStamp - firstTimeStamp) / numBuckets;
navigatorDF = createIntervalNavigatorDF(numBuckets, timeUnitsPerBucket);
+ print(color.BLUE + color.BOLD +
+ "Will process " + str(targetParallelism) + " work units in parallel."
+ + color.END);
+
for i in range(numBuckets):
+ retFilename = Array('c', os.statvfs('/')[statvfs.F_NAMEMAX]);
lowerBound = i * timeUnitsPerBucket;
upperBound = (i+1) * timeUnitsPerBucket;
- fileName = generateCrossFilePlotsForBucket(i, lowerBound, upperBound,
- navigatorDF);
-
- percentComplete = float(i) / float(numBuckets) * 100;
- print(color.BLUE + color.BOLD + " Generating timeline charts... "),
- sys.stdout.write("%d%% complete \r" % (percentComplete) );
- sys.stdout.flush();
- bucketFilenames.append(fileName);
-
+ p = Process(target=generateCrossFilePlotsForBucket,
+ args=(i, lowerBound, upperBound,
+ navigatorDF, retFilename));
+ runnableProcesses[i] = p;
+ returnValues[i] = retFilename;
+
+ while (len(runnableProcesses) > 0):
+ while (len(spawnedProcesses) < targetParallelism
+ and len(runnableProcesses) > 0):
+
+ i, p = runnableProcesses.popitem();
+ p.start();
+ spawnedProcesses[i] = p;
+
+ # Find at least one terminated process
+ waitOnOneProcess(spawnedProcesses);
+ updatePercentComplete(runnableProcesses, spawnedProcesses,
+ numBuckets, "Generating timeline charts");
+
+ # Wait for all processes to terminate
+ while (len(spawnedProcesses) > 0):
+ waitOnOneProcess(spawnedProcesses);
+ updatePercentComplete(runnableProcesses, spawnedProcesses,
+ numBuckets, "Generating timeline charts");
+
+ for i, fname in returnValues.items():
+ bucketFilenames.append(str(fname.value));
print(color.END);
return bucketFilenames;
-def processFile(fname):
+#
+# After we have cleaned up the data by getting rid of incomplete function
+# call records (e.g., a function begin record is present but a function end
+# is missing, or vice versa), we optionally dump this clean data into a file, so
+# it can be re-processed by other visualization tools. The output format is
+#
+# <0/1> <funcname> <timestamp>
+#
+# We use '0' if it's a function entry, '1' if it's a function exit.
+#
+def dumpCleanData(fname, df):
+
+ enterExit = [];
+ timestamps = [];
+ functionNames = [];
+
+ outfile = None;
+ fnameParts = fname.split(".txt");
+ newfname = fnameParts[0] + "-clean.txt";
+
+ for index, row in df.iterrows():
+ # Append the function enter record:
+ enterExit.append(0);
+ timestamps.append(row['start']);
+ functionNames.append(row['function']);
+
+ # Append the function exit record:
+ enterExit.append(1);
+ timestamps.append(row['end']);
+ functionNames.append(row['function']);
+
+ newDF = pd.DataFrame({'enterExit' : enterExit, 'timestamp' : timestamps,
+ 'function' : functionNames});
+ newDF = newDF.set_index('timestamp', drop=False);
+ newDF.sort_index(inplace = True);
+
+ print("Dumping clean data to " + newfname);
+ newDF.to_csv(newfname, sep=' ', index=False, header=False,
+ columns = ['enterExit', 'function', 'timestamp']);
+
+def processFile(fname, dumpCleanDataBool):
global perFileDataFrame;
global perFuncDF;
@@ -802,6 +915,10 @@ def processFile(fname):
"Processing file " + str(fname) + color.END);
iDF = createCallstackSeries(rawData, "." + fname + ".log");
+ if (dumpCleanDataBool):
+ dumpCleanData(fname, iDF);
+
+
perFileDataFrame[fname] = iDF;
for func in funcToColor.keys():
@@ -820,11 +937,6 @@ def processFile(fname):
# show how many times this function took an unusually long time to
# execute.
#
-# The parameter durationThreshold tells us when a function should be
-# considered as unusually long. If this parameter is "-1" we count
-# all functions whose duration exceeded the average by more than
-# two standard deviations.
-#
def createOutlierHistogramForFunction(func, funcDF, bucketFilenames):
global firstTimeStamp;
@@ -832,10 +944,13 @@ def createOutlierHistogramForFunction(func, funcDF, bucketFilenames):
global plotWidth;
global pixelsPerWidthUnit;
global timeUnitString;
- global STDEV_MULT;
+ global PERCENTILE;
+
+ statisticalOutlierThreshold = 0;
+ statisticalOutlierThresholdDescr = "";
+ userLatencyThreshold = sys.maxint;
+ userLatencyThresholdDescr = None;
- durationThreshold = 0;
- durationThresholdDescr = "";
#
# funcDF is a list of functions along with their start and end
@@ -853,46 +968,78 @@ def createOutlierHistogramForFunction(func, funcDF, bucketFilenames):
averageDuration = funcDF['durations'].mean();
maxDuration = funcDF['durations'].max();
- if (outlierThresholdDict.has_key(func)):
- durationThreshold = outlierThresholdDict[func];
- durationThresholdDescr = outlierPrettyNames[func];
- elif (outlierThresholdDict.has_key("*")):
- durationThreshold = outlierThresholdDict["*"];
- durationThresholdDescr = outlierPrettyNames["*"];
- else:
- # Signal that we will use standard deviation
- durationThreshold = -STDEV_MULT;
+ # There are two things that we want to capture on the
+ # outlier charts: statistical outliers and functions exceeding the
+ # user-defined latency threshold. An outlier is a function
+ # whose duration is in the 99.9th percentile. For each
+ # time period we will show a bar whose height corresponds
+    # to the number of outliers observed during this execution
+ # period.
+ #
+ # Not all outliers are indicative of performance problems.
+ # To highlight real performance problems (as defined by the user)
+ # we will highlight those bars that contain operations whose
+ # duration exceeded the user-defined threshold.
+ #
+ if (userDefinedLatencyThresholds.has_key(func)):
+ userLatencyThreshold = userDefinedLatencyThresholds[func];
+ userLatencyThresholdDescr = userDefinedThresholdNames[func];
+ elif (userDefinedLatencyThresholds.has_key("*")):
+ userLatencyThreshold = userDefinedLatencyThresholds["*"];
+ userLatencyThresholdDescr = userDefinedThresholdNames["*"];
+
+ statisticalOutlierThreshold = funcDF['durations'].quantile(PERCENTILE);
+ statisticalOutlierThresholdDescr = \
+ '{0:,.0f}'.format(statisticalOutlierThreshold) \
+ + " " + timeUnitString + \
+ " (" + str(PERCENTILE * 100) + \
+ "th percentile)";
- if (durationThreshold < 0): # this is a stdev multiplier
- mult = -durationThreshold;
- stdDev = funcDF['durations'].std();
- durationThreshold = averageDuration + mult * stdDev;
- durationThresholdDescr = '{0:,.0f}'.format(durationThreshold) \
- + " " + timeUnitString + " (" + str(mult) + \
- " standard deviations)";
numBuckets = plotWidth / pixelsPerWidthUnit;
timeUnitsPerBucket = (lastTimeStamp - firstTimeStamp) / numBuckets;
- lowerBounds = [];
- upperBounds = [];
+
bucketHeights = [];
+ markers = [];
+ lowerBounds = [];
maxOutliers = 0;
+ upperBounds = [];
for i in range(numBuckets):
+ markerSize = 0;
lowerBound = i * timeUnitsPerBucket;
upperBound = (i+1) * timeUnitsPerBucket;
+ # Find out how many statistical outliers we have in the
+ # current period.
bucketDF = funcDF.loc[(funcDF['start'] >= lowerBound)
- & (funcDF['start'] < upperBound)
- & (funcDF['durations'] >= durationThreshold)];
+ & (funcDF['start'] < upperBound)
+ & (funcDF['durations'] >=
+ statisticalOutlierThreshold)];
+ # The number of statistical outliers is the height of the bar
numOutliers = bucketDF.size;
if (numOutliers > maxOutliers):
maxOutliers = numOutliers;
+ # Find out whether we have any functions whose duration exceeded
+ # the user-defined threshold.
+ if (userLatencyThresholdDescr is not None):
+ bucketDF = funcDF.loc[(funcDF['start'] >= lowerBound)
+ & (funcDF['start'] < upperBound)
+ & (funcDF['durations'] >=
+ userLatencyThreshold)];
+
+ # If there is at least one element in this dataframe, then the
+            # operations that exceeded the user-defined latency threshold are
+ # present in this period. Highlight this bucket with a bright color.
+ if (bucketDF.size > 0):
+ markerSize = 6;
+
lowerBounds.append(lowerBound);
upperBounds.append(upperBound);
bucketHeights.append(numOutliers);
+ markers.append(markerSize);
if (maxOutliers == 0):
return None;
@@ -903,11 +1050,18 @@ def createOutlierHistogramForFunction(func, funcDF, bucketFilenames):
dict['height'] = bucketHeights;
dict['bottom'] = [0] * len(lowerBounds);
dict['bucketfiles'] = bucketFilenames;
+ dict['markersize'] = markers;
dataframe = pd.DataFrame(data=dict);
+ dataframe['markerX'] = dataframe['lowerbound'] + \
+ (dataframe['upperbound'] -
+ dataframe['lowerbound']) / 2 ;
+ dataframe['markerY'] = dataframe['height'] + 0.2;
return plotOutlierHistogram(dataframe, maxOutliers, func,
- durationThresholdDescr, averageDuration,
+ statisticalOutlierThresholdDescr,
+ userLatencyThresholdDescr,
+ averageDuration,
maxDuration);
#
@@ -967,8 +1121,8 @@ def getTimeUnitString(unitsPerSecond):
#
def parseConfigFile(fname):
- global outlierThresholdDict;
- global outlierPrettyNames;
+ global userDefinedLatencyThresholds;
+ global userDefinedThresholdNames;
global timeUnitString;
configFile = None;
@@ -1046,14 +1200,13 @@ def parseConfigFile(fname):
print(line);
continue;
- outlierThresholdDict[func] = threshold;
- outlierPrettyNames[func] = str(number) + " " + units;
+ userDefinedLatencyThresholds[func] = threshold;
+ userDefinedThresholdNames[func] = str(number) + " " + units;
# We were given an empty config file
if (firstNonCommentLine):
return False;
- print outlierThresholdDict;
return True;
@@ -1063,6 +1216,7 @@ def main():
global arrowRightImg;
global bucketDir;
global perFuncDF;
+ global targetParallelism;
configSupplied = False;
figuresForAllFunctions = [];
@@ -1074,12 +1228,27 @@ def main():
parser.add_argument('files', type=str, nargs='*',
help='log files to process');
parser.add_argument('-c', '--config', dest='configFile', default='');
+ parser.add_argument('-d', '--dumpCleanData', dest='dumpCleanData',
+ default=False, action='store_true',
+ help='Dump clean log data. Clean data will \
+ not include incomplete function call records, \
+ e.g., if there is a function begin record, but\
+ no function end record, or vice versa.');
+ parser.add_argument('-j', dest='jobParallelism', type=int,
+ default='0');
+
args = parser.parse_args();
if (len(args.files) == 0):
parser.print_help();
sys.exit(1);
+ # Determine the target job parallelism
+ if (args.jobParallelism > 0):
+ targetParallelism = args.jobParallelism;
+ else:
+ targetParallelism = multiprocessing.cpu_count() * 2;
+
# Get names of standard CSS colors that we will use for the legend
initColorList();
@@ -1089,12 +1258,11 @@ def main():
if (not configSupplied):
pluralSuffix = "";
- if (STDEV_MULT > 1):
- pluralSuffix = "s";
+
print(color.BLUE + color.BOLD +
"Will deem as outliers all function instances whose runtime " +
- "was " + str(STDEV_MULT) + " standard deviation" + pluralSuffix +
- " greater than the average runtime for that function."
+ "was higher than the " + str(PERCENTILE * 100) +
+ "th percentile for that function."
+ color.END);
@@ -1106,7 +1274,7 @@ def main():
# Parallelize this later, so we are working on files in parallel.
for fname in args.files:
- processFile(fname);
+ processFile(fname, args.dumpCleanData);
# Normalize all intervals by subtracting the first timestamp.
normalizeIntervalData();
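As a usage note for the new -d/--dumpCleanData option added above: the dump format is the space-separated triple described in dumpCleanData(), one record per function entry ('0') or exit ('1'). A small, hypothetical reader that pairs those records back into per-call durations might look like the following; it is not part of the tool and assumes a well-formed, single-threaded trace with integer timestamps.

    # Hypothetical sketch: rebuild (function, start, end) intervals from a
    # '<name>-clean.txt' file produced by dumpCleanData().
    def read_clean_dump(path):
        stacks = {}      # function name -> stack of entry timestamps
        intervals = []   # (function, start, end) tuples
        with open(path) as f:
            for line in f:
                enter_exit, func, ts = line.split()
                ts = int(ts)
                if enter_exit == '0':
                    stacks.setdefault(func, []).append(ts)
                else:
                    start = stacks[func].pop()
                    intervals.append((func, start, ts))
        return intervals

    # Example: print each call's duration, longest first.
    # for func, start, end in sorted(read_clean_dump('optrack-clean.txt'),
    #                                key=lambda r: r[2] - r[1], reverse=True):
    #     print(func, end - start)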
diff --git a/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py b/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py
index ea937f36b4a..971f3729981 100755
--- a/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py
+++ b/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py
@@ -265,7 +265,6 @@ def waitOnOneProcess(runningProcesses):
def main():
runnableProcesses = {};
- returnValues = {};
spawnedProcesses = {};
successfullyProcessedFiles = [];
targetParallelism = multiprocessing.cpu_count();