author     Luke Chen <luke.chen@mongodb.com>                 2021-05-19 15:49:07 +1000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-05-19 06:02:30 +0000
commit     1d20af63ae5e95e0ed1219809c5af571de8e3ae3 (patch)
tree       8b046a8403a959b712715fdfd296a38f0c4b1fb4 /src/third_party/wiredtiger
parent     cc1640581318df61a5fedc5c7ddd1a91c13e0712 (diff)
download   mongo-1d20af63ae5e95e0ed1219809c5af571de8e3ae3.tar.gz
Import wiredtiger: bae0c1c914bc0fa92f3775c08650b65663094034 from branch mongodb-4.4

ref: aadac22242..bae0c1c914 for: 4.4.7

WT-6403 Restore format non-timestamp transactional testing
WT-6576 Fix the aborted on-disk prepared key
WT-7106 Increase how often delta encoding is used for history store records
WT-7204 Update cursor-backward walks key instantiation support
WT-7234 Prefix-compressed keys and memory amplification
WT-7296 Merge default configuration with supplied test configuration in test framework
WT-7325 Created a script to generate a new test in the WT test framework
WT-7381 Cache btree's ckptlist between checkpoints
WT-7382 Refactor of database validation in the test framework
WT-7407 test/format failure classifier
WT-7411 Stats and counter to track prepared updates
WT-7416 Imported table requires full copy between incremental backups
WT-7446 Fix incorrect duration_seconds value in test framework
WT-7486 Coverity: explicit NULL dereferenced
WT-7487 Coverity: explicit NULL dereferenced
WT-7497 Add flush component to object metadata
WT-7499 Change WT_STORAGE_SOURCE.flush API and add flush_finish
WT-7503 Change default compressor for WT HS to Zstandard
WT-7506 Allow single and double quotes inside auto-formatted comments
WT-7511 Add assert to ensure the history store page is pinned before search
WT-7519 Fix flags field overflow in WT_DATA_HANDLE
WT-7525 Add key order check right after history store insert
WT-7537 Change local tier object suffix to .wtobj
WT-7546 Coverity: Minor issues in CppSuite test harness
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r--  src/third_party/wiredtiger/.clang-format | 1
-rw-r--r--  src/third_party/wiredtiger/bench/wtperf/config.c | 13
-rw-r--r--  src/third_party/wiredtiger/dist/api_data.py | 28
-rw-r--r--  src/third_party/wiredtiger/dist/s_comment.py | 4
-rw-r--r--  src/third_party/wiredtiger/dist/s_define.list | 2
-rw-r--r--  src/third_party/wiredtiger/dist/s_string.ok | 2
-rwxr-xr-x  src/third_party/wiredtiger/dist/s_void | 2
-rw-r--r--  src/third_party/wiredtiger/dist/stat_data.py | 5
-rw-r--r--  src/third_party/wiredtiger/dist/test_data.py | 28
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_all.c | 6
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_backup.c | 11
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_backup_block.c | 11
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_data_source.c | 9
-rw-r--r--  src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c | 495
-rw-r--r--  src/third_party/wiredtiger/import.data | 2
-rw-r--r--  src/third_party/wiredtiger/lang/python/wiredtiger.i | 13
-rw-r--r--  src/third_party/wiredtiger/src/block/block_addr.c | 17
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ext.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/block/block_tiered.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curnext.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curprev.c | 13
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_debug.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_discard.c | 16
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_handle.c | 13
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_import.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_page.c | 109
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ret.c | 13
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_slvg.c | 13
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_split.c | 12
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_stat.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_key.c | 387
-rw-r--r--  src/third_party/wiredtiger/src/config/config_collapse.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/config/config_def.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/config/test_config.c | 56
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_api.c | 21
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_backup_incr.c | 27
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_file.c | 3
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_hs.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_join.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/docs/transactions.dox | 11
-rw-r--r--  src/third_party/wiredtiger/src/history/hs_cursor.c | 24
-rw-r--r--  src/third_party/wiredtiger/src/history/hs_rec.c | 34
-rw-r--r--  src/third_party/wiredtiger/src/include/api.h | 59
-rw-r--r--  src/third_party/wiredtiger/src/include/btmem.h | 41
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.h | 38
-rw-r--r--  src/third_party/wiredtiger/src/include/btree_inline.h | 558
-rw-r--r--  src/third_party/wiredtiger/src/include/buf_inline.h | 10
-rw-r--r--  src/third_party/wiredtiger/src/include/cursor_inline.h | 80
-rw-r--r--  src/third_party/wiredtiger/src/include/dhandle.h | 14
-rw-r--r--  src/third_party/wiredtiger/src/include/extern.h | 30
-rw-r--r--  src/third_party/wiredtiger/src/include/meta.h | 2
-rw-r--r--  src/third_party/wiredtiger/src/include/misc.h | 6
-rw-r--r--  src/third_party/wiredtiger/src/include/reconcile.h | 4
-rw-r--r--  src/third_party/wiredtiger/src/include/stat.h | 5
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.h | 3
-rw-r--r--  src/third_party/wiredtiger/src/include/wiredtiger.in | 415
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_cursor.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_ckpt.c | 329
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_ext.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_track.c | 4
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_fs.c | 28
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_row.c | 122
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_write.c | 16
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_create.c | 21
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_plan.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_rename.c | 29
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_util.c | 6
-rw-r--r--  src/third_party/wiredtiger/src/session/session_api.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/session/session_compact.c | 3
-rw-r--r--  src/third_party/wiredtiger/src/support/err.c | 4
-rw-r--r--  src/third_party/wiredtiger/src/support/scratch.c | 38
-rw-r--r--  src/third_party/wiredtiger/src/support/stat.c | 13
-rw-r--r--  src/third_party/wiredtiger/src/tiered/tiered_config.c | 4
-rw-r--r--  src/third_party/wiredtiger/src/tiered/tiered_cursor.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/tiered/tiered_handle.c | 45
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn.c | 93
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_ckpt.c | 143
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 4
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_load.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_load_json.c | 8
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt | 52
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt | 26
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt | 13
-rwxr-xr-x  src/third_party/wiredtiger/test/cppsuite/create_test.sh | 81
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h | 2
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h | 114
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h | 4
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/test.h | 12
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h | 1
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h | 15
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h | 111
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h | 18
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h | 66
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h | 397
-rw-r--r--  src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h | 16
-rwxr-xr-x  src/third_party/wiredtiger/test/cppsuite/tests/run.cxx | 2
-rw-r--r--  src/third_party/wiredtiger/test/csuite/incr_backup/main.c | 7
-rw-r--r--  src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c | 5
-rwxr-xr-x  src/third_party/wiredtiger/test/evergreen.yml | 4
-rw-r--r--  src/third_party/wiredtiger/test/format/bulk.c | 17
-rw-r--r--  src/third_party/wiredtiger/test/format/config.c | 171
-rw-r--r--  src/third_party/wiredtiger/test/format/config.h | 34
-rw-r--r--  src/third_party/wiredtiger/test/format/config_compat.c | 4
-rw-r--r--  src/third_party/wiredtiger/test/format/config_compat.sed | 1
-rw-r--r--  src/third_party/wiredtiger/test/format/format.h | 16
-rwxr-xr-x  src/third_party/wiredtiger/test/format/format.sh | 33
-rw-r--r--  src/third_party/wiredtiger/test/format/kv.c | 67
-rw-r--r--  src/third_party/wiredtiger/test/format/ops.c | 230
-rw-r--r--  src/third_party/wiredtiger/test/format/snap.c | 78
-rw-r--r--  src/third_party/wiredtiger/test/format/util.c | 2
-rw-r--r--  src/third_party/wiredtiger/test/format/wts.c | 26
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_backup22.py | 93
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_prepare15.py | 204
-rwxr-xr-x  src/third_party/wiredtiger/test/suite/test_tiered04.py | 2
-rwxr-xr-x  src/third_party/wiredtiger/test/suite/test_tiered06.py | 255
116 files changed, 3316 insertions, 2549 deletions
diff --git a/src/third_party/wiredtiger/.clang-format b/src/third_party/wiredtiger/.clang-format
index db3bac132a4..7f291933076 100644
--- a/src/third_party/wiredtiger/.clang-format
+++ b/src/third_party/wiredtiger/.clang-format
@@ -71,6 +71,7 @@ ForEachMacros:
- WT_CELL_FOREACH_KV
- WT_CELL_FOREACH_VRFY
- WT_CKPT_FOREACH
+ - WT_CKPT_FOREACH_NAME_OR_ORDER
- WT_COL_FOREACH
- WT_EXT_FOREACH
- WT_EXT_FOREACH_OFF
diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c
index a6c625e5ef0..70e01507719 100644
--- a/src/third_party/wiredtiger/bench/wtperf/config.c
+++ b/src/third_party/wiredtiger/bench/wtperf/config.c
@@ -462,8 +462,7 @@ config_opt(WTPERF *wtperf, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v)
break;
case STRING_TYPE:
/*
- * Thread configuration is the one case where the type isn't a
- * "string", it's a "struct".
+ * Thread configuration is the one case where the type isn't a "string", it's a "struct".
*/
if (v->type == WT_CONFIG_ITEM_STRUCT && STRING_MATCH("threads", k->str, k->len))
return (config_threads(wtperf, v->str, v->len));
@@ -821,12 +820,10 @@ config_consolidate(CONFIG_OPTS *opts)
char *string_key;
/*
- * This loop iterates over the config queue and for each entry checks if
- * a later queue entry has the same key. If there's a match, and key is
- * "conn_config" or "table_config", the later queue entry is replaced
- * with a concatenated entry of the two queue entries, the current queue
- * entry is removed. For any other key, if there is a match, the current
- * queue entry is removed.
+ * This loop iterates over the config queue and for each entry checks if a later queue entry has
+ * the same key. If there's a match, and key is "conn_config" or "table_config", the later queue
+ * entry is replaced with a concatenated entry of the two queue entries, the current queue entry
+ * is removed. For any other key, if there is a match, the current queue entry is removed.
*/
conf_line = TAILQ_FIRST(&opts->config_head);
while (conf_line != NULL) {
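
[editor's note] The consolidation rule described in the reflowed comment above is easy to lose in the queue plumbing. The following is a minimal, self-contained sketch of the same idea over a plain array; the struct and function names are hypothetical and this is not the wtperf implementation.

#include <stdio.h>
#include <string.h>

struct entry {
    const char *key;
    char value[256];
    int removed;
};

static void
consolidate(struct entry *q, size_t n)
{
    size_t i, j;
    char merged[256];

    for (i = 0; i < n; i++) {
        if (q[i].removed)
            continue;
        for (j = i + 1; j < n; j++) {
            if (q[j].removed || strcmp(q[i].key, q[j].key) != 0)
                continue;
            /*
             * A later entry has the same key: for the two config-string keys, merge both values
             * into the later entry; for any other key simply drop the earlier entry.
             */
            if (strcmp(q[i].key, "conn_config") == 0 || strcmp(q[i].key, "table_config") == 0) {
                (void)snprintf(merged, sizeof(merged), "%s,%s", q[i].value, q[j].value);
                (void)snprintf(q[j].value, sizeof(q[j].value), "%s", merged);
            }
            q[i].removed = 1;
            break;
        }
    }
}

int
main(void)
{
    struct entry q[] = {
        {"conn_config", "cache_size=1G", 0},
        {"run_time", "120", 0},
        {"conn_config", "statistics=(fast)", 0},
        {"run_time", "300", 0},
    };
    size_t i, n = sizeof(q) / sizeof(q[0]);

    consolidate(q, n);
    for (i = 0; i < n; i++)
        if (!q[i].removed)
            printf("%s=%s\n", q[i].key, q[i].value);
    return (0);
}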
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 953474d404f..6d65d9d217a 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -49,12 +49,10 @@ common_runtime_config = [
enable enhanced checking. ''',
type='category', subconfig= [
Config('commit_timestamp', 'none', r'''
- This option is no longer supported. Retained for backward
- compatibility. Use \c write_timestamp option instead.''',
+ This option is no longer supported, retained for backward compatibility.''',
choices=['always', 'key_consistent', 'never', 'none']),
Config('durable_timestamp', 'none', r'''
- This option is no longer supported. Retained for backward
- compatibility. Use \c write_timestamp option instead.''',
+ This option is no longer supported, retained for backward compatibility.''',
choices=['always', 'key_consistent', 'never', 'none']),
Config('write_timestamp', 'off', r'''
verify that commit timestamps are used per the configured
@@ -330,8 +328,7 @@ file_config = format_meta + file_runtime_config + tiered_config + [
the file format''',
choices=['btree']),
Config('huffman_key', 'none', r'''
- This option is no longer supported. Retained for backward
- compatibility. See @ref huffman for more information'''),
+ This option is no longer supported, retained for backward compatibility.'''),
Config('huffman_value', 'none', r'''
configure Huffman encoding for values. Permitted values are
\c "none", \c "english", \c "utf8<file>" or \c "utf16<file>".
@@ -355,8 +352,8 @@ file_config = format_meta + file_runtime_config + tiered_config + [
block compression is done''',
min='512B', max='512MB'),
Config('internal_item_max', '0', r'''
- historic term for internal_key_max''',
- min=0, undoc=True),
+ This option is no longer supported, retained for backward compatibility.''',
+ min=0),
Config('internal_key_max', '0', r'''
the largest key stored in an internal node, in bytes. If set, keys
larger than the specified size are stored as overflow items (which
@@ -365,10 +362,8 @@ file_config = format_meta + file_runtime_config + tiered_config + [
page''',
min='0'),
Config('key_gap', '10', r'''
- the maximum gap between instantiated keys in a Btree leaf page,
- constraining the number of keys processed to instantiate a
- random Btree leaf page key''',
- min='0', undoc=True),
+ This option is no longer supported, retained for backward compatibility.''',
+ min='0'),
Config('leaf_key_max', '0', r'''
the largest key stored in a leaf node, in bytes. If set, keys
larger than the specified size are stored as overflow items (which
@@ -392,8 +387,8 @@ file_config = format_meta + file_runtime_config + tiered_config + [
a newly split leaf page''',
min='0'),
Config('leaf_item_max', '0', r'''
- historic term for leaf_key_max and leaf_value_max''',
- min=0, undoc=True),
+ This option is no longer supported, retained for backward compatibility.''',
+ min=0),
Config('memory_page_image_max', '0', r'''
the maximum in-memory page image represented by a single storage block.
Depending on compression efficiency, compression can create storage
@@ -467,7 +462,10 @@ tiered_meta = common_meta + tiered_config + [
tier_meta = file_meta + tiered_tree_config
# Objects need to have the readonly setting set and bucket_prefix.
# The file_meta already contains those pieces.
-object_meta = file_meta
+object_meta = file_meta + [
+ Config('flush', '0', r'''
+ indicates the time this object was flushed to shared storage or 0 if unflushed'''),
+]
table_only_config = [
Config('colgroups', '', r'''
diff --git a/src/third_party/wiredtiger/dist/s_comment.py b/src/third_party/wiredtiger/dist/s_comment.py
index 482fded4fff..92e4ef348da 100644
--- a/src/third_party/wiredtiger/dist/s_comment.py
+++ b/src/third_party/wiredtiger/dist/s_comment.py
@@ -104,11 +104,11 @@ for line in sys.stdin:
(sline[2].islower() or sline[2] == '_') and sline.endswith('--')):
function_desc = True
# We're only reformatting block comments where each line begins with a space and an
- # alphabetic character after the asterisk, or a parenthetical. The only exceptions
+ # normal comment character after the asterisk, or a parenthetical. The only exceptions
# are function descriptions.
block = block and \
len(sline) >= 3 and sline.startswith('*') and sline[1] == ' ' and \
- (sline[2].isalpha() or (len(sline) >= 5 and \
+ (sline[2].isalpha() or sline[2] == '"' or sline[2] == "'" or (len(sline) >= 5 and \
(sline[2] == '(' and sline[3].isalpha() and sline[4] != ')'))) or function_desc
# Trim asterisks at the beginning of each line in a multiline comment.
if sline.startswith('*'):
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
index 3e0f6af6581..34e0a9a8aa2 100644
--- a/src/third_party/wiredtiger/dist/s_define.list
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -32,6 +32,8 @@ WT_CLOCKDIFF_NS
WT_CONN_CHECK_PANIC
WT_DEADLOCK
WT_DEBUG_BYTE
+WT_DHANDLE_MAX_FLAG
+WT_DHANDLE_ZZZ_ENDFLAG
WT_ERR_ASSERT
WT_ERR_ERROR_OK
WT_EXT_FOREACH_OFF
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 9c6976641b0..db198204b10 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -1035,6 +1035,7 @@ mbss
mem
memalign
membar
+memcmp
memcpy
memdup
memget
@@ -1493,6 +1494,7 @@ wtstats
xF
xdeadbeef
xff
+xfff
xxxx
xxxxx
xxxxxx
diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void
index 95cf8ec5215..ab46f05c593 100755
--- a/src/third_party/wiredtiger/dist/s_void
+++ b/src/third_party/wiredtiger/dist/s_void
@@ -194,6 +194,8 @@ for f in `find bench ext src test -name '*.c' -o -name '*_inline.h'`; do
-e '/WT_ERR/d' \
-e '/WT_SYSCALL.*ret/d' \
-e '/WT_TRET/d' \
+ -e '/__wt_buf_catfmt/d' \
+ -e '/__wt_buf_fmt/d' \
-e 's/^\([^(]*\).*/\1/' \
-e 's/^ *//' > $t
test -s $t && {
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 34e5b020a4a..89a5578b362 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -551,7 +551,10 @@ connection_stats = [
TxnStat('txn_prepare_active', 'prepared transactions currently active'),
TxnStat('txn_prepare_commit', 'prepared transactions committed'),
TxnStat('txn_prepare_rollback', 'prepared transactions rolled back'),
- TxnStat('txn_prepared_updates_count', 'Number of prepared updates'),
+ TxnStat('txn_prepared_updates_committed', 'Number of prepared updates committed'),
+ TxnStat('txn_prepared_updates', 'Number of prepared updates'),
+ TxnStat('txn_prepared_updates_key_repeated', 'Number of prepared updates repeated on the same key'),
+ TxnStat('txn_prepared_updates_rolledback', 'Number of prepared updates rolled back'),
TxnStat('txn_query_ts', 'query timestamp calls'),
TxnStat('txn_rollback', 'transactions rolled back'),
TxnStat('txn_rollback_to_stable_running', 'transaction rollback to stable currently running', 'no_clear,no_scale'),
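
[editor's note] The new prepared-update counters can be read like any other connection statistic through a statistics cursor. The sketch below assumes the generated identifier follows the usual WT_STAT_CONN_<name> pattern (here WT_STAT_CONN_TXN_PREPARED_UPDATES); check the generated stat.h for the exact constant.

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

/* Print the "Number of prepared updates" counter; a minimal sketch, not test code from this change. */
static void
print_prepared_updates(WT_SESSION *session)
{
    WT_CURSOR *cursor;
    const char *desc, *pvalue;
    int64_t value;

    if (session->open_cursor(session, "statistics:", NULL, NULL, &cursor) != 0)
        return;
    /* WT_STAT_CONN_TXN_PREPARED_UPDATES is assumed to be the generated key for txn_prepared_updates. */
    cursor->set_key(cursor, WT_STAT_CONN_TXN_PREPARED_UPDATES);
    if (cursor->search(cursor) == 0 && cursor->get_value(cursor, &desc, &pvalue, &value) == 0)
        printf("%s: %" PRId64 "\n", desc, value);
    (void)cursor->close(cursor);
}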
diff --git a/src/third_party/wiredtiger/dist/test_data.py b/src/third_party/wiredtiger/dist/test_data.py
index d0c8d36e955..23667a35751 100644
--- a/src/third_party/wiredtiger/dist/test_data.py
+++ b/src/third_party/wiredtiger/dist/test_data.py
@@ -59,9 +59,9 @@ throttle_config = [
# Record config specifies the format of the keys and values used in the database
#
record_config = throttle_config + [
- Config('key_size', 0, r'''
+ Config('key_size', 5, r'''
The size of the keys created''', min=0, max=10000),
- Config('value_size', 0, r'''
+ Config('value_size', 5, r'''
The size of the values created''', min=0, max=1000000000),
]
@@ -79,27 +79,33 @@ populate_config = [
# A generic configuration used by various other configurations to define whether that component or
# similar is enabled or not.
#
-enable_config = [
+enabled_config_true = [
+ Config('enabled', 'true', r'''
+ Whether or not this is relevant to the workload''',
+ type='boolean'),
+]
+
+enabled_config_false = [
Config('enabled', 'false', r'''
Whether or not this is relevant to the workload''',
type='boolean'),
]
-stat_config = enable_config
+stat_config = enabled_config_false
limit_stat = stat_config + [
Config('limit', 0, r'''
- The limit value a statistic is allowed to reach''')
+ The limit value a statistic is allowed to reach''', min=0)
]
range_config = [
Config('min', 0, r'''
- The minimum a value can be in a range'''),
+ The minimum a value can be in a range''', min=0),
Config('max', 1, r'''
The maximum a value can be in a range''')
]
-component_config = enable_config + throttle_config
+component_config = enabled_config_true + throttle_config
transaction_config = [
Config('ops_per_transaction', '', r'''
@@ -122,16 +128,16 @@ runtime_monitor = component_config + [
# Configuration that applies to the timestamp_manager component.
#
timestamp_manager = component_config + [
- Config('oldest_lag', 0, r'''
+ Config('oldest_lag', 1, r'''
The duration between the stable and oldest timestamps''', min=0, max=1000000),
- Config('stable_lag', 0, r'''
+ Config('stable_lag', 1, r'''
The duration between the latest and stable timestamps''', min=0, max=1000000),
]
#
# Configuration that applies to the workload tracking component.
#
-workload_tracking = enable_config
+workload_tracking = component_config
#
# Configuration that applies to the workload_generator component.
@@ -173,7 +179,7 @@ test_config = [
The cache size that wiredtiger will be configured to run with''', min=0, max=100000000000),
Config('duration_seconds', 0, r'''
The duration that the test run will last''', min=0, max=1000000),
- Config('enable_logging', 'true', r'''
+ Config('enable_logging', 'false', r'''
Enables write ahead logs''', type='boolean'),
]
diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c
index fd4b9ca43b9..36de04005c0 100644
--- a/src/third_party/wiredtiger/examples/c/ex_all.c
+++ b/src/third_party/wiredtiger/examples/c/ex_all.c
@@ -1012,9 +1012,9 @@ connection_ops(WT_CONNECTION *conn)
*
* Functions are specified by name (for example, "wiredtiger_open").
*
- * Methods are specified using a concatenation of the handle name, a
- * period and the method name (for example, session create would be
- * "WT_SESSION.create" and cursor close would be WT_CURSOR.close").
+ * Methods are specified using a concatenation of the handle name, a period and the method name
+ * (for example, session create would be "WT_SESSION.create" and cursor close would be
+ * "WT_CURSOR.close").
*/
error_check(
wiredtiger_config_validate(NULL, NULL, "WT_SESSION.create", "allocation_size=32KB"));
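
[editor's note] A couple of further illustrations of the "handle.method" naming convention the reflowed comment describes, using other handles; the configuration strings below are illustrative only, not part of this change.

#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
    /* Validate configuration strings against "handle.method" names, as described above. */
    if (wiredtiger_config_validate(
          NULL, NULL, "WT_SESSION.begin_transaction", "isolation=snapshot") != 0)
        return (EXIT_FAILURE);
    if (wiredtiger_config_validate(NULL, NULL, "WT_CURSOR.reconfigure", "overwrite=false") != 0)
        return (EXIT_FAILURE);
    return (EXIT_SUCCESS);
}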
diff --git a/src/third_party/wiredtiger/examples/c/ex_backup.c b/src/third_party/wiredtiger/examples/c/ex_backup.c
index 9267bd1ccf6..23636f3494e 100644
--- a/src/third_party/wiredtiger/examples/c/ex_backup.c
+++ b/src/third_party/wiredtiger/examples/c/ex_backup.c
@@ -50,13 +50,12 @@ compare_backups(int i)
char buf[1024], msg[32];
/*
- * We run 'wt dump' on both the full backup directory and the
- * incremental backup directory for this iteration. Since running
- * 'wt' runs recovery and makes both directories "live", we need
- * a new directory for each iteration.
+ * We run 'wt dump' on both the full backup directory and the incremental backup directory for
+ * this iteration. Since running 'wt' runs recovery and makes both directories "live", we need a
+ * new directory for each iteration.
*
- * If i == 0, we're comparing against the main, original directory
- * with the final incremental directory.
+ * If i == 0, we're comparing against the main, original directory with the final incremental
+ * directory.
*/
if (i == 0)
(void)snprintf(
diff --git a/src/third_party/wiredtiger/examples/c/ex_backup_block.c b/src/third_party/wiredtiger/examples/c/ex_backup_block.c
index c935baf9c75..1cbbe1fbf09 100644
--- a/src/third_party/wiredtiger/examples/c/ex_backup_block.c
+++ b/src/third_party/wiredtiger/examples/c/ex_backup_block.c
@@ -65,13 +65,12 @@ compare_backups(int i)
char buf[1024], msg[32];
/*
- * We run 'wt dump' on both the full backup directory and the
- * incremental backup directory for this iteration. Since running
- * 'wt' runs recovery and makes both directories "live", we need
- * a new directory for each iteration.
+ * We run 'wt dump' on both the full backup directory and the incremental backup directory for
+ * this iteration. Since running 'wt' runs recovery and makes both directories "live", we need a
+ * new directory for each iteration.
*
- * If i == 0, we're comparing against the main, original directory
- * with the final incremental directory.
+ * If i == 0, we're comparing against the main, original directory with the final incremental
+ * directory.
*/
if (i == 0)
(void)snprintf(buf, sizeof(buf), "../../wt -R -h %s dump main > %s.%d", home, full_out, i);
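
[editor's note] The comparison the reflowed comment describes boils down to dumping each backup directory with the 'wt' utility and diffing the output. A minimal sketch with hypothetical paths; it is not the example's actual helper.

#include <stdio.h>
#include <stdlib.h>

static int
dump_and_compare(const char *full_home, const char *incr_home, const char *out1, const char *out2)
{
    char buf[1024];

    /*
     * Running 'wt' performs recovery and makes the directory "live", which is why the example
     * above dumps a fresh copy for each iteration.
     */
    (void)snprintf(buf, sizeof(buf), "../../wt -R -h %s dump main > %s", full_home, out1);
    if (system(buf) != 0)
        return (EXIT_FAILURE);
    (void)snprintf(buf, sizeof(buf), "../../wt -R -h %s dump main > %s", incr_home, out2);
    if (system(buf) != 0)
        return (EXIT_FAILURE);
    /* The backups match if the two dumps are identical. */
    (void)snprintf(buf, sizeof(buf), "cmp %s %s > /dev/null", out1, out2);
    return (system(buf) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}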
diff --git a/src/third_party/wiredtiger/examples/c/ex_data_source.c b/src/third_party/wiredtiger/examples/c/ex_data_source.c
index ae16b837d29..9ee94e6cfae 100644
--- a/src/third_party/wiredtiger/examples/c/ex_data_source.c
+++ b/src/third_party/wiredtiger/examples/c/ex_data_source.c
@@ -351,8 +351,7 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, const char *uri, WT_CO
int my_data_source_overwrite;
/*
- * Retrieve the value of the boolean type configuration string
- * "overwrite".
+ * Retrieve the value of the boolean type configuration string "overwrite".
*/
error_check(wt_api->config_get(wt_api, session, config, "overwrite", &v));
my_data_source_overwrite = v.val != 0;
@@ -367,8 +366,7 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, const char *uri, WT_CO
int64_t my_data_source_page_size;
/*
- * Retrieve the value of the integer type configuration string
- * "page_size".
+ * Retrieve the value of the integer type configuration string "page_size".
*/
error_check(wt_api->config_get(wt_api, session, config, "page_size", &v));
my_data_source_page_size = v.val;
@@ -383,8 +381,7 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, const char *uri, WT_CO
const char *my_data_source_key;
/*
- * Retrieve the value of the string type configuration string
- * "key_format".
+ * Retrieve the value of the string type configuration string "key_format".
*/
error_check(wt_api->config_get(wt_api, session, config, "key_format", &v));
diff --git a/src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c b/src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c
index 00f65988843..0f6a7cfe473 100644
--- a/src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c
+++ b/src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c
@@ -66,7 +66,6 @@ typedef struct {
* Locks are used to protect the file handle queue and flush queue.
*/
pthread_rwlock_t file_handle_lock;
- pthread_rwlock_t flush_lock;
/*
* Configuration values are set at startup.
@@ -87,7 +86,6 @@ typedef struct {
/* Queue of file handles */
TAILQ_HEAD(local_file_handle_qh, local_file_handle) fileq;
- TAILQ_HEAD(local_flush_qh, local_flush_item) flushq;
} LOCAL_STORAGE;
@@ -102,34 +100,13 @@ typedef struct {
char *auth_token; /* Identifier for key management system */
char *bucket_dir; /* Directory that stands in for cloud storage bucket */
char *cache_dir; /* Directory for pre-flushed objects and cached objects */
- char *fs_prefix; /* File system prefix, allowing for a "directory" within a bucket */
} LOCAL_FILE_SYSTEM;
-/*
- * Indicates a object that has not yet been flushed.
- */
-typedef struct local_flush_item {
- char *src_path; /* File name to copy from, object name and cache name derived from this */
-
- /*
- * These fields would be used in performing a flush.
- */
- char *auth_token; /* Identifier for key management system */
- char *bucket; /* Bucket name */
- char *cache_dir; /* Cache directory */
- char *fs_prefix; /* Prefix for file system */
- WT_FS_OPEN_FILE_TYPE file_type; /* File type */
-
- TAILQ_ENTRY(local_flush_item) q; /* Queue of items */
-} LOCAL_FLUSH_ITEM;
-
typedef struct local_file_handle {
WT_FILE_HANDLE iface; /* Must come first */
- LOCAL_STORAGE *local; /* Enclosing storage source */
- WT_FILE_HANDLE *fh; /* File handle */
- char *path; /* Path name of file */
- LOCAL_FLUSH_ITEM *flush; /* Flush information, set if newly created */
+ LOCAL_STORAGE *local; /* Enclosing storage source */
+ WT_FILE_HANDLE *fh; /* File handle */
TAILQ_ENTRY(local_file_handle) q; /* Queue of handles */
} LOCAL_FILE_HANDLE;
@@ -137,24 +114,28 @@ typedef struct local_file_handle {
/*
* Forward function declarations for internal functions
*/
+static int local_bucket_path(WT_FILE_SYSTEM *, const char *, char **);
+static int local_cache_path(WT_FILE_SYSTEM *, const char *, char **);
static int local_configure(LOCAL_STORAGE *, WT_CONFIG_ARG *);
static int local_configure_int(LOCAL_STORAGE *, WT_CONFIG_ARG *, const char *, uint32_t *);
static int local_delay(LOCAL_STORAGE *);
static int local_err(LOCAL_STORAGE *, WT_SESSION *, int, const char *, ...);
-static void local_flush_free(LOCAL_FLUSH_ITEM *);
+static int local_file_copy(
+ LOCAL_STORAGE *, WT_SESSION *, const char *, const char *, WT_FS_OPEN_FILE_TYPE);
static int local_get_directory(const char *, ssize_t len, char **);
-static int local_location_path(WT_FILE_SYSTEM *, const char *, char **);
+static int local_path(WT_FILE_SYSTEM *, const char *, const char *, char **);
static int local_writeable(LOCAL_STORAGE *, const char *name, bool *writeable);
/*
* Forward function declarations for storage source API implementation
*/
static int local_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *);
-static int local_customize_file_system(WT_STORAGE_SOURCE *, WT_SESSION *, const char *,
- const char *, const char *, const char *, WT_FILE_SYSTEM **);
+static int local_customize_file_system(
+ WT_STORAGE_SOURCE *, WT_SESSION *, const char *, const char *, const char *, WT_FILE_SYSTEM **);
static int local_flush(
- WT_STORAGE_SOURCE *, WT_SESSION *, WT_FILE_SYSTEM *, const char *, const char *);
-static int local_flush_one(LOCAL_STORAGE *, WT_SESSION *, LOCAL_FLUSH_ITEM *);
+ WT_STORAGE_SOURCE *, WT_SESSION *, WT_FILE_SYSTEM *, const char *, const char *, const char *);
+static int local_flush_finish(
+ WT_STORAGE_SOURCE *, WT_SESSION *, WT_FILE_SYSTEM *, const char *, const char *, const char *);
static int local_terminate(WT_STORAGE_SOURCE *, WT_SESSION *);
/*
@@ -296,23 +277,6 @@ local_err(LOCAL_STORAGE *local, WT_SESSION *session, int ret, const char *format
}
/*
- * local_flush_free --
- * Free storage for a flush item.
- */
-static void
-local_flush_free(LOCAL_FLUSH_ITEM *flush)
-{
- if (flush != NULL) {
- free(flush->auth_token);
- free(flush->bucket);
- free(flush->cache_dir);
- free(flush->fs_prefix);
- free(flush->src_path);
- free(flush);
- }
-}
-
-/*
* local_get_directory --
* Return a copy of a directory name after verifying that it is a directory.
*/
@@ -363,19 +327,37 @@ local_writeable(LOCAL_STORAGE *local, const char *name, bool *writeablep)
}
/*
- * local_location_path --
+ * local_bucket_path --
+ * Construct the bucket pathname from the file system and local name.
+ */
+static int
+local_bucket_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp)
+{
+ return (local_path(file_system, ((LOCAL_FILE_SYSTEM *)file_system)->bucket_dir, name, pathp));
+}
+
+/*
+ * local_cache_path --
+ * Construct the cache pathname from the file system and local name.
+ */
+static int
+local_cache_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp)
+{
+ return (local_path(file_system, ((LOCAL_FILE_SYSTEM *)file_system)->cache_dir, name, pathp));
+}
+
+/*
+ * local_path --
* Construct a pathname from the file system and local name.
*/
-int
-local_location_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp)
+static int
+local_path(WT_FILE_SYSTEM *file_system, const char *dir, const char *name, char **pathp)
{
- LOCAL_FILE_SYSTEM *local_fs;
size_t len;
int ret;
char *p;
ret = 0;
- local_fs = (LOCAL_FILE_SYSTEM *)file_system;
/* Skip over "./" and variations (".//", ".///./././//") at the beginning of the name. */
while (*name == '.') {
@@ -385,10 +367,10 @@ local_location_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp)
while (*name == '/')
name++;
}
- len = strlen(local_fs->cache_dir) + strlen(local_fs->fs_prefix) + strlen(name) + 2;
+ len = strlen(dir) + strlen(name) + 2;
if ((p = malloc(len)) == NULL)
- return (local_err(FS2LOCAL(file_system), NULL, ENOMEM, "local_location_path"));
- snprintf(p, len, "%s/%s%s", local_fs->cache_dir, local_fs->fs_prefix, name);
+ return (local_err(FS2LOCAL(file_system), NULL, ENOMEM, "local_path"));
+ snprintf(p, len, "%s/%s", dir, name);
*pathp = p;
return (ret);
}
@@ -399,7 +381,7 @@ local_location_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp)
*/
static int
local_customize_file_system(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session,
- const char *bucket_name, const char *prefix, const char *auth_token, const char *config,
+ const char *bucket_name, const char *auth_token, const char *config,
WT_FILE_SYSTEM **file_systemp)
{
LOCAL_STORAGE *local;
@@ -470,10 +452,6 @@ local_customize_file_system(WT_STORAGE_SOURCE *storage_source, WT_SESSION *sessi
local_err(local, session, ret, "%*s: cache directory", (int)cachedir.len, cachedir.str);
goto err;
}
- if ((fs->fs_prefix = strdup(prefix)) == NULL) {
- ret = local_err(local, session, ENOMEM, "local_file_system.prefix");
- goto err;
- }
fs->file_system.fs_directory_list = local_directory_list;
fs->file_system.fs_directory_list_single = local_directory_list_single;
fs->file_system.fs_directory_list_free = local_directory_list_free;
@@ -491,7 +469,6 @@ err:
free(fs->auth_token);
free(fs->bucket_dir);
free(fs->cache_dir);
- free(fs->fs_prefix);
free(fs);
}
return (ret);
@@ -521,7 +498,7 @@ local_exist(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name,
ret = local_err(local, session, errno, "%s: ss_exist stat", path);
local->op_count++;
- if ((ret = local_location_path(file_system, name, &path)) != 0)
+ if ((ret = local_cache_path(file_system, name, &path)) != 0)
goto err;
ret = stat(path, &sb);
@@ -539,149 +516,46 @@ err:
}
/*
- * local_flush --
- * Return when the files have been flushed.
+ * local_file_copy --
+ * Copy a file.
*/
static int
-local_flush(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, WT_FILE_SYSTEM *file_system,
- const char *name, const char *config)
-{
- LOCAL_STORAGE *local;
- LOCAL_FLUSH_ITEM *flush, *safe_flush;
- int ret, t_ret;
- char *match;
-
- (void)config; /* Unused */
-
- /*
- * This implementation does not do anything meaningful on flush. However, we do track which
- * objects have not yet been flushed and note which ones need to be flushed now.
- */
- ret = 0;
- local = (LOCAL_STORAGE *)storage_source;
- match = NULL;
-
- if (file_system == NULL && name != NULL)
- return local_err(local, session, EINVAL, "flush: cannot specify name without file system");
-
- local->op_count++;
- if (file_system != NULL) {
- if ((ret = local_location_path(file_system, name == NULL ? "" : name, &match)) != 0)
- goto err;
- }
- VERBOSE(local, "Flush: match=%s\n", SHOW_STRING(match));
-
- /*
- * Note: we retain the lock on the data structure while flushing all entries. This is fine for
- * our local file implementation, when we don't have to do anything to flush, but for a cloud
- * implementation, we'll want some way to not hold the lock while transferring data.
- */
- if ((ret = pthread_rwlock_wrlock(&local->flush_lock)) != 0) {
- (void)local_err(local, session, ret, "flush: pthread_rwlock_wrlock");
- goto err;
- }
-
- TAILQ_FOREACH_SAFE(flush, &local->flushq, q, safe_flush)
- {
- if (match != NULL) {
- /*
- * We must match against the bucket and the name if given.
- * Our match string is of the form:
- * <bucket_name>/<fs_prefix><name>
- *
- * If name is given, we must match the entire path.
- * If name is not given, we must match up to the beginning
- * of the name.
- */
- if (name != NULL) {
- /* Exact name match required. */
- if (strcmp(flush->src_path, match) != 0)
- continue;
- }
- /* No name specified, everything up to the name must match. */
- else if (strncmp(flush->src_path, match, strlen(match)) != 0)
- continue;
- }
- if ((t_ret = local_flush_one(local, session, flush)) != 0 && ret == 0)
- ret = t_ret;
- TAILQ_REMOVE(&local->flushq, flush, q);
- local_flush_free(flush);
- }
-
- if ((t_ret = pthread_rwlock_unlock(&local->flush_lock)) != 0) {
- (void)local_err(local, session, t_ret, "flush: pthread_rwlock_unlock");
- if (ret == 0)
- ret = t_ret;
- }
-
-err:
- free(match);
-
- return (ret);
-}
-
-/*
- * local_flush_one --
- * Flush one item on the flush queue.
- */
-static int
-local_flush_one(LOCAL_STORAGE *local, WT_SESSION *session, LOCAL_FLUSH_ITEM *flush)
+local_file_copy(LOCAL_STORAGE *local, WT_SESSION *session, const char *src_path,
+ const char *dest_path, WT_FS_OPEN_FILE_TYPE type)
{
WT_FILE_HANDLE *dest, *src;
WT_FILE_SYSTEM *wt_fs;
wt_off_t copy_size, file_size, left;
+ ssize_t pos;
int ret, t_ret;
- char *object_name;
char buffer[1024 * 64];
- char dest_path[1024];
- ssize_t pos;
- ret = 0;
- src = dest = NULL;
-
- object_name = strrchr(flush->src_path, '/');
- if (object_name == NULL) {
- ret = local_err(local, session, errno, "%s: unexpected src path", flush->src_path);
- goto err;
- }
- object_name++;
-
- /*
- * Here's where we flush the file to the cloud. This "local" implementation copies the file to
- * the bucket directory.
- */
- VERBOSE(local, "Flush object: from=%s, bucket=%s, object=%s, auth_token=%s, \n",
- flush->src_path, flush->bucket, object_name, flush->auth_token);
-
- if ((ret = local_delay(local)) != 0)
- goto err;
+ dest = src = NULL;
if ((ret = local->wt_api->file_system_get(local->wt_api, session, &wt_fs)) != 0) {
ret =
local_err(local, session, ret, "local_file_system: cannot get WiredTiger file system");
goto err;
}
- snprintf(dest_path, sizeof(dest_path), "%s/%s", flush->bucket, object_name);
-
- if ((ret = wt_fs->fs_open_file(
- wt_fs, session, flush->src_path, flush->file_type, WT_FS_OPEN_READONLY, &src)) != 0) {
- ret = local_err(local, session, ret, "%s: cannot open for read", flush->src_path);
+ if ((ret = wt_fs->fs_open_file(wt_fs, session, src_path, type, WT_FS_OPEN_READONLY, &src)) !=
+ 0) {
+ ret = local_err(local, session, ret, "%s: cannot open for read", src_path);
goto err;
}
- if ((ret = wt_fs->fs_open_file(
- wt_fs, session, dest_path, flush->file_type, WT_FS_OPEN_CREATE, &dest)) != 0) {
+ if ((ret = wt_fs->fs_open_file(wt_fs, session, dest_path, type, WT_FS_OPEN_CREATE, &dest)) !=
+ 0) {
ret = local_err(local, session, ret, "%s: cannot create", dest_path);
goto err;
}
- if ((ret = wt_fs->fs_size(wt_fs, session, flush->src_path, &file_size)) != 0) {
- ret = local_err(local, session, ret, "%s: cannot get size", flush->src_path);
+ if ((ret = wt_fs->fs_size(wt_fs, session, src_path, &file_size)) != 0) {
+ ret = local_err(local, session, ret, "%s: cannot get size", src_path);
goto err;
}
for (pos = 0, left = file_size; left > 0; pos += copy_size, left -= copy_size) {
copy_size = left < (wt_off_t)sizeof(buffer) ? left : (wt_off_t)sizeof(buffer);
if ((ret = src->fh_read(src, session, pos, (size_t)copy_size, buffer)) != 0) {
- ret = local_err(local, session, ret, "%s: cannot read", flush->src_path);
+ ret = local_err(local, session, ret, "%s: cannot read", src_path);
goto err;
}
if ((ret = dest->fh_write(dest, session, pos, (size_t)copy_size, buffer)) != 0) {
@@ -689,16 +563,7 @@ local_flush_one(LOCAL_STORAGE *local, WT_SESSION *session, LOCAL_FLUSH_ITEM *flu
goto err;
}
}
- if ((ret = dest->fh_sync(dest, session)) != 0) {
- ret = local_err(local, session, ret, "%s: cannot sync", dest_path);
- goto err;
- }
- local->object_flushes++;
-
err:
- /* When we're done with flushing this file, set the file to readonly. */
- if (ret == 0 && (ret = chmod(flush->src_path, 0444)) < 0)
- ret = local_err(local, session, errno, "%s: chmod flushed file failed", flush->src_path);
if (src != NULL && (t_ret = src->close(src, session)) != 0)
if (ret == 0)
ret = t_ret;
@@ -710,6 +575,79 @@ err:
}
/*
+ * local_flush --
+ * Return when the file has been flushed.
+ */
+static int
+local_flush(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, WT_FILE_SYSTEM *file_system,
+ const char *source, const char *object, const char *config)
+{
+ LOCAL_STORAGE *local;
+ int ret;
+ char *dest_path;
+
+ (void)config; /* unused */
+ dest_path = NULL;
+ local = (LOCAL_STORAGE *)storage_source;
+ ret = 0;
+
+ if (file_system == NULL || source == NULL || object == NULL)
+ return local_err(local, session, EINVAL, "ss_flush_finish: required arguments missing");
+
+ if ((ret = local_bucket_path(file_system, object, &dest_path)) != 0)
+ goto err;
+
+ if ((ret = local_delay(local)) != 0)
+ goto err;
+
+ if ((ret = local_file_copy(local, session, source, dest_path, WT_FS_OPEN_FILE_TYPE_DATA)) != 0)
+ goto err;
+
+ local->object_flushes++;
+
+err:
+ free(dest_path);
+ return (ret);
+}
+
+/*
+ * local_flush_finish --
+ * Move a file from the default file system to the cache in the new file system.
+ */
+static int
+local_flush_finish(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session,
+ WT_FILE_SYSTEM *file_system, const char *source, const char *object, const char *config)
+{
+ LOCAL_STORAGE *local;
+ int ret;
+ char *dest_path;
+
+ (void)config; /* unused */
+ dest_path = NULL;
+ local = (LOCAL_STORAGE *)storage_source;
+ ret = 0;
+
+ if (file_system == NULL || source == NULL || object == NULL)
+ return local_err(local, session, EINVAL, "ss_flush_finish: required arguments missing");
+
+ if ((ret = local_cache_path(file_system, object, &dest_path)) != 0)
+ goto err;
+
+ local->op_count++;
+ if ((ret = rename(source, dest_path)) != 0) {
+ ret = local_err(
+ local, session, errno, "ss_flush_finish rename %s to %s failed", source, dest_path);
+ goto err;
+ }
+ /* Set the file to readonly in the cache. */
+ if (ret == 0 && (ret = chmod(dest_path, 0444)) < 0)
+ ret = local_err(local, session, errno, "%s: ss_flush_finish chmod failed", dest_path);
+err:
+ free(dest_path);
+ return (ret);
+}
+
+/*
* local_directory_list --
* Return a list of object names for the given location.
*/
@@ -791,9 +729,8 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
struct dirent *dp;
DIR *dirp;
LOCAL_FILE_SYSTEM *local_fs;
- LOCAL_FLUSH_ITEM *flush;
LOCAL_STORAGE *local;
- size_t dir_len, fs_prefix_len, prefix_len;
+ size_t dir_len, prefix_len;
uint32_t allocated, count;
int ret, t_ret;
char **entries;
@@ -803,7 +740,6 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
local = local_fs->local_storage;
entries = NULL;
allocated = count = 0;
- fs_prefix_len = strlen(local_fs->fs_prefix);
dir_len = (directory == NULL ? 0 : strlen(directory));
prefix_len = (prefix == NULL ? 0 : strlen(prefix));
ret = 0;
@@ -811,6 +747,9 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
*dirlistp = NULL;
*countp = 0;
+ /*
+ * We list items in the cache directory (these have 'finished' flushing).
+ */
if ((dirp = opendir(local_fs->cache_dir)) == NULL) {
ret = errno;
if (ret == 0)
@@ -819,9 +758,6 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
local_err(local, session, ret, "%s: ss_directory_list: opendir", local_fs->cache_dir));
}
- /*
- * We list items in the cache directory as well as items in the "to be flushed" list.
- */
for (count = 0; (dp = readdir(dirp)) != NULL && (limit == 0 || count < limit);) {
/* Skip . and .. */
basename = dp->d_name;
@@ -833,36 +769,6 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
continue;
basename += dir_len;
- /* Skip files not associated with our file system prefix. */
- if (strncmp(basename, local_fs->fs_prefix, fs_prefix_len) != 0)
- continue;
-
- basename += fs_prefix_len;
- /* The list of files is optionally filtered by a prefix. */
- if (prefix != NULL && strncmp(basename, prefix, prefix_len) != 0)
- continue;
-
- if ((ret = local_directory_list_add(local, &entries, basename, count, &allocated)) != 0)
- goto err;
- count++;
- }
-
- TAILQ_FOREACH (flush, &local->flushq, q) {
- if (limit != 0 && count >= limit)
- break;
-
- /* Skip files not associated with this file system. */
- if (strcmp(local_fs->bucket_dir, flush->bucket) != 0 ||
- strcmp(local_fs->cache_dir, flush->cache_dir) != 0 ||
- strcmp(local_fs->fs_prefix, flush->fs_prefix) != 0)
- continue;
-
- basename = strrchr(flush->src_path, '/');
- if (basename == NULL)
- basename = flush->src_path;
- else
- basename++;
-
/* The list of files is optionally filtered by a prefix. */
if (prefix != NULL && strncmp(basename, prefix, prefix_len) != 0)
continue;
@@ -909,7 +815,6 @@ local_fs_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *session)
free(local_fs->auth_token);
free(local_fs->bucket_dir);
free(local_fs->cache_dir);
- free(local_fs->fs_prefix);
free(file_system);
return (0);
@@ -925,12 +830,13 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name,
{
LOCAL_FILE_HANDLE *local_fh;
LOCAL_FILE_SYSTEM *local_fs;
- LOCAL_FLUSH_ITEM *flush;
LOCAL_STORAGE *local;
WT_FILE_HANDLE *file_handle, *wt_fh;
WT_FILE_SYSTEM *wt_fs;
struct stat sb;
int ret;
+ char *alloced_path;
+ const char *path;
bool create, exists;
(void)flags; /* Unused */
@@ -941,6 +847,7 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name,
local_fs = (LOCAL_FILE_SYSTEM *)file_system;
local = local_fs->local_storage;
wt_fs = local_fs->wt_fs;
+ alloced_path = NULL;
/*
* We expect that the local file system will be used narrowly, like when creating or opening a
@@ -972,18 +879,16 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name,
exists = (ret == 0);
} else
exists = false;
- if (create || exists) {
+ if (create || exists)
/* The file has not been flushed, use the file directly in the file system. */
- if ((local_fh->path = strdup(name)) == NULL) {
- ret = local_err(local, session, ENOMEM, "local_open");
- goto err;
- }
- } else {
- if ((ret = local_location_path(file_system, name, &local_fh->path)) != 0)
+ path = name;
+ else {
+ if ((ret = local_cache_path(file_system, name, &alloced_path)) != 0)
goto err;
- ret = stat(local_fh->path, &sb);
+ path = alloced_path;
+ ret = stat(path, &sb);
if (ret != 0 && errno != ENOENT) {
- ret = local_err(local, session, errno, "%s: local_open stat", local_fh->path);
+ ret = local_err(local, session, errno, "%s: local_open stat", path);
goto err;
}
exists = (ret == 0);
@@ -997,35 +902,8 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name,
}
#endif
- if (create && !exists) {
- if ((flush = calloc(1, sizeof(LOCAL_FLUSH_ITEM))) == NULL) {
- ret = ENOMEM;
- goto err;
- }
- local_fh->flush = flush;
-
- if ((flush->auth_token = strdup(local_fs->auth_token)) == NULL) {
- ret = local_err(local, session, ENOMEM, "open.auth_token");
- goto err;
- }
- if ((flush->bucket = strdup(local_fs->bucket_dir)) == NULL) {
- ret = local_err(local, session, ENOMEM, "open.bucket");
- goto err;
- }
- if ((flush->cache_dir = strdup(local_fs->cache_dir)) == NULL) {
- ret = local_err(local, session, ENOMEM, "open.cache_dir");
- goto err;
- }
- if ((flush->fs_prefix = strdup(local_fs->fs_prefix)) == NULL) {
- ret = local_err(local, session, ENOMEM, "open.fs_prefix");
- goto err;
- }
- flush->file_type = file_type;
- }
-
- if ((ret = wt_fs->fs_open_file(wt_fs, session, local_fh->path, file_type, flags, &wt_fh)) !=
- 0) {
- ret = local_err(local, session, ret, "ss_open_object: open: %s", local_fh->path);
+ if ((ret = wt_fs->fs_open_file(wt_fs, session, path, file_type, flags, &wt_fh)) != 0) {
+ ret = local_err(local, session, ret, "ss_open_object: open: %s", path);
goto err;
}
local_fh->fh = wt_fh;
@@ -1071,9 +949,10 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name,
*file_handlep = file_handle;
VERBOSE(
- local, "File opened: %s final path=%s\n", SHOW_STRING(name), SHOW_STRING(local_fh->path));
+ local, "File opened: %s final path=%s\n", SHOW_STRING(name), SHOW_STRING(local_fh->fh->name));
err:
+ free(alloced_path);
if (ret != 0) {
if (local_fh != NULL)
local_file_close_internal(local, session, local_fh);
@@ -1093,11 +972,9 @@ local_rename(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *from,
uint32_t flags)
{
LOCAL_FILE_SYSTEM *local_fs;
- LOCAL_FLUSH_ITEM *flush;
LOCAL_STORAGE *local;
WT_FILE_SYSTEM *wt_fs;
- int ret, t_ret;
- char *copy;
+ int ret;
bool writeable;
local = FS2LOCAL(file_system);
@@ -1117,33 +994,6 @@ local_rename(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *from,
goto err;
}
- /*
- * Find any flush entry that matches, and rename that too.
- */
- if ((ret = pthread_rwlock_wrlock(&local->flush_lock)) != 0) {
- ret = local_err(local, session, ret, "ss_remove: pthread_rwlock_wrlock");
- goto err;
- }
-
- TAILQ_FOREACH (flush, &local->flushq, q) {
- if (strcmp(flush->src_path, from) == 0) {
- if ((copy = strdup(to)) == NULL)
- ret = ENOMEM;
- else {
- free(flush->src_path);
- flush->src_path = copy;
- }
- break;
- }
- }
-
- if ((t_ret = pthread_rwlock_unlock(&local->flush_lock)) != 0) {
- (void)local_err(local, session, t_ret, "ss_remove: pthread_rwlock_unlock");
- if (ret == 0)
- ret = t_ret;
- goto err;
- }
-
err:
return (ret);
}
@@ -1157,7 +1007,6 @@ err:
static int
local_remove(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, uint32_t flags)
{
- LOCAL_FLUSH_ITEM *flush;
LOCAL_STORAGE *local;
int ret;
bool writeable;
@@ -1180,27 +1029,6 @@ local_remove(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name,
goto err;
}
- /*
- * Find any flush entry that matches, and remove that too.
- */
- if ((ret = pthread_rwlock_wrlock(&local->flush_lock)) != 0) {
- ret = local_err(local, session, ret, "ss_remove: pthread_rwlock_wrlock");
- goto err;
- }
-
- TAILQ_FOREACH (flush, &local->flushq, q) {
- if (strcmp(flush->src_path, name) == 0) {
- TAILQ_REMOVE(&local->flushq, flush, q);
- local_flush_free(flush);
- break;
- }
- }
-
- if ((ret = pthread_rwlock_unlock(&local->flush_lock)) != 0) {
- ret = local_err(local, session, ret, "ss_remove: pthread_rwlock_unlock");
- goto err;
- }
-
err:
return (ret);
}
@@ -1226,7 +1054,7 @@ local_size(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, w
ret = stat(name, &sb);
if (ret == ENOENT) {
/* Otherwise, we'll see if it's in the cache directory. */
- if ((ret = local_location_path(file_system, name, &path)) != 0)
+ if ((ret = local_cache_path(file_system, name, &path)) != 0)
goto err;
ret = stat(path, &sb);
@@ -1282,7 +1110,6 @@ local_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session)
{
LOCAL_STORAGE *local;
LOCAL_FILE_HANDLE *local_fh;
- LOCAL_FLUSH_ITEM *flush;
int ret, t_ret;
ret = 0;
@@ -1299,28 +1126,6 @@ local_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session)
if ((ret = pthread_rwlock_unlock(&local->file_handle_lock)) != 0)
(void)local_err(local, session, ret, "file handle close: pthread_rwlock_unlock");
- /*
- * If we need to track flushes for this file, save the flush item on our queue.
- */
- if (ret == 0 && ((flush = local_fh->flush)) != NULL) {
- if ((ret = pthread_rwlock_wrlock(&local->flush_lock)) != 0)
- (void)local_err(local, session, ret, "file handle close: pthread_rwlock_wrlock2");
-
- if (ret == 0) {
- /*
- * Move the flush object from the file handle and to the flush queue. It is now owned by
- * the flush queue and will be freed when that item is flushed.
- */
- TAILQ_INSERT_HEAD(&local->flushq, flush, q);
- local_fh->flush = NULL;
-
- if ((ret = pthread_rwlock_unlock(&local->flush_lock)) != 0)
- (void)local_err(local, session, ret, "file handle close: pthread_rwlock_unlock2");
- if (ret == 0 && ((flush->src_path = strdup(local_fh->path)) == NULL))
- ret = ENOMEM;
- }
- }
-
if ((t_ret = local_file_close_internal(local, session, local_fh)) != 0) {
if (ret == 0)
ret = t_ret;
@@ -1344,8 +1149,6 @@ local_file_close_internal(LOCAL_STORAGE *local, WT_SESSION *session, LOCAL_FILE_
if (wt_fh != NULL && (ret = wt_fh->close(wt_fh, session)) != 0)
ret = local_err(local, session, ret, "WT_FILE_HANDLE->close: close");
- local_flush_free(local_fh->flush);
- free(local_fh->path);
free(local_fh->iface.name);
free(local_fh);
@@ -1451,8 +1254,7 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
if ((local = calloc(1, sizeof(LOCAL_STORAGE))) == NULL)
return (errno);
local->wt_api = connection->get_extension_api(connection);
- if ((ret = pthread_rwlock_init(&local->file_handle_lock, NULL)) != 0 ||
- (ret = pthread_rwlock_init(&local->flush_lock, NULL)) != 0) {
+ if ((ret = pthread_rwlock_init(&local->file_handle_lock, NULL)) != 0) {
(void)local_err(local, NULL, ret, "pthread_rwlock_init");
free(local);
return (ret);
@@ -1464,6 +1266,7 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
*/
local->storage_source.ss_customize_file_system = local_customize_file_system;
local->storage_source.ss_flush = local_flush;
+ local->storage_source.ss_flush_finish = local_flush_finish;
local->storage_source.terminate = local_terminate;
if ((ret = local_configure(local, config)) != 0) {
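
[editor's note] With the flush queue removed, flushing the local storage source is now a two-step call: ss_flush copies the object into the bucket directory, and ss_flush_finish moves the local file into the cache and marks it readonly. A minimal sketch of a caller driving the new API, using the signatures added in this change; the object name is hypothetical and obtaining the storage source and file system handles is outside the sketch.

#include <stddef.h>
#include <wiredtiger.h>

static int
flush_one_object(WT_STORAGE_SOURCE *ss, WT_SESSION *session, WT_FILE_SYSTEM *fs)
{
    int ret;
    const char *source = "./WiredTiger-0001.wtobj"; /* hypothetical local file */
    const char *object = "WiredTiger-0001.wtobj";   /* hypothetical object name */

    /* Step 1: copy the local file into the bucket directory. */
    if ((ret = ss->ss_flush(ss, session, fs, source, object, NULL)) != 0)
        return (ret);
    /* Step 2: move the local file into the cache and mark it readonly. */
    return (ss->ss_flush_finish(ss, session, fs, source, object, NULL));
}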
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 392d1bb1861..2c1e2219e30 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "aadac222429faa9b20d9344e3648a19be97811b9"
+ "commit": "bae0c1c914bc0fa92f3775c08650b65663094034"
}
diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger.i b/src/third_party/wiredtiger/lang/python/wiredtiger.i
index 3dbf69d35c2..0105a5a70d6 100644
--- a/src/third_party/wiredtiger/lang/python/wiredtiger.i
+++ b/src/third_party/wiredtiger/lang/python/wiredtiger.i
@@ -986,14 +986,19 @@ typedef int int_void;
%enddef
SIDESTEP_METHOD(__wt_storage_source, ss_customize_file_system,
- (WT_SESSION *session, const char *bucket_name, const char *prefix,
+ (WT_SESSION *session, const char *bucket_name,
const char *auth_token, const char *config, WT_FILE_SYSTEM **file_systemp),
- (self, session, bucket_name, prefix, auth_token, config, file_systemp))
+ (self, session, bucket_name, auth_token, config, file_systemp))
SIDESTEP_METHOD(__wt_storage_source, ss_flush,
(WT_SESSION *session, WT_FILE_SYSTEM *file_system,
- const char *name, const char *config),
- (self, session, file_system, name, config))
+ const char *source, const char *object, const char *config),
+ (self, session, file_system, source, object, config))
+
+SIDESTEP_METHOD(__wt_storage_source, ss_flush_finish,
+ (WT_SESSION *session, WT_FILE_SYSTEM *file_system,
+ const char *source, const char *object, const char *config),
+ (self, session, file_system, source, object, config))
SIDESTEP_METHOD(__wt_storage_source, terminate,
(WT_SESSION *session),
diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c
index cc8a46a196a..15295bc02d3 100644
--- a/src/third_party/wiredtiger/src/block/block_addr.c
+++ b/src/third_party/wiredtiger/src/block/block_addr.c
@@ -28,16 +28,13 @@ __block_buffer_to_addr(WT_BLOCK *block, const uint8_t **pp, uint32_t *logidp, wt
WT_RET(__wt_vunpack_uint(pp, 0, &c));
/*
- * To avoid storing large offsets, we minimize the value by subtracting
- * a block for description information, then storing a count of block
- * allocation units. That implies there is no such thing as an
- * "invalid" offset though, they could all be valid (other than very
- * large numbers), which is what we didn't want to store in the first
- * place. Use the size: writing a block of size 0 makes no sense, so
- * that's the out-of-band value. Once we're out of this function and
- * are working with a real file offset, size and checksum triplet, there
- * can be invalid offsets, that's simpler than testing sizes of 0 all
- * over the place.
+ * To avoid storing large offsets, we minimize the value by subtracting a block for description
+ * information, then storing a count of block allocation units. That implies there is no such
+ * thing as an "invalid" offset though, they could all be valid (other than very large numbers),
+ * which is what we didn't want to store in the first place. Use the size: writing a block of
+ * size 0 makes no sense, so that's the out-of-band value. Once we're out of this function and
+ * are working with a real file offset, size and checksum triplet, there can be invalid offsets,
+ * that's simpler than testing sizes of 0 all over the place.
*/
if (s == 0) {
*offsetp = 0;
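
[editor's note] The arithmetic the reflowed comment describes is compact enough to sketch on its own: offsets are stored as a count of allocation units minus one leading description block, and a stored size of 0 is the out-of-band "no block" value. The helpers below are illustrative only, not the internal pack/unpack routines.

#include <stdint.h>

/* Pack a file offset/size pair into allocation-unit counts (checksum omitted). */
static void
addr_pack(uint32_t allocsize, int64_t offset, uint32_t size, uint64_t *op, uint64_t *sp)
{
    if (size == 0) {
        *op = *sp = 0; /* out-of-band: no block */
        return;
    }
    *op = (uint64_t)offset / allocsize - 1; /* drop the leading description block */
    *sp = size / allocsize;
}

/* Reverse the packing back into a real file offset and size. */
static void
addr_unpack(uint32_t allocsize, uint64_t o, uint64_t s, int64_t *offsetp, uint32_t *sizep)
{
    if (s == 0) {
        *offsetp = 0;
        *sizep = 0;
        return;
    }
    *offsetp = (int64_t)(o + 1) * allocsize;
    *sizep = (uint32_t)(s * allocsize);
}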
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
index 0deaef04654..310330da831 100644
--- a/src/third_party/wiredtiger/src/block/block_ext.c
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -280,18 +280,16 @@ __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *list
return (0);
/*
- * Verify a block the btree engine thinks it "owns" doesn't appear on
- * the available or discard lists (it might reasonably be on the alloc
- * list, if it was allocated since the last checkpoint). The engine
- * "owns" a block if it's trying to read or free the block, and those
+ * Verify a block the btree engine thinks it "owns" doesn't appear on the available or discard
+ * lists (it might reasonably be on the alloc list, if it was allocated since the last
+ * checkpoint). The engine "owns" a block if it's trying to read or free the block, and those
* functions make this check.
*
* Any block being read or freed should not be "available".
*
- * Any block being read or freed in the live system should not be on the
- * discard list. (A checkpoint handle might be reading a block which is
- * on the live system's discard list; any attempt to free a block from a
- * checkpoint handle has already failed.)
+ * Any block being read or freed in the live system should not be on the discard list. (A
+ * checkpoint handle might be reading a block which is on the live system's discard list; any
+ * attempt to free a block from a checkpoint handle has already failed.)
*/
__wt_spin_lock(session, &block->live_lock);
if (__block_off_match(&block->live.avail, offset, size))
diff --git a/src/third_party/wiredtiger/src/block/block_tiered.c b/src/third_party/wiredtiger/src/block/block_tiered.c
index 776b2a127ad..d922a663e03 100644
--- a/src/third_party/wiredtiger/src/block/block_tiered.c
+++ b/src/third_party/wiredtiger/src/block/block_tiered.c
@@ -52,7 +52,6 @@ __wt_block_tiered_newfile(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
WT_DECL_ITEM(tmp);
WT_DECL_RET;
- WT_STORAGE_SOURCE *storage_source;
const char *filename;
/* Get the old file name again. */
@@ -62,21 +61,8 @@ __wt_block_tiered_newfile(WT_SESSION_IMPL *session, WT_BLOCK *block)
* TODO: tiered: We will get rid of the log id, and this name generation will be replaced by the
* name generated by __tiered_switch.
*/
- WT_ERR(__wt_buf_fmt(session, tmp, "%s.%08" PRIu32, block->name, block->logid));
- filename = tmp->data;
WT_ERR(__wt_close(session, &block->fh));
- /*
- * TODO: tiered: Assert that session->bucket_storage is not NULL. We can't do that while we have
- * tests that use block_allocation=log without setting up bucket storage. This whole function is
- * going to look very different when flush_tier is fully integrated.
- */
- if (session->bucket_storage != NULL && block->logid != 0) {
- storage_source = session->bucket_storage->storage_source;
- WT_ASSERT(session, storage_source != NULL);
- WT_ERR(storage_source->ss_flush(
- storage_source, &session->iface, session->bucket_storage->file_system, filename, NULL));
- }
/* Bump to a new file ID. */
++block->logid;
WT_ERR(__wt_buf_fmt(session, tmp, "%s.%08" PRIu32, block->name, block->logid));
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index a2b7f161d3e..5f3e03bde3e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -314,7 +314,6 @@ __cursor_row_next(
WT_PAGE *page;
WT_ROW *rip;
WT_SESSION_IMPL *session;
- bool kpack_used;
session = CUR2S(cbt);
page = cbt->ref->page;
@@ -402,7 +401,7 @@ restart_read_insert:
cbt->slot = cbt->row_iteration_slot / 2 - 1;
restart_read_page:
rip = &page->pg_row[cbt->slot];
- WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used));
+ WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack));
/*
* If the cursor has prefix search configured we can early exit here if the key that we are
* visiting is after our prefix.
@@ -679,6 +678,8 @@ __wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
page = cbt->ref == NULL ? NULL : cbt->ref->page;
if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ /* The page cannot be NULL if the above flag is set. */
+ WT_ASSERT(session, page != NULL);
switch (page->type) {
case WT_PAGE_COL_FIX:
ret = __cursor_fix_append_next(cbt, newpage, restart);
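
The prefix early-exit above is internal to the btree cursor; as an application-level illustration only (not the internal mechanism), a prefix-bounded scan can be written against the public cursor API, stopping as soon as a returned key no longer starts with the wanted bytes. Assumes a cursor opened on a table with key_format=u.

#include <string.h>
#include <wiredtiger.h>

/* Illustrative only: scan forward from a prefix and stop at the first key past it. */
static int
scan_prefix(WT_CURSOR *cursor, const void *prefix, size_t prefix_len)
{
    WT_ITEM key, want;
    int exact, ret;

    memset(&want, 0, sizeof(want));
    want.data = prefix;
    want.size = prefix_len;
    cursor->set_key(cursor, &want);

    /* Position on the first key greater than or equal to the prefix. */
    if ((ret = cursor->search_near(cursor, &exact)) != 0)
        return (ret == WT_NOTFOUND ? 0 : ret);
    if (exact < 0 && (ret = cursor->next(cursor)) != 0)
        return (ret == WT_NOTFOUND ? 0 : ret);

    do {
        if ((ret = cursor->get_key(cursor, &key)) != 0)
            return (ret);
        /* The first key that no longer starts with the prefix ends the scan. */
        if (key.size < prefix_len || memcmp(key.data, prefix, prefix_len) != 0)
            break;
        /* ... process the record here ... */
    } while ((ret = cursor->next(cursor)) == 0);

    return (ret == WT_NOTFOUND ? 0 : ret);
}
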
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 867a46201a4..abf31424525 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -454,7 +454,6 @@ __cursor_row_prev(
WT_PAGE *page;
WT_ROW *rip;
WT_SESSION_IMPL *session;
- bool kpack_used;
session = CUR2S(cbt);
page = cbt->ref->page;
@@ -480,12 +479,8 @@ __cursor_row_prev(
* Initialize for each new page.
*/
if (newpage) {
- /*
- * If we haven't instantiated keys on this page, do so, else it is a very, very slow
- * traversal.
- */
- if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
- WT_RET(__wt_row_leaf_keys(session, page));
+ /* Check if keys need to be instantiated before we walk the page. */
+ WT_RET(__wt_row_leaf_key_instantiate(session, page));
/*
* Be paranoid and set the slot out of bounds when moving to a new page.
@@ -554,7 +549,7 @@ restart_read_insert:
cbt->slot = cbt->row_iteration_slot / 2 - 1;
restart_read_page:
rip = &page->pg_row[cbt->slot];
- WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used));
+ WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack));
/*
* If the cursor has prefix search configured we can early exit here if the key we are
* visiting is before our prefix.
@@ -638,6 +633,8 @@ __wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
F_SET(cbt, WT_CBT_ITERATE_APPEND);
if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ /* The page cannot be NULL if the above flag is set. */
+ WT_ASSERT(session, page != NULL);
switch (page->type) {
case WT_PAGE_COL_FIX:
ret = __cursor_fix_append_prev(cbt, newpage, restart);
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index c9acfff3628..c68855988a2 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -1345,7 +1345,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page, WT_CURSOR *hs_cursor)
WT_RET(__wt_row_leaf_key(session, page, rip, ds->key, false));
WT_RET(__debug_item_key(ds, "K", ds->key->data, ds->key->size));
- __wt_row_leaf_value_cell(session, page, rip, NULL, unpack);
+ __wt_row_leaf_value_cell(session, page, rip, unpack);
WT_RET(__debug_cell_kv(ds, page, WT_PAGE_ROW_LEAF, "V", unpack));
if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index 03f789868db..02bd970e0c6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -360,22 +360,12 @@ __free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
static void
__free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- WT_IKEY *ikey;
WT_ROW *rip;
uint32_t i;
- void *copy;
- /*
- * Free the in-memory index array.
- *
- * For each entry, see if the key was an allocation (that is, if it points somewhere other than
- * the original page), and if so, free the memory.
- */
- WT_ROW_FOREACH (page, rip, i) {
- copy = WT_ROW_KEY_COPY(rip);
- WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, &ikey, NULL, NULL, NULL));
- __wt_free(session, ikey);
- }
+ /* Free any allocated memory used by instantiated keys. */
+ WT_ROW_FOREACH (page, rip, i)
+ __wt_row_leaf_key_free(session, page, rip);
}
/*
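
The removed loop documents what the replacement helper must do per slot. A schematic, stand-alone version of that cleanup (not the WiredTiger internals; the struct and names are illustrative) for a simplified page whose row slots either point into the disk image or at separately allocated, instantiated keys:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: free keys that were instantiated outside the page's disk image. */
struct demo_page {
    uint8_t *dsk;     /* Start of the on-disk image. */
    size_t dsk_len;   /* Length of the on-disk image. */
    void **row;       /* Per-slot key pointers. */
    uint32_t entries; /* Number of row slots. */
};

static void
free_instantiated_keys(struct demo_page *page)
{
    uintptr_t p, start, end;
    uint32_t i;

    start = (uintptr_t)page->dsk;
    end = start + page->dsk_len;
    for (i = 0; i < page->entries; ++i) {
        p = (uintptr_t)page->row[i];
        /* A key pointing outside the disk image was allocated when instantiated: free it. */
        if (p != 0 && (p < start || p >= end))
            free(page->row[i]);
    }
}
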
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 0d36f155f7a..5e6444dc202 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -222,6 +222,9 @@ __wt_btree_close(WT_SESSION_IMPL *session)
!F_ISSET(S2C(session), WT_CONN_HS_OPEN) || !btree->hs_entries ||
(!WT_IS_METADATA(btree->dhandle) && !WT_IS_HS(btree->dhandle)));
+ /* Clear the saved checkpoint information. */
+ __wt_meta_saved_ckptlist_free(session);
+
/*
* If we turned eviction off and never turned it back on, do that now, otherwise the counter
* will be off.
@@ -344,7 +347,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
WT_RET(__wt_struct_confchk(session, &cval));
WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));
- /* Row-store key comparison and key gap for prefix compression. */
+ /* Row-store key comparison. */
if (btree->type == BTREE_ROW) {
WT_RET(__wt_config_gets_none(session, cfg, "collator", &cval));
if (cval.len != 0) {
@@ -352,9 +355,6 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
WT_RET(__wt_collator_config(session, btree->dhandle->name, &cval, &metadata,
&btree->collator, &btree->collator_owned));
}
-
- WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval));
- btree->key_gap = (uint32_t)cval.val;
}
/* Column-store: check for fixed-size data. */
@@ -389,9 +389,8 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
F_CLR(btree, WT_BTREE_IGNORE_CACHE);
/*
- * The metadata isn't blocked by in-memory cache limits because metadata
- * "unroll" is performed by updates that are potentially blocked by the
- * cache-full checks.
+ * The metadata isn't blocked by in-memory cache limits because metadata "unroll" is performed
+ * by updates that are potentially blocked by the cache-full checks.
*/
if (WT_IS_METADATA(btree->dhandle))
F_SET(btree, WT_BTREE_IGNORE_CACHE);
diff --git a/src/third_party/wiredtiger/src/btree/bt_import.c b/src/third_party/wiredtiger/src/btree/bt_import.c
index 4e207d7aa8f..6a650cf0647 100644
--- a/src/third_party/wiredtiger/src/btree/bt_import.c
+++ b/src/third_party/wiredtiger/src/btree/bt_import.c
@@ -20,6 +20,7 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp)
WT_CONFIG_ITEM v;
WT_DECL_ITEM(a);
WT_DECL_ITEM(b);
+ WT_DECL_ITEM(buf);
WT_DECL_ITEM(checkpoint);
WT_DECL_RET;
WT_KEYED_ENCRYPTOR *kencryptor;
@@ -33,6 +34,7 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp)
WT_ERR(__wt_scr_alloc(session, 0, &a));
WT_ERR(__wt_scr_alloc(session, 0, &b));
+ WT_ERR(__wt_scr_alloc(session, 1024, &buf));
WT_ERR(__wt_scr_alloc(session, 0, &checkpoint));
WT_ASSERT(session, WT_PREFIX_MATCH(uri, "file:"));
@@ -92,13 +94,14 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp)
* Build and flatten the metadata and the checkpoint list, then insert it into the metadata for
* this file.
*
- * Strip out any incremental backup information, an imported file has not been part of a backup.
- * Strip out the checkpoint LSN, an imported file isn't associated with any log files. Assign a
- * unique file ID.
+ * Reconstruct the incremental backup information, to indicate copying the whole file as an
+ * imported file has not been part of backup. Strip out the checkpoint LSN, an imported file
+ * isn't associated with any log files. Assign a unique file ID.
*/
cfg[1] = a->data;
cfg[2] = checkpoint_list;
- cfg[3] = "checkpoint_backup_info=";
+ WT_ERR(__wt_reset_blkmod(session, a->data, buf));
+ cfg[3] = buf->mem;
cfg[4] = "checkpoint_lsn=";
WT_WITH_SCHEMA_LOCK(session,
ret = __wt_snprintf(fileid, sizeof(fileid), "id=%" PRIu32, ++S2C(session)->next_file_id));
@@ -129,7 +132,7 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp)
* Update the last checkpoint with the corrected information. Update the file's metadata with
* the new checkpoint information.
*/
- WT_ERR(__wt_meta_ckptlist_get_from_config(session, false, &ckptbase, config_tmp));
+ WT_ERR(__wt_meta_ckptlist_get_from_config(session, false, &ckptbase, NULL, config_tmp));
WT_CKPT_FOREACH (ckptbase, ckpt)
if (ckpt->name == NULL || (ckpt + 1)->name == NULL)
break;
@@ -154,6 +157,7 @@ err:
__wt_scr_free(session, &a);
__wt_scr_free(session, &b);
+ __wt_scr_free(session, &buf);
__wt_scr_free(session, &checkpoint);
return (ret);
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 5a6f36b160c..22766b682bf 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -539,11 +539,21 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ROW *rip;
WT_UPDATE *tombstone, *upd;
size_t size, total_size;
+ uint32_t best_prefix_count, best_prefix_start, best_prefix_stop;
+ uint32_t last_slot, prefix_count, prefix_start, prefix_stop, slot;
+ uint8_t smallest_prefix;
btree = S2BT(session);
tombstone = upd = NULL;
+ last_slot = 0;
size = total_size = 0;
+ /* The code depends on the prefix count variables, other initialization shouldn't matter. */
+ best_prefix_count = prefix_count = 0;
+ smallest_prefix = 0; /* [-Wconditional-uninitialized] */
+ prefix_start = prefix_stop = 0; /* [-Wconditional-uninitialized] */
+ best_prefix_start = best_prefix_stop = 0; /* [-Wconditional-uninitialized] */
+
/*
* Optionally instantiate prepared updates. In-memory databases restore non-obsolete updates on
* the page as part of the __split_multi_inmem function.
@@ -557,19 +567,74 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
rip = page->pg_row;
WT_CELL_FOREACH_KV (session, page->dsk, unpack) {
switch (unpack.type) {
- case WT_CELL_KEY_OVFL:
- __wt_row_leaf_key_set_cell(page, rip, unpack.cell);
- ++rip;
- continue;
case WT_CELL_KEY:
/*
- * Simple keys without prefix compression can be directly referenced on the page to
+ * Simple keys and prefix-compressed keys can be directly referenced on the page to
* avoid repeatedly unpacking their cells.
+ *
+ * Review groups of prefix-compressed keys, and track the biggest group as the page's
+ * prefix. What we're finding is the biggest group of prefix-compressed keys we can
+ * immediately build using a previous key plus their suffix bytes, without rolling
+ * forward through intermediate keys. We save that information on the page and then
+ * never physically instantiate those keys, avoiding memory amplification for pages with
+ * a page-wide prefix. On the first of a group of prefix-compressed keys, track the slot
+ * of the fully-instantiated key from which it's derived and the current key's prefix
+ * length. On subsequent keys, if the key can be built from the original key plus the
+ * current key's suffix bytes, update the maximum slot to which the prefix applies and
+ * the smallest prefix length.
+ *
+ * Groups of prefix-compressed keys end when a key is not prefix-compressed (ignoring
+ * overflow keys), or the key's prefix length increases. A prefix length decreasing is
+ * OK, it only means fewer bytes taken from the original key. A prefix length increasing
+ * doesn't necessarily end a group of prefix-compressed keys as we might be able to
+ * build a subsequent key using the original key and the key's suffix bytes, that is the
+ * prefix length could increase and then decrease to the same prefix length as before
+ * and those latter keys could be built without rolling forward through intermediate
+ * keys.
+ *
+ * However, that gets tricky: once a key prefix grows, we can never include a prefix
+ * smaller than the smallest prefix found so far, in the group, as a subsequent key
+ * prefix larger than the smallest prefix found so far might include bytes not present
+ * in the original instantiated key. Growing and shrinking is complicated to track, so
+ * rather than code up that complexity, we close out a group whenever the prefix grows.
+ * Plus, growing has additional issues. Any key with a larger prefix cannot be
+ * instantiated without rolling forward through intermediate keys, and so while such a
+ * key isn't required to close out the prefix group in all cases, it's not a useful
+ * entry for finding the best group of prefix-compressed keys, either, it's only
+ * possible keys after the prefix shrinks again that are potentially worth including in
+ * a group.
+ */
+ slot = WT_ROW_SLOT(page, rip);
+ if (unpack.prefix == 0) {
+ /* If the last prefix group was the best, track it. */
+ if (prefix_count > best_prefix_count) {
+ best_prefix_start = prefix_start;
+ best_prefix_stop = prefix_stop;
+ best_prefix_count = prefix_count;
+ }
+ prefix_count = 0;
+ prefix_start = slot;
+ } else {
+ /* Check for starting or continuing a prefix group. */
+ if (prefix_count == 0 ||
+ (last_slot == slot - 1 && unpack.prefix <= smallest_prefix)) {
+ smallest_prefix = unpack.prefix;
+ last_slot = prefix_stop = slot;
+ ++prefix_count;
+ }
+ }
+ __wt_row_leaf_key_set(page, rip, &unpack);
+ ++rip;
+ continue;
+ case WT_CELL_KEY_OVFL:
+ /*
+ * Prefix compression skips overflow items, ignore this slot. The last slot value is
+ * only used inside a group of prefix-compressed keys, so blindly increment it, it's not
+ * used unless the count of prefix-compressed keys is non-zero.
*/
- if (unpack.prefix == 0)
- __wt_row_leaf_key_set(page, rip, &unpack);
- else
- __wt_row_leaf_key_set_cell(page, rip, unpack.cell);
+ ++last_slot;
+
+ __wt_row_leaf_key_set(page, rip, &unpack);
++rip;
continue;
case WT_CELL_VALUE:
@@ -584,7 +649,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
(WT_TIME_WINDOW_IS_EMPTY(&unpack.tw) ||
(!WT_TIME_WINDOW_HAS_STOP(&unpack.tw) &&
__wt_txn_tw_start_visible_all(session, &unpack.tw))))
- __wt_row_leaf_value_set(page, rip - 1, &unpack);
+ __wt_row_leaf_value_set(rip - 1, &unpack);
break;
case WT_CELL_VALUE_OVFL:
break;
@@ -610,6 +675,9 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
prepare = PREPARE_INITIALIZED;
}
+ /* Make sure that there is no in-memory update for this key. */
+ WT_ASSERT(session, page->modify->mod_row_update[WT_ROW_SLOT(page, rip - 1)] == NULL);
+
/* Take the value from the page cell. */
WT_ERR(__wt_page_cell_data_ref(session, page, &unpack, value));
@@ -618,6 +686,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
upd->durable_ts = unpack.tw.durable_start_ts;
upd->start_ts = unpack.tw.start_ts;
upd->txnid = unpack.tw.start_txn;
+ F_SET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS);
/*
* Instantiate both update and tombstone if the prepared update is a tombstone. This is
@@ -632,7 +701,6 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
tombstone->txnid = unpack.tw.stop_txn;
tombstone->prepare_state = WT_PREPARE_INPROGRESS;
F_SET(tombstone, WT_UPDATE_PREPARE_RESTORED_FROM_DS);
- F_SET(upd, WT_UPDATE_RESTORED_FROM_DS);
/*
* Mark the update also as in-progress if the update and tombstone are from same
@@ -644,14 +712,12 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
unpack.tw.start_txn == unpack.tw.stop_txn) {
upd->durable_ts = WT_TS_NONE;
upd->prepare_state = WT_PREPARE_INPROGRESS;
- F_SET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS);
}
tombstone->next = upd;
} else {
upd->durable_ts = WT_TS_NONE;
upd->prepare_state = WT_PREPARE_INPROGRESS;
- F_SET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS);
tombstone = upd;
}
@@ -660,6 +726,23 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
}
WT_CELL_FOREACH_END;
+ /* If the last prefix group was the best, track it. Save the best prefix group for the page. */
+ if (prefix_count > best_prefix_count) {
+ best_prefix_start = prefix_start;
+ best_prefix_stop = prefix_stop;
+ }
+ page->prefix_start = best_prefix_start;
+ page->prefix_stop = best_prefix_stop;
+
+ /*
+ * Backward cursor traversal can be too slow if we're forced to process long stretches of
+ * prefix-compressed keys to create every key as we walk backwards through the page, and we
+ * handle that by instantiating periodic keys when backward cursor traversal enters a new page.
+ * Mark the page as not needing that work if there aren't stretches of prefix-compressed keys.
+ */
+ if (best_prefix_count <= 10)
+ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
+
__wt_cache_page_inmem_incr(session, page, total_size);
err:
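
A stand-alone sketch of the grouping pass described in the long comment above (the struct and names are illustrative): given each key's prefix-compression length, zero for a fully stored key, plus an overflow flag, find the largest run of keys buildable directly from the preceding fully stored key.

#include <stdbool.h>
#include <stdint.h>

struct prefix_group {
    uint32_t start; /* Slot of the fully stored key the group is built from. */
    uint32_t stop;  /* Last slot in the group. */
    uint32_t count; /* Number of prefix-compressed keys in the group. */
};

/* Illustrative only: find the page's best group of prefix-compressed keys. */
static struct prefix_group
best_prefix_group(const uint8_t *prefix, const bool *is_ovfl, uint32_t entries)
{
    struct prefix_group best = {0, 0, 0};
    uint32_t count = 0, start = 0, stop = 0, last = 0, slot;
    uint8_t smallest = 0;

    for (slot = 0; slot < entries; ++slot) {
        if (is_ovfl[slot]) {
            /* Overflow keys are skipped by prefix compression; they don't end a group. */
            ++last;
            continue;
        }
        if (prefix[slot] == 0) {
            /* A fully stored key closes the current group and may start a new one. */
            if (count > best.count)
                best = (struct prefix_group){start, stop, count};
            count = 0;
            start = slot;
        } else if (count == 0 || (last == slot - 1 && prefix[slot] <= smallest)) {
            /* Start or continue a group: the prefix must stay contiguous and must not grow. */
            smallest = prefix[slot];
            last = stop = slot;
            ++count;
        }
    }
    if (count > best.count)
        best = (struct prefix_group){start, stop, count};
    return (best);
}
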
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
index 4ea7884d4b7..b9d14f0a889 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ret.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -91,15 +91,16 @@ __wt_read_row_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip,
{
WT_CELL_UNPACK_KV unpack;
- WT_TIME_WINDOW_INIT(tw);
/*
- * If a value is simple and is globally visible at the time of reading a page into cache, we set
- * the start time point as globally visible.
+ * Simple values are encoded at the time of reading a page into cache, in which case we set the
+ * start time point as globally visible.
*/
- if (__wt_row_leaf_value_exists(rip))
+ if (__wt_row_leaf_value_is_encoded(rip)) {
+ WT_TIME_WINDOW_INIT(tw);
return;
+ }
- __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack);
+ __wt_row_leaf_value_cell(session, page, rip, &unpack);
WT_TIME_WINDOW_COPY(tw, &unpack.tw);
}
@@ -165,7 +166,7 @@ __wt_value_return_buf(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, WT_TIME_W
}
/* Take the value from the original page cell. */
- __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack);
+ __wt_row_leaf_value_cell(session, page, rip, &unpack);
if (tw != NULL)
WT_TIME_WINDOW_COPY(tw, &unpack.tw);
return (__wt_page_cell_data_ref(session, page, &unpack, buf));
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index 0fccaa8c801..5608242a5dd 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -194,6 +194,9 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root)
* We may not have found any pages during salvage and there's no tree to flush.
*/
if (root->page != NULL) {
+ /* Make sure that the saved checkpoint information has been cleared. */
+ WT_ASSERT(session, btree->ckpt == NULL);
+
btree->ckpt = ckptbase;
ret = __wt_evict(session, root, WT_REF_MEM, WT_EVICT_CALL_CLOSING);
root->page = NULL;
@@ -1993,12 +1996,10 @@ __slvg_row_ovfl(
*/
for (rip = page->pg_row + start; start < stop; ++start, ++rip) {
copy = WT_ROW_KEY_COPY(rip);
- WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, NULL, &cell, NULL, NULL));
- if (cell != NULL) {
- __wt_cell_unpack_kv(session, page->dsk, cell, &unpack);
- WT_RET(__slvg_row_ovfl_single(session, trk, &unpack));
- }
- __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack);
+ __wt_row_leaf_key_info(page, copy, NULL, &cell, NULL, NULL, NULL);
+ __wt_cell_unpack_kv(session, page->dsk, cell, &unpack);
+ WT_RET(__slvg_row_ovfl_single(session, trk, &unpack));
+ __wt_row_leaf_value_cell(session, page, rip, &unpack);
WT_RET(__slvg_row_ovfl_single(session, trk, &unpack));
}
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 765d9240657..6fc62f0a52b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -1782,14 +1782,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
if (type == WT_PAGE_ROW_LEAF) {
/*
- * Copy the first key from the original page into first ref in
- * the new parent. Pages created in memory always have a
- * "smallest" insert list, so look there first. If we don't
- * find one, get the first key from the disk image.
+ * Copy the first key from the original page into first ref in the new parent. Pages created
+ * in memory always have a "smallest" insert list, so look there first. If we don't find
+ * one, get the first key from the disk image.
*
- * We can't just use the key from the original ref: it may have
- * been suffix-compressed, and after the split the truncated key
- * may not be valid.
+ * We can't just use the key from the original ref: it may have been suffix-compressed, and
+ * after the split the truncated key may not be valid.
*/
WT_ERR(__wt_scr_alloc(session, 0, &key));
if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index cc1d83e3335..9e452f39d2d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -274,7 +274,7 @@ __stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **st
if (upd == NULL || (upd->type != WT_UPDATE_RESERVE && upd->type != WT_UPDATE_TOMBSTONE))
++entry_cnt;
if (upd == NULL) {
- __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack);
+ __wt_row_leaf_value_cell(session, page, rip, &unpack);
if (unpack.type == WT_CELL_VALUE_OVFL)
++ovfl_cnt;
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
index cf2aca0fc87..039a9e7e823 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -213,7 +213,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
* Get a list of the checkpoints for this file. Empty objects have no checkpoints, in which case
* there's no work to do.
*/
- WT_ERR_NOTFOUND_OK(__wt_meta_ckptlist_get(session, name, false, &ckptbase), true);
+ WT_ERR_NOTFOUND_OK(__wt_meta_ckptlist_get(session, name, false, &ckptbase, NULL), true);
if (ret == WT_NOTFOUND) {
ret = 0;
goto done;
diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c
index 12558339f97..1049b0d2186 100644
--- a/src/third_party/wiredtiger/src/btree/row_key.c
+++ b/src/third_party/wiredtiger/src/btree/row_key.c
@@ -8,100 +8,6 @@
#include "wt_internal.h"
-static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t);
-
-/*
- * __wt_row_leaf_keys --
- * Instantiate the interesting keys for random search of a page.
- */
-int
-__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_BTREE *btree;
- WT_DECL_ITEM(key);
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- WT_ROW *rip;
- uint32_t gap, i;
-
- btree = S2BT(session);
-
- if (page->entries == 0) { /* Just checking... */
- F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
- return (0);
- }
-
- /*
- * Row-store leaf pages are written as one big prefix-compressed chunk,
- * that is, only the first key on the page is not prefix-compressed, and
- * to instantiate the last key on the page, you have to take the first
- * key on the page and roll it forward to the end of the page. We don't
- * want to do that on every page access, of course, so we instantiate a
- * set of keys, essentially creating prefix chunks on the page, where we
- * can roll forward from the closest, previous, instantiated key. The
- * complication is that not all keys on a page are equal: we're doing a
- * binary search on the page, which means there are keys we look at a
- * lot (every time we search the page), and keys we never look at unless
- * they are actually being searched for. This function figures out the
- * "interesting" keys on a page, and then we sequentially walk that list
- * instantiating those keys.
- *
- * Allocate a bit array and figure out the set of "interesting" keys,
- * marking up the array.
- */
- WT_RET(__wt_scr_alloc(session, 0, &key));
- WT_RET(__wt_scr_alloc(session, (uint32_t)__bitstr_size(page->entries), &tmp));
- memset(tmp->mem, 0, tmp->memsize);
-
- if ((gap = btree->key_gap) == 0)
- gap = 1;
- __inmem_row_leaf_slots(tmp->mem, 0, page->entries, gap);
-
- /* Instantiate the keys. */
- for (rip = page->pg_row, i = 0; i < page->entries; ++rip, ++i)
- if (__bit_test(tmp->mem, i))
- WT_ERR(__wt_row_leaf_key_work(session, page, rip, key, true));
-
- F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
-
-err:
- __wt_scr_free(session, &key);
- __wt_scr_free(session, &tmp);
- return (ret);
-}
-
-/*
- * __inmem_row_leaf_slots --
- * Figure out the interesting slots of a page for random search, up to the specified depth.
- */
-static void
-__inmem_row_leaf_slots(uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap)
-{
- uint32_t indx, limit;
-
- if (entries < gap)
- return;
-
- /*
- * !!!
- * Don't clean this code up -- it deliberately looks like the binary
- * search code.
- *
- * !!!
- * There's got to be a function that would give me this information, but
- * I don't see any performance reason we can't just do this recursively.
- */
- limit = entries;
- indx = base + (limit >> 1);
- __bit_set(list, indx);
-
- __inmem_row_leaf_slots(list, base, limit >> 1, gap);
-
- base = indx + 1;
- --limit;
- __inmem_row_leaf_slots(list, base, limit >> 1, gap);
-}
-
/*
* __wt_row_leaf_key_copy --
* Get a copy of a row-store leaf-page key.
@@ -131,21 +37,19 @@ __wt_row_leaf_key_work(
WT_BTREE *btree;
WT_CELL *cell;
WT_CELL_UNPACK_KV *unpack, _unpack;
- WT_DECL_ITEM(tmp);
WT_DECL_RET;
WT_IKEY *ikey;
WT_ROW *rip, *jump_rip;
- size_t size;
- u_int last_prefix;
- int jump_slot_offset, slot_offset;
+ size_t group_size, key_size;
+ uint32_t slot;
+ u_int jump_slot_offset, slot_offset;
+ uint8_t group_prefix, key_prefix, last_prefix;
void *copy;
- const void *p;
+ const void *group_key, *key_data;
/*
- * !!!
- * It is unusual to call this function: most code should be calling the
- * front-end, __wt_row_leaf_key, be careful if you're calling this code
- * directly.
+ * It is unusual to call this function: most code should be calling the front-end,
+ * __wt_row_leaf_key, be careful if you're calling this code directly.
*/
btree = S2BT(session);
@@ -154,10 +58,10 @@ __wt_row_leaf_key_work(
jump_rip = NULL;
jump_slot_offset = 0;
- last_prefix = 0;
+ last_prefix = key_prefix = 0;
- p = NULL; /* -Werror=maybe-uninitialized */
- size = 0; /* -Werror=maybe-uninitialized */
+ key_data = NULL; /* -Werror=maybe-uninitialized */
+ key_size = 0; /* -Werror=maybe-uninitialized */
direction = BACKWARD;
for (slot_offset = 0;;) {
@@ -171,17 +75,26 @@ switch_and_jump:
rip = jump_rip;
slot_offset = jump_slot_offset;
}
- copy = WT_ROW_KEY_COPY(rip);
+overflow_retry:
/*
- * Figure out what the key looks like.
+ * Figure out what the key looks like. The row-store key can change underfoot; explicitly
+ * take a copy.
*/
- WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, &ikey, &cell, &p, &size));
+ copy = WT_ROW_KEY_COPY(rip);
+ __wt_row_leaf_key_info(page, copy, &ikey, &cell, &key_data, &key_size, &key_prefix);
/* 1: the test for a directly referenced on-page key. */
- if (cell == NULL) {
- keyb->data = p;
- keyb->size = size;
+ if (ikey == NULL && key_data != NULL) {
+ /*
+ * If there's a key without prefix compression, we're good to go, otherwise we have to
+ * deal with the prefix.
+ */
+ if (key_prefix == 0) {
+ keyb->data = key_data;
+ keyb->size = key_size;
+ } else
+ goto prefix_continue;
/*
* If this is the key we originally wanted, we don't care if we're rolling forward or
@@ -189,18 +102,19 @@ switch_and_jump:
* normally happen, the fast-path code that front-ends this function will have figured
* it out before we were called.
*
- * The key doesn't need to be instantiated, skip past that test.
+ * The key doesn't need to be instantiated, just return.
*/
if (slot_offset == 0)
- goto done;
+ return (0);
/*
- * This key is not an overflow key by definition and
- * isn't compressed in any way, we can use it to roll
- * forward.
- * If rolling backward, switch directions.
- * If rolling forward: there's a bug somewhere,
- * we should have hit this key when rolling backward.
+ * This key is not an overflow key by definition and isn't compressed in any way, we can
+ * use it to roll forward.
+ *
+ * If rolling backward, switch directions.
+ *
+ * If rolling forward: there's a bug somewhere, we should have hit this key when rolling
+ * backward.
*/
goto switch_and_jump;
}
@@ -212,119 +126,154 @@ switch_and_jump:
* backward, or if it's an overflow key or not, it's what we wanted. Take a copy and
* wrap up.
*
- * The key doesn't need to be instantiated, skip past that test.
+ * The key doesn't need to be instantiated, just return.
*/
if (slot_offset == 0) {
- keyb->data = p;
- keyb->size = size;
- goto done;
+ keyb->data = key_data;
+ keyb->size = key_size;
+ return (0);
}
/*
- * If we wanted a different key and this key is an
- * overflow key:
- * If we're rolling backward, this key is useless
- * to us because it doesn't have a valid prefix: keep
- * rolling backward.
- * If we're rolling forward, there's no work to be
- * done because prefixes skip overflow keys: keep
- * rolling forward.
+ * If we wanted a different key and this key is an overflow key:
+ *
+ * If we're rolling backward, this key is useless to us because it doesn't have a valid
+ * prefix: keep rolling backward.
+ *
+ * If we're rolling forward, there's no work to be done because prefixes skip overflow
+ * keys: keep rolling forward.
*/
if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL)
goto next;
/*
- * If we wanted a different key and this key is not an
- * overflow key, it has a valid prefix, we can use it.
- * If rolling backward, take a copy of the key and
- * switch directions, we can roll forward from this key.
- * If rolling forward, replace the key we've been
- * building with this key, it's what we would have built
- * anyway.
- * In short: if it's not an overflow key, take a copy
- * and roll forward.
+ * If we wanted a different key and this key is not an overflow key, it has a valid
+ * prefix, we can use it.
+ *
+ * If rolling backward, take a copy of the key and switch directions, we can roll
+ * forward from this key.
+ *
+ * If rolling forward, replace the key we've been building with this key, it's what we
+ * would have built anyway.
+ *
+ * In short: if it's not an overflow key, take a copy and roll forward.
*/
- keyb->data = p;
- keyb->size = size;
+ keyb->data = key_data;
+ keyb->size = key_size;
direction = FORWARD;
goto next;
}
- /*
- * It must be an on-page cell, unpack it.
- */
+ /* Unpack the on-page cell. */
__wt_cell_unpack_kv(session, page->dsk, cell, unpack);
/* 3: the test for an on-page reference to an overflow key. */
- if (unpack->type == WT_CELL_KEY_OVFL) {
+ if (unpack->type == WT_CELL_KEY_OVFL || unpack->type == WT_CELL_KEY_OVFL_RM) {
/*
* If this is the key we wanted from the start, we don't care if it's an overflow key,
* get a copy and wrap up.
*
- * Avoid racing with reconciliation deleting overflow keys. Deleted overflow keys must
- * be instantiated first, acquire the overflow lock and check. Read the key if we still
- * need to do so, but holding the overflow lock. Note we are not using the version of
- * the cell-data-ref calls that acquire the overflow lock and do a look-aside into the
- * tracking cache: this is an overflow key, not a value, meaning it's instantiated
- * before being deleted, not copied into the tracking cache.
+ * We can race with reconciliation deleting overflow keys. Deleted overflow keys must be
+ * instantiated before deletion, acquire the overflow lock and check. If the key has
+ * been deleted, restart the slot and get the instantiated key, else read the key before
+ * releasing the lock.
*/
if (slot_offset == 0) {
__wt_readlock(session, &btree->ovfl_lock);
- copy = WT_ROW_KEY_COPY(rip);
- if (!__wt_row_leaf_key_info(page, copy, NULL, &cell, &keyb->data, &keyb->size)) {
- __wt_cell_unpack_kv(session, page->dsk, cell, unpack);
- ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb);
+ if (__wt_cell_type_raw(unpack->cell) == WT_CELL_KEY_OVFL_RM) {
+ __wt_readunlock(session, &btree->ovfl_lock);
+ goto overflow_retry;
}
+ ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb);
__wt_readunlock(session, &btree->ovfl_lock);
- WT_ERR(ret);
+ WT_RET(ret);
break;
}
/*
* If we wanted a different key:
- * If we're rolling backward, this key is useless
- * to us because it doesn't have a valid prefix: keep
- * rolling backward.
- * If we're rolling forward, there's no work to be
- * done because prefixes skip overflow keys: keep
- * rolling forward.
+ *
+ * If we're rolling backward, this key is useless to us because it doesn't have a valid
+ * prefix: keep rolling backward.
+ *
+ * If we're rolling forward, there's no work to be done because prefixes skip overflow
+ * keys: keep rolling forward.
*/
goto next;
}
/*
- * 4: the test for an on-page reference to a key that isn't
- * prefix compressed.
+ * 4: the test for an on-page reference to a key that isn't prefix compressed.
*/
if (unpack->prefix == 0) {
/*
- * If this is the key we originally wanted, we don't
- * care if we're rolling forward or backward, it's
- * what we want. Take a copy and wrap up.
+ * If this is the key we originally wanted, we don't care if we're rolling forward or
+ * backward, it's what we want. Take a copy and wrap up.
*
- * If we wanted a different key, this key has a valid
- * prefix, we can use it.
- * If rolling backward, take a copy of the key and
- * switch directions, we can roll forward from this key.
- * If rolling forward there's a bug, we should have
- * found this key while rolling backwards and switched
- * directions then.
+ * If we wanted a different key, this key has a valid prefix, we can use it.
+ *
+ * If rolling backward, take a copy of the key and switch directions, we can roll
+ * forward from this key.
+ *
+ * If rolling forward there's a bug, we should have found this key while rolling
+ * backwards and switched directions then.
*
- * The key doesn't need to be instantiated, skip past
- * that test.
+ * The key doesn't need to be instantiated, just return.
*/
- WT_ERR(__wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb));
+ WT_RET(__wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb));
if (slot_offset == 0)
- goto done;
+ return (0);
goto switch_and_jump;
}
+ key_data = unpack->data;
+ key_size = unpack->size;
+ key_prefix = unpack->prefix;
+
+prefix_continue:
+ /*
+ * Proceed with a prefix-compressed key.
+ *
+ * Prefix compression means we don't yet have a key, but there's a special case: if the key
+ * is part of the group of compressed key prefixes we saved when reading the page into
+ * memory, we can build a key for this slot. Otherwise we have to keep rolling forward or
+ * backward.
+ */
+ slot = WT_ROW_SLOT(page, rip);
+ if (slot > page->prefix_start && slot <= page->prefix_stop) {
+ /*
+ * Get the root key's information (the row-store key can change underfoot; explicitly
+ * take a copy). Ignore the root key's size and prefix information because it must be
+ * large enough (else the current key couldn't have been prefix-compressed based on its
+ * value), and it can't have a prefix-compression value, it's a root key which is never
+ * prefix-compressed.
+ */
+ copy = WT_ROW_KEY_COPY(&page->pg_row[page->prefix_start]);
+
+ __wt_row_leaf_key_info(page, copy, NULL, NULL, &group_key, &group_size, &group_prefix);
+ if (group_key != NULL) {
+ WT_RET(__wt_buf_init(session, keyb, key_prefix + key_size));
+ memcpy(keyb->mem, group_key, key_prefix);
+ memcpy((uint8_t *)keyb->mem + key_prefix, key_data, key_size);
+ keyb->size = key_prefix + key_size;
+ /*
+ * If this is the key we originally wanted, we don't care if we're rolling forward
+ * or backward, it's what we want.
+ *
+ * The key doesn't need to be instantiated, just return.
+ */
+ if (slot_offset == 0)
+ return (0);
+ goto switch_and_jump;
+ }
+ }
+
/*
* 5: an on-page reference to a key that's prefix compressed.
- * If rolling backward, keep looking for something we can
- * use.
- * If rolling forward, build the full key and keep rolling
- * forward.
+ *
+ * If rolling backward, keep looking for something we can use.
+ *
+ * If rolling forward, build the full key and keep rolling forward.
*/
if (direction == BACKWARD) {
/*
@@ -337,28 +286,26 @@ switch_and_jump:
* find a key without a prefix.
*/
if (slot_offset == 0)
- last_prefix = unpack->prefix;
- if (slot_offset == 0 || last_prefix > unpack->prefix) {
+ last_prefix = key_prefix;
+ if (slot_offset == 0 || last_prefix > key_prefix) {
jump_rip = rip;
jump_slot_offset = slot_offset;
- last_prefix = unpack->prefix;
+ last_prefix = key_prefix;
}
}
if (direction == FORWARD) {
- p = unpack->data;
- size = unpack->size;
-
/*
* Grow the buffer as necessary as well as ensure data has been copied into local buffer
* space, then append the suffix to the prefix already in the buffer.
*
* Don't grow the buffer unnecessarily or copy data we don't need, truncate the item's
- * data length to the prefix bytes.
+ * CURRENT data length to the prefix bytes before growing the buffer.
*/
- keyb->size = unpack->prefix;
- WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size));
- memcpy((uint8_t *)keyb->data + keyb->size, p, size);
- keyb->size += size;
+ WT_ASSERT(session, keyb->size >= key_prefix);
+ keyb->size = key_prefix;
+ WT_RET(__wt_buf_grow(session, keyb, key_prefix + key_size));
+ memcpy((uint8_t *)keyb->data + key_prefix, key_data, key_size);
+ keyb->size = key_prefix + key_size;
if (slot_offset == 0)
break;
@@ -379,37 +326,35 @@ next:
/*
* Optionally instantiate the key: there's a cost to figuring out a key value in a leaf page
- * with prefix-compressed or Huffman encoded keys, amortize the cost by instantiating a copy of
- * the calculated key in allocated memory. We don't instantiate keys when pages are first
- * brought into memory because it's wasted effort if the page is only read by a cursor in sorted
- * order. If, instead, the page is read by a cursor in reverse order, we immediately instantiate
- * periodic keys for the page (otherwise the reverse walk would be insanely slow). If, instead,
- * the page is randomly searched, we instantiate keys as they are accessed (meaning, for
- * example, as long as the binary search only touches one-half of the page, the only keys we
- * instantiate will be in that half of the page).
+ * with prefix-compressed keys, amortize the cost by instantiating a copy of the calculated key
+ * in allocated memory. We don't instantiate keys when pages are first brought into memory
+ * because it's wasted effort if the page is only read by a cursor in sorted order. If, instead,
+ * the page is read by a cursor in reverse order, we immediately instantiate periodic keys for
+ * the page (otherwise the reverse walk would be insanely slow). If, instead, the page is
+ * randomly searched, we instantiate keys as they are accessed (meaning, for example, as long as
+ * the binary search only touches one-half of the page, the only keys we instantiate will be in
+ * that half of the page).
*/
if (instantiate) {
copy = WT_ROW_KEY_COPY(rip_arg);
- WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, &ikey, &cell, NULL, NULL));
- if (ikey == NULL) {
- WT_ERR(__wt_row_ikey_alloc(
- session, WT_PAGE_DISK_OFFSET(page, cell), keyb->data, keyb->size, &ikey));
+ __wt_row_leaf_key_info(page, copy, &ikey, &cell, NULL, NULL, NULL);
- /*
- * Serialize the swap of the key into place: on success, update the page's memory
- * footprint, on failure, free the allocated memory.
- */
- if (__wt_atomic_cas_ptr((void *)&WT_ROW_KEY_COPY(rip), copy, ikey))
- __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + ikey->size);
- else
- __wt_free(session, ikey);
- }
- }
+ /* Check if we raced with another thread instantiating the key before doing real work. */
+ if (ikey != NULL)
+ return (0);
+ WT_RET(__wt_row_ikey_alloc(
+ session, WT_PAGE_DISK_OFFSET(page, cell), keyb->data, keyb->size, &ikey));
-done:
-err:
- __wt_scr_free(session, &tmp);
- return (ret);
+ /*
+ * Serialize the swap of the key into place: on success, update the page's memory footprint,
+ * on failure, free the allocated memory.
+ */
+ if (__wt_atomic_cas_ptr((void *)&WT_ROW_KEY_COPY(rip), copy, ikey))
+ __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + ikey->size);
+ else
+ __wt_free(session, ikey);
+ }
+ return (0);
}
/*
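
A minimal sketch of the key construction the prefix_continue block performs, under the assumption that the group's root key is fully stored: take the current key's prefix bytes from the root key and append the suffix bytes stored on the page for this slot. The function name and allocation strategy are illustrative.

#include <stdlib.h>
#include <string.h>

/* Illustrative only: materialize a prefix-compressed key from the group's root key. */
static int
build_group_key(const void *root_key, size_t prefix, const void *suffix, size_t suffix_len,
  void **keyp, size_t *key_lenp)
{
    unsigned char *key;

    if ((key = malloc(prefix + suffix_len)) == NULL)
        return (-1);
    memcpy(key, root_key, prefix);            /* Shared prefix bytes from the root key. */
    memcpy(key + prefix, suffix, suffix_len); /* Suffix bytes stored for this slot. */
    *keyp = key;
    *key_lenp = prefix + suffix_len;
    return (0);
}
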
diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c
index bd1d172cbaa..527d839f2ec 100644
--- a/src/third_party/wiredtiger/src/config/config_collapse.c
+++ b/src/third_party/wiredtiger/src/config/config_collapse.c
@@ -222,9 +222,8 @@ __config_merge_format_next(WT_SESSION_IMPL *session, const char *prefix, size_t
continue;
/*
- * The test is complicated by matching empty entries
- * "foo=" against nested structures "foo,bar=", where
- * the latter is a replacement for the former.
+ * The test is complicated by matching empty entries "foo=" against nested structures
+ * "foo,bar=", where the latter is a replacement for the former.
*/
if (len2 > len1 && (ep + 1)->k[len1] == SEPC && memcmp(ep->k, (ep + 1)->k, len1) == 0)
continue;
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 25cbb0e8b33..654abaf40d5 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -590,6 +590,7 @@ static const WT_CONFIG_CHECK confchk_object_meta[] = {
{"collator", "string", NULL, NULL, NULL, 0}, {"columns", "list", NULL, NULL, NULL, 0},
{"dictionary", "int", NULL, "min=0", NULL, 0},
{"encryption", "category", NULL, NULL, confchk_WT_SESSION_create_encryption_subconfigs, 2},
+ {"flush", "string", NULL, NULL, NULL, 0},
{"format", "string", NULL, "choices=[\"btree\"]", NULL, 0},
{"huffman_key", "string", NULL, NULL, NULL, 0}, {"huffman_value", "string", NULL, NULL, NULL, 0},
{"id", "string", NULL, NULL, NULL, 0},
@@ -1264,7 +1265,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"block_compressor=,cache_resident=false,checkpoint=,"
"checkpoint_backup_info=,checkpoint_lsn=,checksum=uncompressed,"
"collator=,columns=,dictionary=0,encryption=(keyid=,name=),"
- "format=btree,huffman_key=,huffman_value=,id=,"
+ "flush=0,format=btree,huffman_key=,huffman_value=,id=,"
"ignore_in_memory_cache_size=false,internal_item_max=0,"
"internal_key_max=0,internal_key_truncate=true,"
"internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0,"
@@ -1276,7 +1277,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"tiered_storage=(auth_token=,bucket=,bucket_prefix=,"
"local_retention=300,name=,object_target_size=10M),value_format=u"
",verbose=[],version=(major=0,minor=0),write_timestamp_usage=none",
- confchk_object_meta, 46},
+ confchk_object_meta, 47},
{"table.meta",
"app_metadata=,assert=(commit_timestamp=none,"
"durable_timestamp=none,read_timestamp=none,write_timestamp=off),"
diff --git a/src/third_party/wiredtiger/src/config/test_config.c b/src/third_party/wiredtiger/src/config/test_config.c
index bb46c2a1f24..c517ba96f5a 100644
--- a/src/third_party/wiredtiger/src/config/test_config.c
+++ b/src/third_party/wiredtiger/src/config/test_config.c
@@ -3,7 +3,7 @@
#include "wt_internal.h"
static const WT_CONFIG_CHECK confchk_stat_cache_size_subconfigs[] = {
- {"enabled", "boolean", NULL, NULL, NULL, 0}, {"limit", "string", NULL, NULL, NULL, 0},
+ {"enabled", "boolean", NULL, NULL, NULL, 0}, {"limit", "int", NULL, "min=0", NULL, 0},
{NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_CHECK confchk_runtime_monitor_subconfigs[] = {
@@ -27,7 +27,7 @@ static const WT_CONFIG_CHECK confchk_insert_config_subconfigs[] = {
{"value_size", "int", NULL, "min=0,max=1000000000", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_CHECK confchk_ops_per_transaction_subconfigs[] = {
- {"max", "string", NULL, NULL, NULL, 0}, {"min", "string", NULL, NULL, NULL, 0},
+ {"max", "string", NULL, NULL, NULL, 0}, {"min", "int", NULL, "min=0", NULL, 0},
{NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_CHECK confchk_update_config_subconfigs[] = {
@@ -54,7 +54,9 @@ static const WT_CONFIG_CHECK confchk_workload_generator_subconfigs[] = {
{"value_size", "int", NULL, "min=0,max=1000000000", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_CHECK confchk_workload_tracking_subconfigs[] = {
- {"enabled", "boolean", NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}};
+ {"enabled", "boolean", NULL, NULL, NULL, 0},
+ {"interval", "string", NULL, "choices=[\"s\",\"m\",\"h\"]", NULL, 0},
+ {"op_count", "int", NULL, "min=1,max=10000", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_CHECK confchk_example_test[] = {
{"cache_size_mb", "int", NULL, "min=0,max=100000000000", NULL, 0},
@@ -63,7 +65,7 @@ static const WT_CONFIG_CHECK confchk_example_test[] = {
{"runtime_monitor", "category", NULL, NULL, confchk_runtime_monitor_subconfigs, 4},
{"timestamp_manager", "category", NULL, NULL, confchk_timestamp_manager_subconfigs, 5},
{"workload_generator", "category", NULL, NULL, confchk_workload_generator_subconfigs, 15},
- {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 1},
+ {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 3},
{NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_CHECK confchk_poc_test[] = {
@@ -73,35 +75,35 @@ static const WT_CONFIG_CHECK confchk_poc_test[] = {
{"runtime_monitor", "category", NULL, NULL, confchk_runtime_monitor_subconfigs, 4},
{"timestamp_manager", "category", NULL, NULL, confchk_timestamp_manager_subconfigs, 5},
{"workload_generator", "category", NULL, NULL, confchk_workload_generator_subconfigs, 15},
- {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 1},
+ {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 3},
{NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_ENTRY config_entries[] = {
{"example_test",
- "cache_size_mb=0,duration_seconds=0,enable_logging=true,"
- "runtime_monitor=(enabled=false,interval=s,op_count=1,"
- "stat_cache_size=(enabled=false,limit=)),"
- "timestamp_manager=(enabled=false,interval=s,oldest_lag=0,"
- "op_count=1,stable_lag=0),workload_generator=(collection_count=1,"
- "enabled=false,insert_config=(interval=s,key_size=0,op_count=1,"
- "value_size=0),insert_threads=0,interval=s,interval=s,key_count=0"
- ",key_size=0,op_count=1,op_count=1,ops_per_transaction=(max=1,"
- "min=),read_threads=0,update_config=(interval=s,key_size=0,"
- "op_count=1,value_size=0),update_threads=0,value_size=0),"
- "workload_tracking=(enabled=false)",
+ "cache_size_mb=0,duration_seconds=0,enable_logging=false,"
+ "runtime_monitor=(enabled=true,interval=s,op_count=1,"
+ "stat_cache_size=(enabled=false,limit=0)),"
+ "timestamp_manager=(enabled=true,interval=s,oldest_lag=1,"
+ "op_count=1,stable_lag=1),workload_generator=(collection_count=1,"
+ "enabled=true,insert_config=(interval=s,key_size=5,op_count=1,"
+ "value_size=5),insert_threads=0,interval=s,interval=s,key_count=0"
+ ",key_size=5,op_count=1,op_count=1,ops_per_transaction=(max=1,"
+ "min=0),read_threads=0,update_config=(interval=s,key_size=5,"
+ "op_count=1,value_size=5),update_threads=0,value_size=5),"
+ "workload_tracking=(enabled=true,interval=s,op_count=1)",
confchk_example_test, 7},
{"poc_test",
- "cache_size_mb=0,duration_seconds=0,enable_logging=true,"
- "runtime_monitor=(enabled=false,interval=s,op_count=1,"
- "stat_cache_size=(enabled=false,limit=)),"
- "timestamp_manager=(enabled=false,interval=s,oldest_lag=0,"
- "op_count=1,stable_lag=0),workload_generator=(collection_count=1,"
- "enabled=false,insert_config=(interval=s,key_size=0,op_count=1,"
- "value_size=0),insert_threads=0,interval=s,interval=s,key_count=0"
- ",key_size=0,op_count=1,op_count=1,ops_per_transaction=(max=1,"
- "min=),read_threads=0,update_config=(interval=s,key_size=0,"
- "op_count=1,value_size=0),update_threads=0,value_size=0),"
- "workload_tracking=(enabled=false)",
+ "cache_size_mb=0,duration_seconds=0,enable_logging=false,"
+ "runtime_monitor=(enabled=true,interval=s,op_count=1,"
+ "stat_cache_size=(enabled=false,limit=0)),"
+ "timestamp_manager=(enabled=true,interval=s,oldest_lag=1,"
+ "op_count=1,stable_lag=1),workload_generator=(collection_count=1,"
+ "enabled=true,insert_config=(interval=s,key_size=5,op_count=1,"
+ "value_size=5),insert_threads=0,interval=s,interval=s,key_count=0"
+ ",key_size=5,op_count=1,op_count=1,ops_per_transaction=(max=1,"
+ "min=0),read_threads=0,update_config=(interval=s,key_size=5,"
+ "op_count=1,value_size=5),update_threads=0,value_size=5),"
+ "workload_tracking=(enabled=true,interval=s,op_count=1)",
confchk_poc_test, 7},
{NULL, NULL, NULL, 0}};
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 47a28e016f2..3d7b5fed416 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1287,7 +1287,7 @@ __conn_query_timestamp(WT_CONNECTION *wt_conn, char *hex_timestamp, const char *
conn = (WT_CONNECTION_IMPL *)wt_conn;
CONNECTION_API_CALL(conn, session, query_timestamp, config, cfg);
- WT_TRET(__wt_txn_query_timestamp(session, hex_timestamp, cfg, true));
+ ret = __wt_txn_query_timestamp(session, hex_timestamp, cfg, true);
err:
API_END_RET(session, ret);
}
@@ -1306,7 +1306,7 @@ __conn_set_timestamp(WT_CONNECTION *wt_conn, const char *config)
conn = (WT_CONNECTION_IMPL *)wt_conn;
CONNECTION_API_CALL(conn, session, set_timestamp, config, cfg);
- WT_TRET(__wt_txn_global_set_timestamp(session, cfg));
+ ret = __wt_txn_global_set_timestamp(session, cfg);
err:
API_END_RET(session, ret);
}
@@ -1326,7 +1326,7 @@ __conn_rollback_to_stable(WT_CONNECTION *wt_conn, const char *config)
CONNECTION_API_CALL(conn, session, rollback_to_stable, config, cfg);
WT_STAT_CONN_INCR(session, txn_rts);
- WT_TRET(__wt_rollback_to_stable(session, cfg, false));
+ ret = __wt_rollback_to_stable(session, cfg, false);
err:
API_END_RET(session, ret);
}
@@ -2810,16 +2810,13 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_ERR(__wt_tiered_conn_config(session, cfg, false));
/*
- * The metadata/log encryptor is configured after extensions, since
- * extensions may load encryptors. We have to do this before creating
- * the metadata file.
+ * The metadata/log encryptor is configured after extensions, since extensions may load
+ * encryptors. We have to do this before creating the metadata file.
*
- * The encryption customize callback needs the fully realized set of
- * encryption args, as simply grabbing "encryption" doesn't work.
- * As an example, configuration for the current call may just be
- * "encryption=(secretkey=xxx)", with encryption.name,
- * encryption.keyid being 'inherited' from the stored base
- * configuration.
+ * The encryption customize callback needs the fully realized set of encryption args, as simply
+ * grabbing "encryption" doesn't work. As an example, configuration for the current call may
+ * just be "encryption=(secretkey=xxx)", with encryption.name, encryption.keyid being
+ * 'inherited' from the stored base configuration.
*/
WT_ERR(__wt_config_gets_none(session, cfg, "encryption.name", &cval));
WT_ERR(__wt_config_gets_none(session, cfg, "encryption.keyid", &keyid));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup_incr.c b/src/third_party/wiredtiger/src/cursor/cur_backup_incr.c
index 9b31214a0ee..0f1cd879d72 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup_incr.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup_incr.c
@@ -78,23 +78,28 @@ __curbackup_incr_blkmod(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_CURSOR_BAC
/*
* The rename configuration string component was added later. So don't error if we don't
- * find it in the string. If we don't have it, we're not doing a rename.
+ * find it in the string. If we don't have it, we're not doing a rename. Otherwise rename
+ * forces full copies, there is no need to traverse the blocks information.
*/
WT_ERR_NOTFOUND_OK(__wt_config_subgets(session, &v, "rename", &b), true);
- if (ret == 0 && b.val)
+ if (ret == 0 && b.val) {
+ cb->nbits = 0;
+ cb->offset = 0;
+ cb->bit_offset = 0;
F_SET(cb, WT_CURBACKUP_RENAME);
- else
+ } else {
F_CLR(cb, WT_CURBACKUP_RENAME);
- /*
- * We found a match. Load the block information into the cursor.
- */
- if ((ret = __wt_config_subgets(session, &v, "blocks", &b)) == 0) {
- WT_ERR(__wt_backup_load_incr(session, &b, &cb->bitstring, cb->nbits));
- cb->bit_offset = 0;
- F_SET(cb, WT_CURBACKUP_INCR_INIT);
+ /*
+ * We found a match. Load the block information into the cursor.
+ */
+ if ((ret = __wt_config_subgets(session, &v, "blocks", &b)) == 0) {
+ WT_ERR(__wt_backup_load_incr(session, &b, &cb->bitstring, cb->nbits));
+ cb->bit_offset = 0;
+ F_SET(cb, WT_CURBACKUP_INCR_INIT);
+ }
+ WT_ERR_NOTFOUND_OK(ret, false);
}
- WT_ERR_NOTFOUND_OK(ret, false);
break;
}
WT_ERR_NOTFOUND_OK(ret, false);
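
For context, an application-level sketch of consuming the per-file incremental cursor this code feeds: a WT_BACKUP_FILE entry means the whole file must be copied (as a rename now forces), while WT_BACKUP_RANGE entries give the modified byte ranges. The copy steps are placeholders.

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

/* Illustrative only: walk the duplicate incremental cursor for one backup file. */
static int
copy_incremental(WT_SESSION *session, WT_CURSOR *backup_cursor, const char *filename)
{
    WT_CURSOR *incr;
    uint64_t offset, size, type;
    char config[512];
    int ret;

    (void)snprintf(config, sizeof(config), "incremental=(file=%s)", filename);
    if ((ret = session->open_cursor(session, NULL, backup_cursor, config, &incr)) != 0)
        return (ret);

    while ((ret = incr->next(incr)) == 0) {
        if ((ret = incr->get_value(incr, &offset, &size, &type)) != 0)
            break;
        if (type == WT_BACKUP_FILE)
            printf("copy whole file %s\n", filename); /* Placeholder for a full copy. */
        else
            printf("copy %s [%" PRIu64 ", %" PRIu64 ")\n", /* Placeholder for a range copy. */
              filename, offset, offset + size);
    }
    (void)incr->close(incr);
    return (ret == WT_NOTFOUND ? 0 : ret);
}
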
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index bc9057a47d5..046a7ac52db 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -423,7 +423,8 @@ __curfile_remove(WT_CURSOR *cursor)
WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0);
err:
- CURSOR_UPDATE_API_END(session, ret);
+ /* If we've lost an initial position, we must fail. */
+ CURSOR_UPDATE_API_END_RETRY(session, ret, !positioned || F_ISSET(cursor, WT_CURSTD_KEY_INT));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_hs.c b/src/third_party/wiredtiger/src/cursor/cur_hs.c
index a090d0fe0e1..74dd2899fe0 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_hs.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_hs.c
@@ -867,6 +867,13 @@ retry:
goto retry;
WT_ERR(ret);
+#ifdef HAVE_DIAGNOSTIC
+ /* Do a search again and call next to check the key order. */
+ WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(cbt, &file_cursor->key, true));
+ WT_ASSERT(session, ret == 0);
+ WT_ERR_NOTFOUND_OK(__curhs_file_cursor_next(session, file_cursor), false);
+#endif
+
/* Insert doesn't maintain a position across calls, clear resources. */
if (0) {
err:
diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c
index 29398aedb2b..7f00ea9bc3d 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_join.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_join.c
@@ -1422,9 +1422,8 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx
WT_RET(__curjoin_open_main(session, cjoin, entry));
/*
- * When we are repacking index keys to remove the
- * primary key, we never want to transform trailing
- * 'u'. Use no-op padding to force this.
+ * When we are repacking index keys to remove the primary key, we never want to
+ * transform trailing 'u'. Use no-op padding to force this.
*/
cindex = (WT_CURSOR_INDEX *)ref_cursor;
len = strlen(cindex->iface.key_format) + 3;
diff --git a/src/third_party/wiredtiger/src/docs/transactions.dox b/src/third_party/wiredtiger/src/docs/transactions.dox
index 3bfae988747..c594b6b15c9 100644
--- a/src/third_party/wiredtiger/src/docs/transactions.dox
+++ b/src/third_party/wiredtiger/src/docs/transactions.dox
@@ -16,14 +16,11 @@ operate on data concurrently because they have the following properties:
WiredTiger supports transactions with the following caveats to the ACID
properties:
-- the maximum level of isolation supported is snapshot isolation.
- See @ref transaction_isolation for more details.
+- the maximum level of isolation supported is snapshot isolation and all updates must be done at
+ snapshot isolation. See @ref transaction_isolation for more details.
- transactional updates are made durable by a combination of checkpoints
and logging. See @ref checkpoint for information on checkpoint durability
and @ref durability for information on commit-level durability.
-- each transaction's uncommitted changes must fit in memory: for
- efficiency, WiredTiger does not write to the log until a transaction
- commits.
@section transactions_api Transactional API
@@ -101,8 +98,8 @@ transactional readers, an operation may fail and return ::WT_ROLLBACK.
@section transaction_isolation Isolation levels
WiredTiger supports <code>read-uncommitted</code>,
-<code>read-committed</code> and <code>snapshot</code> isolation levels;
-the default isolation level is <code>snapshot</code>.
+<code>read-committed</code> and <code>snapshot</code> isolation levels; the default isolation
+level is <code>snapshot</code>, and all updates must be done at snapshot isolation.
- <code>read-uncommitted</code>:
Transactions can see changes made by other transactions before those
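For reference, a minimal sketch of an application update running at snapshot isolation through the public WiredTiger API, matching the documentation statement that all updates must be done at snapshot isolation. The "table:example" URI and the string key/value formats are placeholders for illustration, and the connection is assumed to be open already; this is not code from this change.

    /* Illustrative only: update under snapshot isolation via the public API. */
    #include <wiredtiger.h>

    static int
    update_at_snapshot(WT_CONNECTION *conn)
    {
        WT_CURSOR *cursor;
        WT_SESSION *session;
        int ret;

        if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
            return (ret);
        /* Snapshot isolation is the default; request it explicitly for clarity. */
        if ((ret = session->begin_transaction(session, "isolation=snapshot")) != 0)
            return (ret);
        /* Assumes a table created with key_format=S,value_format=S. */
        if ((ret = session->open_cursor(session, "table:example", NULL, NULL, &cursor)) != 0)
            return (ret);
        cursor->set_key(cursor, "key");
        cursor->set_value(cursor, "value");
        if ((ret = cursor->insert(cursor)) != 0) {
            (void)session->rollback_transaction(session, NULL);
            return (ret);
        }
        return (session->commit_transaction(session, NULL));
    }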
diff --git a/src/third_party/wiredtiger/src/history/hs_cursor.c b/src/third_party/wiredtiger/src/history/hs_cursor.c
index 1799b068e7e..8dfad4cd983 100644
--- a/src/third_party/wiredtiger/src/history/hs_cursor.c
+++ b/src/third_party/wiredtiger/src/history/hs_cursor.c
@@ -15,10 +15,17 @@
int
__wt_hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert)
{
+ WT_BTREE *hs_btree;
WT_CURSOR *hs_cursor;
WT_DECL_RET;
+ WT_SESSION_IMPL *session;
bool leaf_found;
+#ifdef HAVE_DIAGNOSTIC
+ WT_PAGE *page;
+#endif
+ hs_btree = CUR2BT(hs_cbt);
+ session = CUR2S(hs_cbt);
hs_cursor = &hs_cbt->iface;
leaf_found = false;
@@ -27,7 +34,15 @@ __wt_hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert)
* perform a full search.
*/
if (hs_cbt->ref != NULL) {
- WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt),
+#ifdef HAVE_DIAGNOSTIC
+ WT_ORDERED_READ(page, hs_cbt->ref->page);
+#endif
+ /*
+ * The page must be pinned and we should have a hazard pointer on it. Ensure the page is
+ * not evictable.
+ */
+ WT_ASSERT(session, __wt_hazard_check(session, hs_cbt->ref, NULL) != NULL);
+ WT_WITH_BTREE(session, hs_btree,
ret = __wt_row_search(hs_cbt, srch_key, insert, hs_cbt->ref, false, &leaf_found));
WT_RET(ret);
@@ -40,13 +55,16 @@ __wt_hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert)
(hs_cbt->compare != 0 &&
(hs_cbt->slot == 0 || hs_cbt->slot == hs_cbt->ref->page->entries - 1)))
leaf_found = false;
+
+ /* Ensure no eviction has happened on this page. */
+ WT_ASSERT(session, page == hs_cbt->ref->page);
if (!leaf_found)
hs_cursor->reset(hs_cursor);
}
if (!leaf_found)
- WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt),
- ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL));
+ WT_WITH_BTREE(
+ session, hs_btree, ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL));
if (ret == 0 && !insert) {
WT_ERR(__wt_key_return(hs_cbt));
diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c
index 0e7e2424c57..6e30d425aec 100644
--- a/src/third_party/wiredtiger/src/history/hs_rec.c
+++ b/src/third_party/wiredtiger/src/history/hs_rec.c
@@ -274,6 +274,8 @@ __wt_hs_insert_updates(
WT_DECL_ITEM(prev_full_value);
WT_DECL_ITEM(tmp);
WT_DECL_RET;
+/* Limit the number of consecutive reverse modifies. */
+#define WT_MAX_CONSECUTIVE_REVERSE_MODIFY 10
/* If the limit is exceeded, we will insert a full update to the history store */
#define MAX_REVERSE_MODIFY_NUM 16
WT_MODIFY entries[MAX_REVERSE_MODIFY_NUM];
@@ -284,7 +286,7 @@ __wt_hs_insert_updates(
WT_UPDATE *non_aborted_upd, *oldest_upd, *prev_upd, *tombstone, *upd;
WT_TIME_WINDOW tw;
wt_off_t hs_size;
- uint64_t insert_cnt, max_hs_size;
+ uint64_t insert_cnt, max_hs_size, modify_cnt;
uint32_t i;
uint8_t *p;
int nentries;
@@ -363,7 +365,12 @@ __wt_hs_insert_updates(
}
first_globally_visible_upd = min_ts_upd = out_of_order_ts_upd = NULL;
- enable_reverse_modify = true;
+
+ /*
+ * Reverse deltas are only supported on 'S' and 'u' value formats.
+ */
+ enable_reverse_modify =
+ (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u"));
/*
* The algorithm assumes the oldest update on the update chain in memory is either a full
@@ -374,10 +381,11 @@ __wt_hs_insert_updates(
* newer than a TOMBSTONE must be a full update.
*
* The algorithm walks from the oldest update, or the most recently inserted into history
- * store update, to the newest update and build full updates along the way. It sets the stop
- * time point of the update to the start time point of the next update, squashes the updates
- * that are from the same transaction and of the same start timestamp, calculates reverse
- * modification if prev_upd is a MODIFY, and inserts the update to the history store.
+ * store update, to the newest update and builds full updates along the way. It sets the
+ * stop time point of the update to the start time point of the next update, squashes the
+ * updates that are from the same transaction and of the same start timestamp, checks if the
+ * update can be written as reverse modification, and inserts the update to the history
+ * store either as a full update or a reverse modification.
*
* It deals with the following scenarios:
* 1) We only have full updates on the chain and we only insert full updates to
@@ -486,6 +494,7 @@ __wt_hs_insert_updates(
* time point, we can squash updates with the same start time point as the onpage update
* away.
*/
+ modify_cnt = 0;
for (; updates.size > 0 &&
!(upd->txnid == list->onpage_upd->txnid &&
upd->start_ts == list->onpage_upd->start_ts);
@@ -605,7 +614,9 @@ __wt_hs_insert_updates(
* Calculate reverse modify and clear the history store records with timestamps when
* inserting the first update. Always write on-disk data store updates to the history
* store as a full update because the on-disk update will be the base update for all the
- * updates that are older than the on-disk update.
+ * updates that are older than the on-disk update. Limit the number of consecutive
+ * reverse modifies for standard updates. We want to ensure we do not store a large
+ * chain of reverse modifies, as that would hurt read performance.
*
* Due to concurrent operation of checkpoint and eviction, it is possible that history
* store may have more recent versions of a key than the on-disk version. Without a
@@ -613,17 +624,20 @@ __wt_hs_insert_updates(
* the RTS.
*/
nentries = MAX_REVERSE_MODIFY_NUM;
- if (!F_ISSET(upd, WT_UPDATE_DS) && upd->type == WT_UPDATE_MODIFY &&
- enable_reverse_modify &&
+ if (!F_ISSET(upd, WT_UPDATE_DS) && !F_ISSET(prev_upd, WT_UPDATE_DS) &&
+ enable_reverse_modify && modify_cnt < WT_MAX_CONSECUTIVE_REVERSE_MODIFY &&
__wt_calc_modify(session, prev_full_value, full_value, prev_full_value->size / 10,
entries, &nentries) == 0) {
WT_ERR(__wt_modify_pack(hs_cursor, entries, nentries, &modify_value));
WT_ERR(__hs_insert_record(
session, hs_cursor, btree, key, WT_UPDATE_MODIFY, modify_value, &tw));
__wt_scr_free(session, &modify_value);
- } else
+ ++modify_cnt;
+ } else {
+ modify_cnt = 0;
WT_ERR(__hs_insert_record(
session, hs_cursor, btree, key, WT_UPDATE_STANDARD, full_value, &tw));
+ }
/* Flag the update as now in the history store. */
F_SET(upd, WT_UPDATE_HS);
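The new WT_MAX_CONSECUTIVE_REVERSE_MODIFY cap above bounds how many deltas a reader may have to apply before reaching a full value. Below is a standalone sketch of the same counting pattern, using invented names (choose_record_kind, MAX_CONSECUTIVE_DELTAS) rather than WiredTiger's internals: write a delta only while the run of deltas is short, otherwise reset the counter and write a full record.

    /* Illustrative sketch: cap consecutive deltas, forcing a periodic full record. */
    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_CONSECUTIVE_DELTAS 10

    enum record_kind { RECORD_DELTA, RECORD_FULL };

    /* Decide how to store the next version, updating the run-length counter. */
    static enum record_kind
    choose_record_kind(unsigned *consecutive_deltas, bool delta_possible)
    {
        if (delta_possible && *consecutive_deltas < MAX_CONSECUTIVE_DELTAS) {
            ++*consecutive_deltas;
            return (RECORD_DELTA);
        }
        *consecutive_deltas = 0; /* A full record restarts the run. */
        return (RECORD_FULL);
    }

    int
    main(void)
    {
        unsigned run = 0;
        int i;

        /* 25 delta-eligible versions: every 11th write degrades to a full record. */
        for (i = 0; i < 25; ++i)
            printf("version %2d -> %s\n", i,
              choose_record_kind(&run, true) == RECORD_DELTA ? "delta" : "full");
        return (0);
    }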
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index 2a2bd5aca2f..20a86779a91 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -123,34 +123,31 @@
if (__update) \
F_SET((s)->txn, WT_TXN_UPDATE);
-/* End a transactional API call, optional retry on deadlock. */
-#define TXN_API_END_RETRY(s, ret, retry) \
- API_END(s, ret); \
- if (__update) \
- F_CLR((s)->txn, WT_TXN_UPDATE); \
- if (__autotxn) { \
- if (F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT)) \
- F_CLR((s)->txn, WT_TXN_AUTOCOMMIT); \
- else if ((ret) == 0) \
- (ret) = __wt_txn_commit((s), NULL); \
- else { \
- if (retry) \
- WT_TRET(__wt_session_copy_values(s)); \
- WT_TRET(__wt_txn_rollback((s), NULL)); \
- if (((ret) == 0 || (ret) == WT_ROLLBACK) && (retry)) { \
- (ret) = 0; \
- continue; \
- } \
- WT_TRET(__wt_session_reset_cursors(s, false)); \
- } \
- } \
- break; \
- } \
+/* End a transactional API call, optional retry on rollback. */
+#define TXN_API_END(s, ret, retry) \
+ API_END(s, ret); \
+ if (__update) \
+ F_CLR((s)->txn, WT_TXN_UPDATE); \
+ if (__autotxn) { \
+ if (F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT)) \
+ F_CLR((s)->txn, WT_TXN_AUTOCOMMIT); \
+ else if ((ret) == 0) \
+ (ret) = __wt_txn_commit((s), NULL); \
+ else { \
+ if (retry) \
+ WT_TRET(__wt_session_copy_values(s)); \
+ WT_TRET(__wt_txn_rollback((s), NULL)); \
+ if ((retry) && (ret) == WT_ROLLBACK) { \
+ (ret) = 0; \
+ continue; \
+ } \
+ WT_TRET(__wt_session_reset_cursors(s, false)); \
+ } \
+ } \
+ break; \
+ } \
while (1)
-/* End a transactional API call, retry on deadlock. */
-#define TXN_API_END(s, ret) TXN_API_END_RETRY(s, ret, 1)
-
/*
* In almost all cases, API_END is returning immediately, make it simple. If a session or connection
* method is about to return WT_NOTFOUND (some underlying object was not found), map it to ENOENT,
@@ -265,7 +262,9 @@
CURSOR_UPDATE_API_CALL(cur, s, n); \
JOINABLE_CURSOR_CALL_CHECK(cur)
-#define CURSOR_UPDATE_API_END(s, ret) \
- if ((ret) == WT_PREPARE_CONFLICT) \
- (ret) = WT_ROLLBACK; \
- TXN_API_END(s, ret)
+#define CURSOR_UPDATE_API_END_RETRY(s, ret, retry) \
+ if ((ret) == WT_PREPARE_CONFLICT) \
+ (ret) = WT_ROLLBACK; \
+ TXN_API_END(s, ret, retry)
+
+#define CURSOR_UPDATE_API_END(s, ret) CURSOR_UPDATE_API_END_RETRY(s, ret, true)
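The retry above applies to the library's implicit (autocommit) transactions; for explicit transactions the application handles WT_ROLLBACK itself. A hedged sketch of that application-level loop using the public API, with the cursor assumed to be open on a string-format table; a failed commit rolls the transaction back implicitly, so rollback_transaction is only called when the operation itself failed.

    /* Illustrative only: retry an explicit transaction on WT_ROLLBACK. */
    #include <wiredtiger.h>

    static int
    insert_with_retry(WT_SESSION *session, WT_CURSOR *cursor, const char *key, const char *value)
    {
        int ret;

        for (;;) {
            if ((ret = session->begin_transaction(session, NULL)) != 0)
                return (ret);
            cursor->set_key(cursor, key);
            cursor->set_value(cursor, value);
            ret = cursor->insert(cursor);
            if (ret == 0)
                ret = session->commit_transaction(session, NULL);
            else
                (void)session->rollback_transaction(session, NULL);
            if (ret != WT_ROLLBACK)
                return (ret); /* Success, or an error other than a conflict. */
            /* WT_ROLLBACK: another transaction won the conflict, try again. */
        }
    }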
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 24562280ac1..5283c46df55 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -232,8 +232,8 @@ struct __wt_ovfl_reuse {
* We also configure a larger than default internal page size to accommodate for larger history
* store keys. We do that to reduce the chances of having to create overflow keys on the page.
*/
-#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
-#define WT_HS_COMPRESSOR "snappy"
+#ifdef HAVE_BUILTIN_EXTENSION_ZSTD
+#define WT_HS_COMPRESSOR "zstd"
#else
#define WT_HS_COMPRESSOR "none"
#endif
@@ -635,14 +635,17 @@ struct __wt_page {
} u;
/*
- * Page entries, type and flags are positioned at the end of the WT_PAGE union to reduce cache
- * misses in the row-store search function.
+ * Page entry count, page-wide prefix information, type and flags are positioned at the end of
+ * the WT_PAGE union to reduce cache misses when searching row-store pages.
*
* The entries field only applies to leaf pages, internal pages use the page-index entries
* instead.
*/
uint32_t entries; /* Leaf page entries */
+ uint32_t prefix_start; /* Best page prefix starting slot */
+ uint32_t prefix_stop; /* Maximum slot to which the best page prefix applies */
+
#define WT_PAGE_IS_INTERNAL(page) \
((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
#define WT_PAGE_INVALID 0 /* Invalid page */
@@ -669,6 +672,19 @@ struct __wt_page {
uint8_t unused[2]; /* Unused padding */
+ size_t memory_footprint; /* Memory attached to the page */
+
+ /* Page's on-disk representation: NULL for pages created in memory. */
+ const WT_PAGE_HEADER *dsk;
+
+ /* If/when the page is modified, we need lots more information. */
+ WT_PAGE_MODIFY *modify;
+
+ /*
+ * !!!
+ * This is the 64 byte boundary, try to keep hot fields above here.
+ */
+
/*
* The page's read generation acts as an LRU value for each page in the
* tree; it is used by the eviction server thread to select pages to be
@@ -698,16 +714,6 @@ struct __wt_page {
#define WT_READGEN_STEP 100
uint64_t read_gen;
- size_t memory_footprint; /* Memory attached to the page */
-
- /* Page's on-disk representation: NULL for pages created in memory. */
- const WT_PAGE_HEADER *dsk;
-
- /* If/when the page is modified, we need lots more information. */
- WT_PAGE_MODIFY *modify;
-
- /* This is the 64 byte boundary, try to keep hot fields above here. */
-
uint64_t cache_create_gen; /* Page create timestamp */
uint64_t evict_pass_gen; /* Eviction pass generation */
};
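The field reordering above keeps the fields touched by search in the first cache line of WT_PAGE. A standalone sketch, not WiredTiger's actual layout, showing how offsetof plus a C11 static assertion can guard that kind of layout invariant at compile time:

    /* Illustrative sketch: keep hot fields within the first 64-byte cache line. */
    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct hot_cold_example {
        /* Hot fields, read on every search: keep them in the first cache line. */
        uint32_t entries;
        uint32_t prefix_start;
        uint32_t prefix_stop;
        uint8_t type;
        uint8_t flags;
        uint8_t unused[2];
        size_t memory_footprint;
        const void *disk_image;
        void *modify;

        /* Cold fields, only touched by eviction and statistics. */
        uint64_t read_gen;
        uint64_t cache_create_gen;
        uint64_t evict_pass_gen;
    };

    /* Fails to compile if a hot field drifts past the 64-byte boundary. */
    static_assert(offsetof(struct hot_cold_example, read_gen) <= 64,
      "hot fields must fit in the first cache line");

    int
    main(void)
    {
        return (0);
    }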
@@ -1301,10 +1307,9 @@ struct __wt_insert_head {
NULL : \
(page)->modify->mod_row_update[WT_ROW_SLOT(page, ip)])
/*
- * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the
- * "one per WT_ROW slot" insert array. That's because the insert array requires
- * an extra slot to hold keys that sort before any key found on the original
- * page.
+ * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the "one per WT_ROW slot"
+ * insert array. That's because the insert array requires an extra slot to hold keys that sort
+ * before any key found on the original page.
*/
#define WT_ROW_INSERT_SMALLEST(page) \
((page)->modify == NULL || (page)->modify->mod_row_insert == NULL ? \
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index a253887faf6..6f3a6e086b3 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -102,7 +102,8 @@ typedef enum { /* Start position for eviction walk */
struct __wt_btree {
WT_DATA_HANDLE *dhandle;
- WT_CKPT *ckpt; /* Checkpoint information */
+ WT_CKPT *ckpt; /* Checkpoint information */
+ size_t ckpt_bytes_allocated; /* Checkpoint information array allocation size */
WT_BTREE_TYPE type; /* Type */
@@ -115,8 +116,6 @@ struct __wt_btree {
uint32_t id; /* File ID, for logging */
- uint32_t key_gap; /* Row-store prefix key gap */
-
uint32_t allocsize; /* Allocation size */
uint32_t maxintlpage; /* Internal page max size */
uint32_t maxintlkey; /* Internal page max key size */
@@ -256,22 +255,25 @@ struct __wt_btree {
WT_EVICT_WALK_TYPE evict_start_type;
/*
- * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't automatically generate these
- * flag values for that reason, there's no way to start at an offset.
+ * Flag values up to 0xfff are reserved for WT_DHANDLE_XXX. See comment with dhandle flags for an
+ * explanation.
+ *
+ * We don't automatically generate these flag values because there's no way to start at an
+ * offset.
*/
-#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */
-#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */
-#define WT_BTREE_CLOSED 0x000400u /* Handle closed */
-#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */
-#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */
-#define WT_BTREE_NO_CHECKPOINT 0x002000u /* Disable checkpoints */
-#define WT_BTREE_NO_LOGGING 0x004000u /* Disable logging */
-#define WT_BTREE_OBSOLETE_PAGES 0x008000u /* Handle has obsolete pages */
-#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */
-#define WT_BTREE_SALVAGE 0x020000u /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x040000u /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x080000u /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x100000u /* Handle is for verify */
+#define WT_BTREE_ALTER 0x0001000u /* Handle is for alter */
+#define WT_BTREE_BULK 0x0002000u /* Bulk-load handle */
+#define WT_BTREE_CLOSED 0x0004000u /* Handle closed */
+#define WT_BTREE_IGNORE_CACHE 0x0008000u /* Cache-resident object */
+#define WT_BTREE_IN_MEMORY 0x0010000u /* Cache-resident object */
+#define WT_BTREE_NO_CHECKPOINT 0x0020000u /* Disable checkpoints */
+#define WT_BTREE_NO_LOGGING 0x0040000u /* Disable logging */
+#define WT_BTREE_OBSOLETE_PAGES 0x0080000u /* Handle has obsolete pages */
+#define WT_BTREE_READONLY 0x0100000u /* Handle is readonly */
+#define WT_BTREE_SALVAGE 0x0200000u /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x0400000u /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x0800000u /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x1000000u /* Handle is for verify */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/btree_inline.h b/src/third_party/wiredtiger/src/include/btree_inline.h
index 41510592410..9c0e0ce784e 100644
--- a/src/third_party/wiredtiger/src/include/btree_inline.h
+++ b/src/third_party/wiredtiger/src/include/btree_inline.h
@@ -874,151 +874,177 @@ __wt_ref_key_clear(WT_REF *ref)
* Return a row-store leaf page key referenced by a WT_ROW if it can be had without unpacking a
* cell, and information about the cell, if the key isn't cheaply available.
*/
-static inline bool
-__wt_row_leaf_key_info(
- WT_PAGE *page, void *copy, WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
+static inline void
+__wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp, WT_CELL **cellp, void *datap,
+ size_t *sizep, uint8_t *prefixp)
{
WT_IKEY *ikey;
uintptr_t v;
v = (uintptr_t)copy;
-/*
- * A row-store leaf page key is in one of two places: if instantiated,
- * the WT_ROW pointer references a WT_IKEY structure, otherwise, it
- * references an on-page offset. Further, on-page keys are in one of
- * two states: if the key is a simple key (not an overflow key or prefix
- * compressed, all of which are likely), the key's offset/size is encoded
- * in the pointer. Otherwise, the offset is to the key's on-page cell.
- *
- * Now the magic: allocated memory must be aligned to store any standard
- * type, and we expect some standard type to require at least quad-byte
- * alignment, so allocated memory should have some clear low-order bits.
- * On-page objects consist of an offset/length pair: the maximum page
- * size currently fits into 29 bits, so we use the low-order bits of the
- * pointer to mark the other bits of the pointer as encoding the key's
- * location and length. This breaks if allocated memory isn't aligned,
- * of course.
- *
- * In this specific case, we use bit 0x01 to mark an on-page cell, bit
- * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair,
- * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells
- * is:
- * 29 bits page offset of the key's cell,
- * 2 bits flags
- *
- * The bit pattern for on-page keys is:
- * 32 bits key length,
- * 29 bits page offset of the key's bytes,
- * 2 bits flags
- *
- * But, while that allows us to skip decoding simple key cells, we also
- * want to skip decoding the value cell in the case where the value cell
- * is also simple/short. We use bit 0x03 to mark an encoded on-page key
- * and value pair. The bit pattern for on-page key/value pairs is:
- * 9 bits key length,
- * 13 bits value length,
- * 20 bits page offset of the key's bytes,
- * 20 bits page offset of the value's bytes,
- * 2 bits flags
- *
- * These bit patterns are in-memory only, of course, so can be modified
- * (we could even tune for specific workloads). Generally, the fields
- * are larger than the anticipated values being stored (512B keys, 8KB
- * values, 1MB pages), hopefully that won't be necessary.
- *
- * This function returns a list of things about the key (instantiation
- * reference, cell reference and key/length pair). Our callers know
- * the order in which we look things up and the information returned;
- * for example, the cell will never be returned if we are working with
- * an on-page key.
- */
+ /*
+ * A row-store leaf page key is in one of two places: if instantiated, the WT_ROW pointer
+ * references a WT_IKEY structure, otherwise, it references an on-page item. Further, on-page
+ * items are in one of two states: if the key is a simple key (not an overflow key, which is
+ * likely), the key's offset, size and prefix are encoded in the 8B pointer. Otherwise, the
+ * offset is to the key's on-page cell.
+ *
+ * This function returns information from a set of things about the key (WT_IKEY reference, cell
+ * reference and/or key/length/prefix triplet). Our callers know the order we resolve items and
+ * what information will be returned. Specifically, the caller gets a key (in the form of a
+ * pointer to the bytes, a length and a prefix length in all cases where we can get it without
+ * unpacking a cell), plus an optional WT_IKEY reference, and in all cases, a pointer to the
+ * on-page cell. Our caller's test is generally if there is a returned key or not, falling back
+ * to the returned cell.
+ *
+ * Now the magic: allocated memory must be aligned to store any standard type and we expect some
+ * standard type to require at least quad-byte alignment, so allocated memory should have two
+ * clear low-order bits. On-page objects consist of an offset/length pair and a prefix in the
+ * case of a key: the maximum page size is 29 bits (512MB), the remaining bits hold the key or
+ * value location and bytes. This breaks if allocated memory isn't aligned, of course.
+ *
+ * In this specific case, we use bit 0x01 to mark an on-page cell, bit 0x02 to mark an on-page
+ * key, 0x03 to mark an on-page key/value pair, otherwise it's a WT_IKEY reference. The bit
+ * pattern for on-page cells is:
+ *
+ * 29 bits offset of the key's cell (512MB)
+ * 2 bits 0x01 flag
+ *
+ * The on-page cell is our fallback: if a key or value won't fit into our encoding (unlikely,
+ * but possible), we fall back to using a cell reference, which obviously has enough room for
+ * all possible values.
+ *
+ * The next encoding is for on-page keys:
+ *
+ * 19 bits key's length (512KB)
+ * 6 bits offset of the key's bytes from the key's cell (32B)
+ * 8 bits key's prefix length (256B, the maximum possible value)
+ * 29 bits offset of the key's cell (512MB)
+ * 2 bits 0x02 flag
+ *
+ * But, while that allows us to skip decoding simple key cells, we also want to skip decoding
+ * value cells in the case where the value cell is also simple/short. We use bit 0x03 to mark
+ * an encoded on-page key and value pair. The encoding for on-page key/value pairs is:
+ *
+ * 13 bits value's length (8KB)
+ * 6 bits offset of the value's bytes from the end of the key's cell (32B)
+ * 12 bits key's length (4KB)
+ * 6 bits offset of the key's bytes from the key's cell (32B)
+ * 8 bits key's prefix length (256B, the maximum possible value)
+ * 17 bits offset of the key's cell (128KB)
+ * 2 bits 0x03 flag
+ *
+ * A reason for the complexity here is we need to be able to find the key and value cells from
+ * the encoded form: for that reason we store an offset to the key cell plus a second offset to
+ * the start of the key's bytes. Finding the value cell is reasonably straightforward: we use
+ * the location of the key to find the cell immediately following the key.
+ *
+ * A simple extension of this encoding would be to encode zero-length values similarly to how we
+ * encode short values. However, zero-length values are noted by adjacent key cells on the page,
+ * and we detect that without decoding the second cell by checking the cell's type byte. Tests
+ * indicate it's slightly slower to encode missing value cells than to check the cell type, so
+ * we don't bother with the encoding.
+ *
+ * Generally, the bitfields are expected to be larger than the stored items (4/8KB keys/values,
+ * 128KB pages), but the underlying limits are larger and we can see items we cannot encode in
+ * this way. For example, if an application creates pages larger than 128KB, cells near the end
+ * of the page have offsets larger than the encoding allows, so we fall back to cell references.
+ * If that proves limiting, these bit patterns can be changed as they are in-memory only
+ * (we could even tune for specific workloads in specific trees).
+ */
+#define WT_KEY_FLAG_BITS 0x03
+
#define WT_CELL_FLAG 0x01
+/* Key cell offset field size can hold maximum value, WT_CELL_MAX_KEY_CELL_OFFSET not needed. */
#define WT_CELL_ENCODE_OFFSET(v) ((uintptr_t)(v) << 2)
-#define WT_CELL_DECODE_OFFSET(v) (((v)&0xFFFFFFFF) >> 2)
+#define WT_CELL_DECODE_OFFSET(v) ((v) >> 2)
#define WT_K_FLAG 0x02
-#define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32)
-#define WT_K_DECODE_KEY_LEN(v) ((v) >> 32)
-#define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 2)
-#define WT_K_DECODE_KEY_OFFSET(v) (((v)&0xFFFFFFFF) >> 2)
+#define WT_K_MAX_KEY_LEN (0x80000 - 1)
+#define WT_K_DECODE_KEY_LEN(v) (((v)&0xffffe00000000000) >> 45)
+#define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 45)
+#define WT_K_MAX_KEY_OFFSET (0x40 - 1)
+#define WT_K_DECODE_KEY_OFFSET(v) (((v)&0x001f8000000000) >> 39)
+#define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 39)
+/* Key prefix field size can hold maximum value, WT_K_MAX_KEY_PREFIX not needed. */
+#define WT_K_DECODE_KEY_PREFIX(v) (((v)&0x00007f80000000) >> 31)
+#define WT_K_ENCODE_KEY_PREFIX(v) ((uintptr_t)(v) << 31)
+/* Key cell offset field size can hold maximum value, WT_K_MAX_KEY_CELL_OFFSET not needed. */
+#define WT_K_DECODE_KEY_CELL_OFFSET(v) (((v)&0x0000007ffffffc) >> 2)
+#define WT_K_ENCODE_KEY_CELL_OFFSET(v) ((uintptr_t)(v) << 2)
#define WT_KV_FLAG 0x03
-#define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 55)
-#define WT_KV_DECODE_KEY_LEN(v) ((v) >> 55)
-#define WT_KV_MAX_KEY_LEN (0x200 - 1)
-#define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 42)
-#define WT_KV_DECODE_VALUE_LEN(v) (((v)&0x007FFC0000000000) >> 42)
#define WT_KV_MAX_VALUE_LEN (0x2000 - 1)
-#define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 22)
-#define WT_KV_DECODE_KEY_OFFSET(v) (((v)&0x000003FFFFC00000) >> 22)
-#define WT_KV_MAX_KEY_OFFSET (0x100000 - 1)
-#define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 2)
-#define WT_KV_DECODE_VALUE_OFFSET(v) (((v)&0x00000000003FFFFC) >> 2)
-#define WT_KV_MAX_VALUE_OFFSET (0x100000 - 1)
- switch (v & 0x03) {
- case WT_CELL_FLAG:
- /* On-page cell: no instantiated key. */
+#define WT_KV_DECODE_VALUE_LEN(v) (((v)&0xfff8000000000000) >> 51)
+#define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 51)
+#define WT_KV_MAX_VALUE_OFFSET (0x40 - 1)
+#define WT_KV_DECODE_VALUE_OFFSET(v) (((v)&0x07e00000000000) >> 45)
+#define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 45)
+#define WT_KV_MAX_KEY_LEN (0x1000 - 1)
+#define WT_KV_DECODE_KEY_LEN(v) (((v)&0x001ffe00000000) >> 33)
+#define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 33)
+/* Key offset encoding is the same for key and key/value forms, WT_KV_MAX_KEY_OFFSET not needed. */
+#define WT_KV_DECODE_KEY_OFFSET(v) (((v)&0x000001f8000000) >> 27)
+#define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 27)
+/* Key prefix encoding is the same for key and key/value forms, WT_KV_MAX_KEY_PREFIX not needed. */
+#define WT_KV_DECODE_KEY_PREFIX(v) (((v)&0x00000007f80000) >> 19)
+#define WT_KV_ENCODE_KEY_PREFIX(v) ((uintptr_t)(v) << 19)
+#define WT_KV_MAX_KEY_CELL_OFFSET (0x20000 - 1)
+#define WT_KV_DECODE_KEY_CELL_OFFSET(v) (((v)&0x0000000007fffc) >> 2)
+#define WT_KV_ENCODE_KEY_CELL_OFFSET(v) ((uintptr_t)(v) << 2)
+
+ switch (v & WT_KEY_FLAG_BITS) {
+ case WT_CELL_FLAG: /* On-page cell. */
if (ikeyp != NULL)
*ikeyp = NULL;
if (cellp != NULL)
*cellp = (WT_CELL *)WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
- return (false);
- case WT_K_FLAG:
- /* Encoded key: no instantiated key, no cell. */
- if (cellp != NULL)
- *cellp = NULL;
+ if (datap != NULL) {
+ *(void **)datap = NULL;
+ *sizep = 0;
+ *prefixp = 0;
+ }
+ break;
+ case WT_K_FLAG: /* Encoded key. */
if (ikeyp != NULL)
*ikeyp = NULL;
+ if (cellp != NULL)
+ *cellp = (WT_CELL *)WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_CELL_OFFSET(v));
if (datap != NULL) {
- *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
+ *(void **)datap =
+ WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_CELL_OFFSET(v) + WT_K_DECODE_KEY_OFFSET(v));
*sizep = WT_K_DECODE_KEY_LEN(v);
- return (true);
+ *prefixp = (uint8_t)WT_K_DECODE_KEY_PREFIX(v);
}
- return (false);
- case WT_KV_FLAG:
- /* Encoded key/value pair: no instantiated key, no cell. */
- if (cellp != NULL)
- *cellp = NULL;
+ break;
+ case WT_KV_FLAG: /* Encoded key/value pair. */
if (ikeyp != NULL)
*ikeyp = NULL;
+ if (cellp != NULL)
+ *cellp = (WT_CELL *)WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_CELL_OFFSET(v));
if (datap != NULL) {
- *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_OFFSET(v));
+ *(void **)datap = WT_PAGE_REF_OFFSET(
+ page, WT_KV_DECODE_KEY_CELL_OFFSET(v) + WT_KV_DECODE_KEY_OFFSET(v));
*sizep = WT_KV_DECODE_KEY_LEN(v);
- return (true);
+ *prefixp = (uint8_t)WT_KV_DECODE_KEY_PREFIX(v);
}
- return (false);
- }
-
- /* Instantiated key. */
- ikey = (WT_IKEY *)copy;
- if (ikeyp != NULL)
- *ikeyp = (WT_IKEY *)copy;
- if (cellp != NULL)
- *cellp = (WT_CELL *)WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
- if (datap != NULL) {
- *(void **)datap = WT_IKEY_DATA(ikey);
- *sizep = ikey->size;
- return (true);
+ break;
+ default: /* Instantiated key. */
+ ikey = (WT_IKEY *)copy;
+ if (ikeyp != NULL)
+ *ikeyp = ikey;
+ if (cellp != NULL)
+ *cellp = ikey->cell_offset == 0 ?
+ NULL :
+ (WT_CELL *)WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ if (datap != NULL) {
+ *(void **)datap = WT_IKEY_DATA(ikey);
+ *sizep = ikey->size;
+ *prefixp = 0;
+ }
+ break;
}
- return (false);
-}
-
-/*
- * __wt_row_leaf_key_set_cell --
- * Set a WT_ROW to reference an on-page row-store leaf cell.
- */
-static inline void
-__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
-{
- uintptr_t v;
-
- /*
- * See the comment in __wt_row_leaf_key_info for an explanation of the magic.
- */
- v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) | WT_CELL_FLAG;
- WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries);
- WT_ROW_KEY_SET(rip, v);
}
/*
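The encoding described in the comment above packs a tag, a cell offset, a prefix length and a key length into a single word. Below is a simplified, standalone sketch of the same bit-packing technique, with made-up field widths and macro names, assuming 64-bit pointers; it is not WiredTiger's exact layout.

    /* Illustrative sketch of the tagged-pointer encoding technique (simplified widths). */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 2-bit tag in the low bits, then three example fields packed above it. */
    #define TAG_BITS 0x03u
    #define TAG_ONPAGE_KEY 0x02u

    #define ENCODE_CELL_OFFSET(v) ((uintptr_t)(v) << 2) /* 29 bits: 512MB max */
    #define DECODE_CELL_OFFSET(v) (((v)&0x7ffffffcu) >> 2)
    #define ENCODE_PREFIX(v) ((uintptr_t)(v) << 31) /* 8 bits: 255 max */
    #define DECODE_PREFIX(v) (((v) >> 31) & 0xffu)
    #define ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 39) /* remaining high bits */
    #define DECODE_KEY_LEN(v) ((v) >> 39)

    int
    main(void)
    {
        uintptr_t v;
        uint32_t cell_offset = 123456, prefix = 17, key_len = 4000;

        /* Pack the descriptor; relies on 64-bit pointers, as the comment above assumes. */
        v = ENCODE_KEY_LEN(key_len) | ENCODE_PREFIX(prefix) | ENCODE_CELL_OFFSET(cell_offset) |
          TAG_ONPAGE_KEY;

        assert((v & TAG_BITS) == TAG_ONPAGE_KEY);
        printf("cell offset %u, prefix %u, key length %u\n", (unsigned)DECODE_CELL_OFFSET(v),
          (unsigned)DECODE_PREFIX(v), (unsigned)DECODE_KEY_LEN(v));
        return (0);
    }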
@@ -1028,54 +1054,92 @@ __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
static inline void
__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack)
{
- uintptr_t v;
+ uintptr_t key_offset, v;
/*
* See the comment in __wt_row_leaf_key_info for an explanation of the magic.
+ *
+ * Not checking the prefix and cell offset sizes, those fields can hold any legitimate value.
*/
- v = WT_K_ENCODE_KEY_LEN(unpack->size) |
- WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) | WT_K_FLAG;
- WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries);
+ key_offset = (uintptr_t)WT_PTRDIFF(unpack->data, unpack->cell);
+ if (unpack->type != WT_CELL_KEY || key_offset > WT_K_MAX_KEY_OFFSET ||
+ unpack->size > WT_K_MAX_KEY_LEN)
+ v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->cell)) | WT_CELL_FLAG;
+ else
+ v = WT_K_ENCODE_KEY_CELL_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->cell)) |
+ WT_K_ENCODE_KEY_PREFIX(unpack->prefix) | WT_K_ENCODE_KEY_OFFSET(key_offset) |
+ WT_K_ENCODE_KEY_LEN(unpack->size) | WT_K_FLAG;
+
WT_ROW_KEY_SET(rip, v);
}
/*
* __wt_row_leaf_value_set --
- * Set a WT_ROW to reference an on-page row-store leaf value.
+ * Set a WT_ROW to reference an on-page row-store leaf key and value pair, if possible.
*/
static inline void
-__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack)
+__wt_row_leaf_value_set(WT_ROW *rip, WT_CELL_UNPACK_KV *unpack)
{
- uintptr_t key_len, key_offset, value_offset, v;
+ uintptr_t value_offset, value_size, v;
+ /* The row-store key can change underfoot; explicitly take a copy. */
v = (uintptr_t)WT_ROW_KEY_COPY(rip);
/*
* See the comment in __wt_row_leaf_key_info for an explanation of the magic.
+ *
+ * Only encoded keys can be upgraded to encoded key/value pairs.
*/
- if (!(v & WT_K_FLAG)) /* Already an encoded key */
+ if ((v & WT_KEY_FLAG_BITS) != WT_K_FLAG)
return;
- key_len = WT_K_DECODE_KEY_LEN(v); /* Key length */
- if (key_len > WT_KV_MAX_KEY_LEN)
+ if (WT_K_DECODE_KEY_CELL_OFFSET(v) > WT_KV_MAX_KEY_CELL_OFFSET) /* Key cell offset */
return;
- if (unpack->size > WT_KV_MAX_VALUE_LEN) /* Value length */
+ /*
+ * Not checking the prefix size, the field sizes are the same in both encodings.
+ *
+ * Not checking the key offset, the field sizes are the same in both encodings.
+ */
+ if (WT_K_DECODE_KEY_LEN(v) > WT_KV_MAX_KEY_LEN) /* Key len */
return;
- key_offset = WT_K_DECODE_KEY_OFFSET(v); /* Page offsets */
- if (key_offset > WT_KV_MAX_KEY_OFFSET)
+ value_offset = (uintptr_t)WT_PTRDIFF(unpack->data, unpack->cell);
+ if (value_offset > WT_KV_MAX_VALUE_OFFSET) /* Value offset */
return;
- value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data);
- if (value_offset > WT_KV_MAX_VALUE_OFFSET)
+ value_size = unpack->size;
+ if (value_size > WT_KV_MAX_VALUE_LEN) /* Value length */
return;
- v = WT_KV_ENCODE_KEY_LEN(key_len) | WT_KV_ENCODE_VALUE_LEN(unpack->size) |
- WT_KV_ENCODE_KEY_OFFSET(key_offset) | WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
- WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries);
+ v = WT_KV_ENCODE_KEY_CELL_OFFSET(WT_K_DECODE_KEY_CELL_OFFSET(v)) |
+ WT_KV_ENCODE_KEY_PREFIX(WT_K_DECODE_KEY_PREFIX(v)) |
+ WT_KV_ENCODE_KEY_OFFSET(WT_K_DECODE_KEY_OFFSET(v)) |
+ WT_KV_ENCODE_KEY_LEN(WT_K_DECODE_KEY_LEN(v)) | WT_KV_ENCODE_VALUE_OFFSET(value_offset) |
+ WT_KV_ENCODE_VALUE_LEN(value_size) | WT_KV_FLAG;
WT_ROW_KEY_SET(rip, v);
}
/*
+ * __wt_row_leaf_key_free --
+ * Discard any memory allocated for an instantiated key.
+ */
+static inline void
+__wt_row_leaf_key_free(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
+{
+ WT_IKEY *ikey;
+ void *copy;
+
+ /* The row-store key can change underfoot; explicitly take a copy. */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * If the key was a WT_IKEY allocation (that is, if it points somewhere other than the original
+ * page), free the memory.
+ */
+ __wt_row_leaf_key_info(page, copy, &ikey, NULL, NULL, NULL, NULL);
+ __wt_free(session, ikey);
+}
+
+/*
* __wt_row_leaf_key --
* Set a buffer to reference a row-store leaf page key as cheaply as possible.
*/
@@ -1083,7 +1147,12 @@ static inline int
__wt_row_leaf_key(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, bool instantiate)
{
+ WT_CELL *cell;
+ size_t group_size, key_size;
+ uint32_t slot;
+ uint8_t group_prefix, key_prefix;
void *copy;
+ const void *group_key, *key_data;
/*
* A front-end for __wt_row_leaf_key_work, here to inline fast paths.
@@ -1093,12 +1162,34 @@ __wt_row_leaf_key(
copy = WT_ROW_KEY_COPY(rip);
/*
- * All we handle here are on-page keys (which should be a common case), and instantiated keys
- * (which start out rare, but become more common as a leaf page is searched, instantiating
- * prefix-compressed keys).
+ * Handle keys taken directly from the disk image (which should be a common case), instantiated
+ * keys (rare initially, but possibly more common as leaf page search instantiates keys), and
+ * keys built using the most-used page key prefix.
+ *
+ * The most-used page key prefix, that is, the longest group of prefix-compressed keys on the
+ * page that can be built from a single, fully instantiated key, was tracked when the page was
+ * read. Build keys in that group by appending the key's bytes to the root key from which it was
+ * compressed.
*/
- if (__wt_row_leaf_key_info(page, copy, NULL, NULL, &key->data, &key->size))
+ __wt_row_leaf_key_info(page, copy, NULL, &cell, &key_data, &key_size, &key_prefix);
+ if (key_data != NULL && key_prefix == 0) {
+ key->data = key_data;
+ key->size = key_size;
return (0);
+ }
+ slot = WT_ROW_SLOT(page, rip);
+ if (key_data != NULL && slot > page->prefix_start && slot <= page->prefix_stop) {
+ /* The row-store key can change underfoot; explicitly take a copy. */
+ copy = WT_ROW_KEY_COPY(&page->pg_row[page->prefix_start]);
+ __wt_row_leaf_key_info(page, copy, NULL, NULL, &group_key, &group_size, &group_prefix);
+ if (group_key != NULL) {
+ WT_RET(__wt_buf_init(session, key, key_prefix + key_size));
+ memcpy(key->mem, group_key, key_prefix);
+ memcpy((uint8_t *)key->mem + key_prefix, key_data, key_size);
+ key->size = key_prefix + key_size;
+ return (0);
+ }
+ }
/*
* The alternative is an on-page cell with some kind of compressed or overflow key that's never
@@ -1108,55 +1199,103 @@ __wt_row_leaf_key(
}
/*
- * __wt_row_leaf_value_cell --
- * Return the unpacked value for a row-store leaf page key.
+ * __wt_row_leaf_key_instantiate --
+ * Instantiate the keys on a leaf page as needed.
*/
-static inline void
-__wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip,
- WT_CELL_UNPACK_KV *kpack, WT_CELL_UNPACK_KV *vpack)
+static inline int
+__wt_row_leaf_key_instantiate(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- WT_CELL *kcell, *vcell;
- WT_CELL_UNPACK_KV unpack;
- size_t size;
- void *copy, *key;
+ WT_CELL *cell;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_ROW *rip;
+ size_t key_size;
+ uint32_t i, slot;
+ uint8_t key_prefix;
+ u_int skip;
+ void *copy;
+ const void *key_data;
- size = 0; /* -Werror=maybe-uninitialized */
- key = NULL; /* -Werror=maybe-uninitialized */
+ /*
+ * Cursor previous traversals will be too slow in the case of a set of prefix-compressed keys
+ * requiring long roll-forward processing. In the worst case, each key would require processing
+ * every key appearing before it on the page as we walk backwards through the page. If we're
+ * doing a cursor previous call, and this page has never been checked for excessively long
+ * stretches of prefix-compressed keys, do it now.
+ */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
+ return (0);
+ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
- /* If we already have an unpacked key cell, use it. */
- if (kpack != NULL)
- vcell = (WT_CELL *)((uint8_t *)kpack->cell + __wt_cell_total_len(kpack));
- else {
+ /* Walk the keys, making sure there's something easy to work with periodically. */
+ skip = 0;
+ WT_ROW_FOREACH (page, rip, i) {
/*
- * The row-store key can change underfoot; explicitly take a copy.
+ * Get the key's information. The row-store key can change underfoot; explicitly take a
+ * copy.
*/
copy = WT_ROW_KEY_COPY(rip);
+ __wt_row_leaf_key_info(page, copy, NULL, &cell, &key_data, &key_size, &key_prefix);
/*
- * Figure out where the key is, step past it to the value cell. The test for a cell not
- * being set tells us that we have an on-page key, otherwise we're looking at an
- * instantiated key or on-page cell, both of which require an unpack of the key's cell to
- * find the value cell that follows.
+ * If the key isn't prefix compressed, or is a prefix-compressed key we can derive from the
+ * group record, we're done.
*/
- if (__wt_row_leaf_key_info(page, copy, NULL, &kcell, &key, &size) && kcell == NULL)
- vcell = (WT_CELL *)((uint8_t *)key + size);
- else {
- __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack);
- vcell = (WT_CELL *)((uint8_t *)unpack.cell + __wt_cell_total_len(&unpack));
+ slot = WT_ROW_SLOT(page, rip);
+ if (key_data != NULL &&
+ (key_prefix == 0 || (slot > page->prefix_start && slot <= page->prefix_stop))) {
+ skip = 0;
+ continue;
+ }
+
+ /*
+ * Skip overflow keys: we'll instantiate them on demand and they don't require any special
+ * processing (but they don't help with long strings of prefix compressed keys, either, so
+ * we'll likely want to instantiate the first key we find after a long stretch of overflow
+ * keys). More importantly, we don't want to instantiate them for a cursor traversal; we
+ * only want to instantiate them for a tree search, as that's likely to happen repeatedly.
+ */
+ if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) {
+ ++skip;
+ continue;
+ }
+
+ /*
+ * If we skip 10 keys, instantiate one, limiting how far we're forced to roll backward. (The
+ * value 10 was chosen for no particular reason.) There are still cases where we might not
+ * need to instantiate this key (for example, a key too large to be encoded, but still
+ * on-page and not prefix-compressed). Let the underlying worker function figure that out,
+ * we should have found the vast majority of cases by now.
+ */
+ if (++skip >= 10) {
+ if (key == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &key));
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, true));
+ skip = 0;
}
}
- __wt_cell_unpack_kv(session, page->dsk, __wt_cell_leaf_value_parse(page, vcell), vpack);
+err:
+ __wt_scr_free(session, &key);
+ return (ret);
}
/*
- * __wt_row_leaf_value_exists --
- * Check if the value for a row-store leaf page encoded key/value pair exists.
+ * __wt_row_leaf_value_is_encoded --
+ * Return if the value for a row-store leaf page is an encoded key/value pair.
*/
static inline bool
-__wt_row_leaf_value_exists(WT_ROW *rip)
+__wt_row_leaf_value_is_encoded(WT_ROW *rip)
{
- return (((uintptr_t)WT_ROW_KEY_COPY(rip) & 0x03) == WT_KV_FLAG);
+ uintptr_t v;
+
+ /* The row-store key can change underfoot; explicitly take a copy. */
+ v = (uintptr_t)WT_ROW_KEY_COPY(rip);
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the magic.
+ */
+ return ((v & WT_KEY_FLAG_BITS) == WT_KV_FLAG);
}
/*
@@ -1171,11 +1310,22 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
/* The row-store key can change underfoot; explicitly take a copy. */
v = (uintptr_t)WT_ROW_KEY_COPY(rip);
- /*
- * See the comment in __wt_row_leaf_key_info for an explanation of the magic.
- */
- if ((v & 0x03) == WT_KV_FLAG) {
- value->data = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
+ if ((v & WT_KEY_FLAG_BITS) == WT_KV_FLAG) {
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the magic.
+ *
+ * Normally a value is represented by the value's cell in the disk image (or an update), but
+ * there is a fast path for returning a simple value, where it's worth the additional effort
+ * of encoding the value in the per-row reference and retrieving it. This function does that
+ * work, while most value retrieval goes through the "return the unpacked cell" version.
+ *
+ * The value's data is the page offset of the key's cell, plus the key's offset, plus the
+ * key's size, plus the value's offset: in other words, we know where the key's cell starts,
+ * the key's data ends the key's cell, and the value cell immediately follows. Skip past the
+ * key cell to the value cell, then skip to the start of the value's data.
+ */
+ value->data = (uint8_t *)WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_CELL_OFFSET(v)) +
+ WT_KV_DECODE_KEY_OFFSET(v) + WT_KV_DECODE_KEY_LEN(v) + WT_KV_DECODE_VALUE_OFFSET(v);
value->size = WT_KV_DECODE_VALUE_LEN(v);
return (true);
}
@@ -1183,6 +1333,55 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
}
/*
+ * __wt_row_leaf_value_cell --
+ * Return the unpacked value for a row-store leaf page key.
+ */
+static inline void
+__wt_row_leaf_value_cell(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *vpack)
+{
+ WT_CELL *kcell, *vcell;
+ WT_CELL_UNPACK_KV unpack;
+ WT_IKEY *ikey;
+ uintptr_t v;
+
+ /* The row-store key can change underfoot; explicitly take a copy. */
+ v = (uintptr_t)WT_ROW_KEY_COPY(rip);
+
+ kcell = vcell = NULL;
+ switch (v & WT_KEY_FLAG_BITS) {
+ case WT_CELL_FLAG:
+ /* We have a direct reference to the key's cell, step past it to the value's cell. */
+ kcell = (WT_CELL *)WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
+ break;
+ case WT_K_FLAG:
+ /* We have an encoded on-page key, the value's cell follows the key's data. */
+ vcell = (WT_CELL *)((uint8_t *)WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_CELL_OFFSET(v)) +
+ WT_K_DECODE_KEY_OFFSET(v) + WT_K_DECODE_KEY_LEN(v));
+ break;
+ case WT_KV_FLAG:
+ /* We have an encoded on-page key/value pair, the value's cell follows the key's data. */
+ vcell = (WT_CELL *)((uint8_t *)WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_CELL_OFFSET(v)) +
+ WT_KV_DECODE_KEY_OFFSET(v) + WT_KV_DECODE_KEY_LEN(v));
+ break;
+ default:
+ /* We have an instantiated key, the key cell's offset is included in the structure. */
+ ikey = (WT_IKEY *)v;
+ kcell =
+ ikey->cell_offset == 0 ? NULL : (WT_CELL *)WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ break;
+ }
+
+ /* If we only have the key cell, unpack it and skip past it to the value cell. */
+ if (vcell == NULL) {
+ __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack);
+ vcell = (WT_CELL *)((uint8_t *)unpack.cell + __wt_cell_total_len(&unpack));
+ }
+
+ __wt_cell_unpack_kv(session, page->dsk, __wt_cell_leaf_value_parse(page, vcell), vpack);
+}
+
+/*
* __wt_ref_addr_copy --
* Return a copy of the WT_REF address information.
*/
@@ -1726,14 +1925,13 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32
bool acquired;
/*
- * This function is here to simplify the error handling during hazard
- * pointer coupling so we never leave a hazard pointer dangling. The
- * assumption is we're holding a hazard pointer on "held", and want to
- * acquire a hazard pointer on "want", releasing the hazard pointer on
+ * This function is here to simplify the error handling during hazard pointer coupling so we
+ * never leave a hazard pointer dangling. The assumption is we're holding a hazard pointer on
+ * "held", and want to acquire a hazard pointer on "want", releasing the hazard pointer on
* "held" when we're done.
*
- * When walking the tree, we sometimes swap to the same page. Fast-path
- * that to avoid thinking about error handling.
+ * When walking the tree, we sometimes swap to the same page. Fast-path that to avoid thinking
+ * about error handling.
*/
if (held == want)
return (0);
diff --git a/src/third_party/wiredtiger/src/include/buf_inline.h b/src/third_party/wiredtiger/src/include/buf_inline.h
index f38a632b4e4..610ccf8d698 100644
--- a/src/third_party/wiredtiger/src/include/buf_inline.h
+++ b/src/third_party/wiredtiger/src/include/buf_inline.h
@@ -13,8 +13,14 @@
static inline int
__wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
{
- return (
- size > buf->memsize || !WT_DATA_IN_ITEM(buf) ? __wt_buf_grow_worker(session, buf, size) : 0);
+ /*
+ * Take any offset in the buffer into account when calculating the size to allocate, it saves
+ * complex calculations in our callers to decide if the buffer is large enough in the case of
+ * buffers with offset data pointers.
+ */
+ return (!WT_DATA_IN_ITEM(buf) || size + WT_PTRDIFF(buf->data, buf->mem) > buf->memsize ?
+ __wt_buf_grow_worker(session, buf, size) :
+ 0);
}
/*
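The __wt_buf_grow change above accounts for the offset of the data pointer inside the allocation when deciding whether to grow. A small standalone sketch of that check with a generic buffer type (not WT_ITEM), showing why the data offset has to be added to the requested size:

    /* Illustrative sketch: size checks for a buffer whose data pointer may be offset. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct buf {
        void *mem;      /* Allocation start */
        size_t memsize; /* Allocation size */
        void *data;     /* May point into the middle of mem */
    };

    /* Does "size" bytes at the (possibly offset) data pointer fit in the allocation? */
    static bool
    needs_grow(const struct buf *b, size_t size)
    {
        uint8_t *data, *mem;
        size_t offset;

        data = b->data;
        mem = b->mem;
        if (data < mem || data >= mem + b->memsize)
            return (true); /* Data lives elsewhere: we must allocate and copy. */
        offset = (size_t)(data - mem);
        return (size + offset > b->memsize);
    }

    int
    main(void)
    {
        uint8_t mem[64];
        struct buf b = {mem, sizeof(mem), mem + 16};

        /* 40 bytes at offset 16 fits in 64; 56 bytes does not. */
        printf("grow for 40? %d\n", needs_grow(&b, 40));
        printf("grow for 56? %d\n", needs_grow(&b, 56));
        return (0);
    }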
diff --git a/src/third_party/wiredtiger/src/include/cursor_inline.h b/src/third_party/wiredtiger/src/include/cursor_inline.h
index 4c5889b6b9e..8325e0ad8e6 100644
--- a/src/third_party/wiredtiger/src/include/cursor_inline.h
+++ b/src/third_party/wiredtiger/src/include/cursor_inline.h
@@ -446,16 +446,16 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)
* Return a row-store leaf page slot's key.
*/
static inline int
-__cursor_row_slot_key_return(
- WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_CELL_UNPACK_KV *kpack, bool *kpack_used)
+__cursor_row_slot_key_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_CELL_UNPACK_KV *kpack)
{
WT_CELL *cell;
WT_ITEM *kb;
WT_PAGE *page;
WT_SESSION_IMPL *session;
+ size_t key_size;
+ uint8_t key_prefix;
void *copy;
-
- *kpack_used = false;
+ const void *key_data;
session = CUR2S(cbt);
page = cbt->ref->page;
@@ -468,47 +468,53 @@ __cursor_row_slot_key_return(
copy = WT_ROW_KEY_COPY(rip);
/*
- * Get a key: we could just call __wt_row_leaf_key, but as a cursor is running through the tree,
- * we may have additional information here (we may have the fully-built key that's immediately
- * before the prefix-compressed key we want, so it's a faster construction).
- *
- * First, check for an immediately available key.
+ * Check for an immediately available key from an encoded or instantiated key, and if that's not
+ * available, from the unpacked cell.
*/
- if (__wt_row_leaf_key_info(page, copy, NULL, &cell, &kb->data, &kb->size))
+ __wt_row_leaf_key_info(page, copy, NULL, &cell, &key_data, &key_size, &key_prefix);
+ if (key_data == NULL) {
+ if (__wt_cell_type(cell) != WT_CELL_KEY)
+ goto slow;
+ __wt_cell_unpack_kv(session, page->dsk, cell, kpack);
+ key_data = kpack->data;
+ key_size = kpack->size;
+ key_prefix = kpack->prefix;
+ }
+ if (key_prefix == 0) {
+ kb->data = key_data;
+ kb->size = key_size;
return (0);
+ }
/*
- * Unpack the cell and deal with overflow and prefix-compressed keys. Inline building simple
- * prefix-compressed keys from a previous key, otherwise build from scratch.
+ * A prefix compressed key. As a cursor is running through the tree, we may have the fully-built
+ * key immediately before the prefix-compressed key we want, so it's faster to build here.
+ */
+ if (cbt->rip_saved == NULL || cbt->rip_saved != rip - 1)
+ goto slow;
+
+ /*
+ * Inline building simple prefix-compressed keys from a previous key.
*
- * Clear the key cell structure. It shouldn't be necessary (as far as I can tell, and we don't
- * do it in lots of other places), but disabling shared builds (--disable-shared) results in the
- * compiler complaining about uninitialized field use.
+ * Grow the buffer as necessary as well as ensure data has been copied into local buffer space,
+ * then append the suffix to the prefix already in the buffer. To avoid growing the buffer
+ * unnecessarily or copying data we don't need, truncate the item's current data length to the
+ * prefix bytes before growing the buffer.
*/
- memset(kpack, 0, sizeof(*kpack));
- __wt_cell_unpack_kv(session, page->dsk, cell, kpack);
- *kpack_used = true;
- if (kpack->type == WT_CELL_KEY && cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) {
- WT_ASSERT(session, cbt->row_key->size >= kpack->prefix);
-
- /*
- * Grow the buffer as necessary as well as ensure data has been copied into local buffer
- * space, then append the suffix to the prefix already in the buffer.
- *
- * Don't grow the buffer unnecessarily or copy data we don't need, truncate the item's data
- * length to the prefix bytes.
- */
- cbt->row_key->size = kpack->prefix;
- WT_RET(__wt_buf_grow(session, cbt->row_key, cbt->row_key->size + kpack->size));
- memcpy((uint8_t *)cbt->row_key->data + cbt->row_key->size, kpack->data, kpack->size);
- cbt->row_key->size += kpack->size;
- } else {
- /*
- * Call __wt_row_leaf_key_work instead of __wt_row_leaf_key: we already did
- * __wt_row_leaf_key's fast-path checks inline.
- */
+ WT_ASSERT(session, cbt->row_key->size >= key_prefix);
+ cbt->row_key->size = key_prefix;
+ WT_RET(__wt_buf_grow(session, cbt->row_key, key_prefix + key_size));
+ memcpy((uint8_t *)cbt->row_key->data + key_prefix, key_data, key_size);
+ cbt->row_key->size = key_prefix + key_size;
+
+ if (0) {
+slow: /*
+ * Call __wt_row_leaf_key_work() instead of __wt_row_leaf_key(): we already did the
+ * __wt_row_leaf_key() fast-path checks inline.
+ */
WT_RET(__wt_row_leaf_key_work(session, page, rip, cbt->row_key, false));
}
+
kb->data = cbt->row_key->data;
kb->size = cbt->row_key->size;
cbt->rip_saved = rip;
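The inline path above rebuilds a prefix-compressed key by reusing the previous row's fully-built key and appending the new suffix after the shared prefix. Here is a standalone sketch of that reconstruction with plain C buffers and an invented pfx_key type, rather than WT_ITEM and __wt_buf_grow:

    /* Illustrative sketch: rebuild prefix-compressed keys from the previous full key. */
    #include <stdio.h>
    #include <string.h>

    struct pfx_key {
        size_t prefix;      /* Bytes shared with the previous key */
        const char *suffix; /* Remaining bytes stored for this key */
    };

    int
    main(void)
    {
        /* "apple", "applet", "apply", "banana" stored with prefix compression. */
        struct pfx_key keys[] = {{0, "apple"}, {5, "t"}, {4, "y"}, {0, "banana"}};
        char built[64];
        size_t i;

        for (i = 0; i < sizeof(keys) / sizeof(keys[0]); ++i) {
            /* Keep the first "prefix" bytes of the previous key, append this key's suffix. */
            memcpy(built + keys[i].prefix, keys[i].suffix, strlen(keys[i].suffix) + 1);
            printf("key %zu: %s\n", i, built);
        }
        return (0);
    }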
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
index 967d0b08be4..1ca46550587 100644
--- a/src/third_party/wiredtiger/src/include/dhandle.h
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -104,7 +104,14 @@ struct __wt_data_handle {
WT_DSRC_STATS *stats[WT_COUNTER_SLOTS];
WT_DSRC_STATS *stat_array;
-/* Flags values over 0xff are reserved for WT_BTREE_* */
+/*
+ * Flags values over 0xfff are reserved for WT_BTREE_*. This lets us combine the dhandle and btree
+ * flags when we need, for example, to pass both sets in a function call.
+ *
+ * To help avoid accidental overrun of the flag values, we add a special flag value that should
+ * always be the last and highest. We use this value to assert that the dhandle flags haven't run
+ * into the space reserved for btree flags.
+ */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_DHANDLE_DEAD 0x001u /* Dead, awaiting discard */
#define WT_DHANDLE_DISCARD 0x002u /* Close on release */
@@ -115,8 +122,13 @@ struct __wt_data_handle {
#define WT_DHANDLE_IS_METADATA 0x040u /* Metadata handle */
#define WT_DHANDLE_LOCK_ONLY 0x080u /* Handle only used as a lock */
#define WT_DHANDLE_OPEN 0x100u /* Handle is open */
+#define WT_DHANDLE_ZZZ_ENDFLAG 0x200u /* One past highest flag value */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t flags;
+#define WT_DHANDLE_MAX_FLAG 0x1000u /* Used to ensure we don't overflow legal flag values */
+#if WT_DHANDLE_ZZZ_ENDFLAG > WT_DHANDLE_MAX_FLAG
+#error "Too many dhandle flags"
+#endif
/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_DHANDLE_ASSERT_TS_READ_ALWAYS 0x001u /* Assert read always checking. */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index fb5c8e361ba..15fbcdb4a74 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1041,8 +1041,6 @@ extern int __wt_meta_apply_all(WT_SESSION_IMPL *session,
int (*file_func)(WT_SESSION_IMPL *, const char *[]),
int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[])
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_meta_blk_mods_load(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt,
- bool rename) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_block_metadata(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint,
@@ -1052,15 +1050,18 @@ extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fnam
extern int __wt_meta_checkpoint_last_name(WT_SESSION_IMPL *session, const char *fname,
const char **namep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_ckptlist_get(WT_SESSION_IMPL *session, const char *fname, bool update,
- WT_CKPT **ckptbasep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ WT_CKPT **ckptbasep, size_t *allocated) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_ckptlist_get_from_config(WT_SESSION_IMPL *session, bool update,
- WT_CKPT **ckptbasep, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ WT_CKPT **ckptbasep, size_t *allocatedp, const char *config)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase,
WT_LSN *ckptlsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_ckptlist_to_meta(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_ITEM *buf)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_ckptlist_update_config(WT_SESSION_IMPL *session, WT_CKPT *ckptbase,
const char *oldcfg, char **newcfgp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_meta_saved_ckptlist_get(WT_SESSION_IMPL *session, const char *fname,
+ WT_CKPT **ckptbasep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_sysinfo_set(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session)
@@ -1236,6 +1237,8 @@ extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOK
uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name, bool durable)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_reset_blkmod(WT_SESSION_IMPL *session, const char *orig_config, WT_ITEM *buf)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key,
@@ -1250,8 +1253,6 @@ extern int __wt_row_leaf_key_copy(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RO
WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg,
WT_ITEM *keyb, bool instantiate) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value,
WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -1752,6 +1753,7 @@ extern void __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_t
extern void __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt);
extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT **ckptbasep);
+extern void __wt_meta_saved_ckptlist_free(WT_SESSION_IMPL *session);
extern void __wt_meta_track_discard(WT_SESSION_IMPL *session);
extern void __wt_meta_track_sub_on(WT_SESSION_IMPL *session);
extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase)
@@ -1898,11 +1900,9 @@ static inline bool __wt_ref_addr_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_
static inline bool __wt_ref_cas_state_int(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t old_state,
uint8_t new_state, const char *func, int line) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_ref_is_root(WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-static inline bool __wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp,
- WT_CELL **cellp, void *datap, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-static inline bool __wt_row_leaf_value_exists(WT_ROW *rip)
+static inline bool __wt_row_leaf_value_is_encoded(WT_ROW *rip)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_session_can_wait(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -2051,6 +2051,8 @@ static inline int __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_row_leaf_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip,
WT_ITEM *key, bool instantiate) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline int __wt_row_leaf_key_instantiate(WT_SESSION_IMPL *session, WT_PAGE *page)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_snprintf(char *buf, size_t size, const char *fmt, ...)
WT_GCC_FUNC_DECL_ATTRIBUTE((format(printf, 3, 4)))
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -2262,11 +2264,13 @@ static inline void __wt_rec_incr(
static inline void __wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep);
static inline void __wt_ref_key_clear(WT_REF *ref);
static inline void __wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK_ADDR *unpack);
+static inline void __wt_row_leaf_key_free(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip);
+static inline void __wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp,
+ WT_CELL **cellp, void *datap, size_t *sizep, uint8_t *prefixp);
static inline void __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack);
-static inline void __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell);
-static inline void __wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip,
- WT_CELL_UNPACK_KV *kpack, WT_CELL_UNPACK_KV *vpack);
-static inline void __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack);
+static inline void __wt_row_leaf_value_cell(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *vpack);
+static inline void __wt_row_leaf_value_set(WT_ROW *rip, WT_CELL_UNPACK_KV *unpack);
static inline void __wt_scr_free(WT_SESSION_IMPL *session, WT_ITEM **bufp);
static inline void __wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp);
static inline void __wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp);
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
index 22b0de65308..924aacc54db 100644
--- a/src/third_party/wiredtiger/src/include/meta.h
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -118,6 +118,8 @@ struct __wt_block_mods {
*/
#define WT_CHECKPOINT "WiredTigerCheckpoint"
#define WT_CKPT_FOREACH(ckptbase, ckpt) for ((ckpt) = (ckptbase); (ckpt)->name != NULL; ++(ckpt))
+#define WT_CKPT_FOREACH_NAME_OR_ORDER(ckptbase, ckpt) \
+ for ((ckpt) = (ckptbase); (ckpt)->name != NULL || (ckpt)->order != 0; ++(ckpt))
struct __wt_ckpt {
char *name; /* Name or NULL */
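
WT_CKPT_FOREACH stops at the first entry without a name, while the new WT_CKPT_FOREACH_NAME_OR_ORDER also visits entries that only carry an order number, such as a checkpoint slot that has been allocated but not yet named. A standalone sketch with a simplified two-field struct (not the real WT_CKPT) showing the difference:

#include <stdint.h>
#include <stdio.h>

struct ex_ckpt {
    const char *name; /* Name or NULL */
    int64_t order;    /* Checkpoint order */
};

#define EX_CKPT_FOREACH(base, c) for ((c) = (base); (c)->name != NULL; ++(c))
#define EX_CKPT_FOREACH_NAME_OR_ORDER(base, c) \
    for ((c) = (base); (c)->name != NULL || (c)->order != 0; ++(c))

int
main(void)
{
    struct ex_ckpt list[] = {
        {"WiredTigerCheckpoint.1", 1},
        {"WiredTigerCheckpoint.2", 2},
        {NULL, 3}, /* Allocated for the next checkpoint, not yet named. */
        {NULL, 0}, /* End marker. */
    };
    struct ex_ckpt *c;
    int by_name, by_name_or_order;

    by_name = by_name_or_order = 0;
    EX_CKPT_FOREACH (list, c)
        ++by_name;
    EX_CKPT_FOREACH_NAME_OR_ORDER (list, c)
        ++by_name_or_order;

    /* Prints "2 3": only the second walk sees the unnamed slot. */
    printf("%d %d\n", by_name, by_name_or_order);
    return (0);
}
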
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index 14cf9133502..4451d2aa638 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -336,7 +336,7 @@ union __wt_rand_state {
do { \
size_t __len, __space; \
va_list __ap; \
- int __ret_xx; /* __ret already used by WT_RET */ \
+ int __ret_xx; /* __ret already used by WT_ERR */ \
char *__p; \
\
/* \
@@ -355,7 +355,7 @@ union __wt_rand_state {
va_start(__ap, fmt); \
__ret_xx = __wt_vsnprintf_len_set(__p, __space, &__len, fmt, __ap); \
va_end(__ap); \
- WT_RET(__ret_xx); \
+ WT_ERR(__ret_xx); \
\
/* Check if there was enough space. */ \
if (__len < __space) { \
@@ -368,6 +368,6 @@ union __wt_rand_state {
* If not, double the size of the buffer: we're dealing \
* with strings, we don't expect the size to get huge. \
*/ \
- WT_RET(__wt_buf_extend(session, buf, (buf)->size + __len + 1)); \
+ WT_ERR(__wt_buf_extend(session, buf, (buf)->size + __len + 1)); \
} \
} while (0)
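
The WT_RET to WT_ERR change means a formatting failure inside this macro now branches to the enclosing function's err label rather than returning from the middle of the function. The buffer logic itself is a grow-and-retry loop around vsnprintf; a self-contained sketch of that idiom, independent of the WT_ITEM and error macros, assuming the caller passes a malloc'ed, nul-terminated buffer:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Append formatted output to a heap buffer, doubling it until the result
 * fits. *sizep is the allocation size, *lenp the current string length.
 */
static int
ex_buf_catfmt(char **bufp, size_t *sizep, size_t *lenp, const char *fmt, ...)
{
    va_list ap;
    size_t space;
    char *p;
    int n;

    for (;;) {
        space = *sizep - *lenp;
        va_start(ap, fmt);
        n = vsnprintf(*bufp + *lenp, space, fmt, ap);
        va_end(ap);
        if (n < 0)
            return (-1);
        if ((size_t)n < space) { /* It fit, including the nul byte. */
            *lenp += (size_t)n;
            return (0);
        }
        /* Not enough space: double the buffer and retry. */
        if ((p = realloc(*bufp, *sizep * 2)) == NULL)
            return (-1);
        *bufp = p;
        *sizep *= 2;
    }
}
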
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index f4bda5f5434..00057870a0c 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -252,6 +252,10 @@ struct __wt_reconcile {
WT_ITEM *cur, _cur; /* Key/Value being built */
WT_ITEM *last, _last; /* Last key/value built */
+/* Don't increase key prefix-compression unless there's a significant gain. */
+#define WT_KEY_PREFIX_PREVIOUS_MINIMUM 10
+ uint8_t key_pfx_last; /* Last prefix compression */
+
bool key_pfx_compress; /* If can prefix-compress next key */
bool key_pfx_compress_conf; /* If prefix compression configured */
bool key_sfx_compress; /* If can suffix-compress next key */
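
For WT-7234, the new key_pfx_last field remembers the prefix length written for the previous key, and WT_KEY_PREFIX_PREVIOUS_MINIMUM limits how eagerly the stored prefix may grow. The exact policy lives in the row-store reconciliation code; the following is only a simplified, standalone illustration of the heuristic named in the comment:

#include <stddef.h>
#include <stdint.h>

#define EX_PREFIX_PREVIOUS_MINIMUM 10 /* Mirrors WT_KEY_PREFIX_PREVIOUS_MINIMUM. */

/*
 * Pick a prefix-compression length for "key": compute the common prefix
 * with the previous key, but only move past the previously used prefix
 * when the gain is significant.
 */
static uint8_t
ex_choose_prefix(const uint8_t *last_key, size_t last_size, const uint8_t *key,
  size_t key_size, uint8_t last_pfx)
{
    size_t i, max;
    uint8_t pfx;

    max = last_size < key_size ? last_size : key_size;
    if (max > UINT8_MAX)
        max = UINT8_MAX;
    for (i = 0; i < max && last_key[i] == key[i]; ++i)
        ;
    pfx = (uint8_t)i;

    /* Keep the previous prefix unless the new one gains enough bytes. */
    if (pfx > last_pfx && (uint8_t)(pfx - last_pfx) < EX_PREFIX_PREVIOUS_MINIMUM)
        pfx = last_pfx;
    return (pfx);
}
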
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 534d4a1cf40..540c79187b4 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -624,7 +624,10 @@ struct __wt_connection_stats {
int64_t page_sleep;
int64_t page_del_rollback_blocked;
int64_t child_modify_blocked_page;
- int64_t txn_prepared_updates_count;
+ int64_t txn_prepared_updates;
+ int64_t txn_prepared_updates_committed;
+ int64_t txn_prepared_updates_key_repeated;
+ int64_t txn_prepared_updates_rolledback;
int64_t txn_prepare;
int64_t txn_prepare_commit;
int64_t txn_prepare_active;
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 7dbc17b9063..07f1599aca8 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -281,6 +281,9 @@ struct __wt_txn {
WT_TXN_OP *mod;
size_t mod_alloc;
u_int mod_count;
+#ifdef HAVE_DIAGNOSTIC
+ u_int prepare_count;
+#endif
/* Scratch buffer for in-memory log records. */
WT_ITEM *logrec;
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index a4a1b584b35..c21c3dac748 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -480,6 +480,8 @@ struct __wt_cursor {
* (as it partially depends on the underlying file configuration), but
* is always a small number of bytes less than 4GB.
*
+ * The WT_CURSOR::insert method can only be used at snapshot isolation.
+ *
* @param cursor the cursor handle
* @errors
* In particular, if \c overwrite=false is configured and a record with
@@ -491,10 +493,8 @@ struct __wt_cursor {
int __F(insert)(WT_CURSOR *cursor);
/*!
- * Modify an existing record.
- *
- * Both the key and value must be set and the record must already exist;
- * the record will be updated.
+ * Modify an existing record. Both the key and value must be set and the record must
+ * already exist.
*
* Modifications are specified in WT_MODIFY structures. Modifications
* are applied in order and later modifications can update earlier ones.
@@ -503,9 +503,6 @@ struct __wt_cursor {
* \c S), or raw byte arrays accessed using a WT_ITEM structure (value
* format type \c u).
*
- * The WT_CURSOR::modify method can only be called from within an
- * explicit transaction configured at the snapshot isolation level.
- *
* The WT_CURSOR::modify method stores a change record in cache and
* writes a change record to the log instead of the usual complete
* values. Note that WT_CURSOR::modify is generally slower than the
@@ -526,6 +523,8 @@ struct __wt_cursor {
* (as it partially depends on the underlying file configuration), but
* is always a small number of bytes less than 4GB.
*
+ * The WT_CURSOR::modify method can only be used at snapshot isolation.
+ *
* @param cursor the cursor handle
* @param entries an array of modification data structures
* @param nentries the number of modification data structures
@@ -561,6 +560,8 @@ struct __wt_cursor {
* (as it partially depends on the underlying file configuration), but
* is always a small number of bytes less than 4GB.
*
+ * The WT_CURSOR::update method can only be used at snapshot isolation.
+ *
* @param cursor the cursor handle
* @errors
* In particular, if \c overwrite=false is configured and no record with
@@ -594,6 +595,8 @@ struct __wt_cursor {
* (that is, a store with an 'r' type key and 't' type value) is
* identical to setting the record's value to 0.
*
+ * The WT_CURSOR::remove method can only be used at snapshot isolation.
+ *
* @param cursor the cursor handle
* @errors
*/
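
The notes added to insert, modify, update and remove all state the same requirement: cursor writes happen inside a transaction running at snapshot isolation, which is WiredTiger's default. A hedged usage sketch with the public C API; "table:example" is a placeholder, the table is assumed to have been created with key_format=S,value_format=S, and cleanup on the early error paths is omitted for brevity:

#include <wiredtiger.h>

static int
example_snapshot_update(WT_CONNECTION *conn)
{
    WT_SESSION *session;
    WT_CURSOR *cursor;
    int ret;

    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
        return (ret);
    if ((ret = session->open_cursor(
           session, "table:example", NULL, NULL, &cursor)) != 0)
        return (ret);

    /* Snapshot isolation is the default; it is spelled out here anyway. */
    if ((ret = session->begin_transaction(session, "isolation=snapshot")) != 0)
        return (ret);
    cursor->set_key(cursor, "some-key");
    cursor->set_value(cursor, "some-value");
    if ((ret = cursor->update(cursor)) != 0)
        (void)session->rollback_transaction(session, NULL);
    else
        ret = session->commit_transaction(session, NULL);

    (void)session->close(session, NULL); /* Closes the cursor as well. */
    return (ret);
}
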
@@ -1119,8 +1122,8 @@ struct __wt_session {
* \c none.}
* @config{format, the file format., a string\, chosen from the following options: \c
* "btree"; default \c btree.}
- * @config{huffman_key, This option is no longer supported. Retained for backward
- * compatibility. See @ref huffman for more information., a string; default \c none.}
+ * @config{huffman_key, This option is no longer supported\, retained for backward
+ * compatibility., a string; default \c none.}
* @config{huffman_value, configure Huffman encoding for values. Permitted values are \c
* "none"\, \c "english"\, \c "utf8<file>" or \c "utf16<file>". See @ref huffman for more
* information., a string; default \c none.}
@@ -1140,6 +1143,8 @@ struct __wt_session {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;repair, whether to reconstruct the metadata from
* the raw file content., a boolean flag; default \c false.}
* @config{ ),,}
+ * @config{internal_item_max, This option is no longer supported\, retained for backward
+ * compatibility., an integer greater than or equal to 0; default \c 0.}
* @config{internal_key_max, the largest key stored in an internal node\, in bytes. If
* set\, keys larger than the specified size are stored as overflow items (which may require
* additional I/O to access). The default and the maximum allowed value are both one-tenth
@@ -1158,6 +1163,10 @@ struct __wt_session {
* use WT_ITEM structures to manipulate raw byte arrays. By default\, records are stored in
* row-store files: keys of type \c 'r' are record numbers and records referenced by record
* number are stored in column-store files., a format string; default \c u.}
+ * @config{key_gap, This option is no longer supported\, retained for backward
+ * compatibility., an integer greater than or equal to 0; default \c 10.}
+ * @config{leaf_item_max, This option is no longer supported\, retained for backward
+ * compatibility., an integer greater than or equal to 0; default \c 0.}
* @config{leaf_key_max, the largest key stored in a leaf node\, in bytes. If set\, keys
* larger than the specified size are stored as overflow items (which may require additional
* I/O to access). The default value is one-tenth the size of a newly split leaf page., an
@@ -1511,16 +1520,18 @@ struct __wt_session {
* contains.
* @snippet ex_all.c Truncate a range
*
- * Any specified cursors end with no position, and subsequent calls to
- * the WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the
- * beginning (end) of the table.
- *
* When a range truncate is in progress, and another transaction inserts
* a key into that range, the behavior is not well defined - a conflict
* may be detected or both transactions may be permitted to commit. If
* they do commit, and if there is a crash and recovery runs, the result
* may be different than what was in cache before the crash.
*
+     * The WT_SESSION::truncate range truncate operation can only be used at snapshot isolation.

+ *
+ * Any specified cursors end with no position, and subsequent calls to
+ * the WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the
+ * beginning (end) of the table.
+ *
* Truncate a backup cursor. This operation removes all log files that
* have been returned by the backup cursor. It can be used to remove log
* files after copying them during @ref backup_incremental.
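
A hedged sketch of the range truncate described above, following the ex_all.c "Truncate a range" pattern referenced earlier; "table:mytable" and the keys are placeholders and assume key_format=S:

#include <wiredtiger.h>

static int
example_truncate_range(WT_SESSION *session)
{
    WT_CURSOR *start, *stop;
    int ret;

    if ((ret = session->open_cursor(
           session, "table:mytable", NULL, NULL, &start)) != 0)
        return (ret);
    if ((ret = session->open_cursor(
           session, "table:mytable", NULL, NULL, &stop)) != 0)
        return (ret);

    if ((ret = session->begin_transaction(session, "isolation=snapshot")) != 0)
        return (ret);

    /* Position both cursors on the range boundaries. */
    start->set_key(start, "June01");
    if ((ret = start->search(start)) != 0)
        goto err;
    stop->set_key(stop, "June30");
    if ((ret = stop->search(stop)) != 0)
        goto err;

    if ((ret = session->truncate(session, NULL, start, stop, NULL)) != 0)
        goto err;
    /* After this, both cursors are left with no position. */
    return (session->commit_transaction(session, NULL));

err:
    (void)session->rollback_transaction(session, NULL);
    return (ret);
}
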
@@ -1783,6 +1794,8 @@ struct __wt_session {
/*!
* Set a timestamp on a transaction.
*
+ * The WT_SESSION.timestamp_transaction method can only be used at snapshot isolation.
+ *
* @snippet ex_all.c transaction timestamp
*
* @requires_transaction
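
A hedged sketch of setting a commit timestamp inside a snapshot-isolation transaction; the hexadecimal timestamp "2a" is an arbitrary example and the cursor is assumed to be open on a table with string key and value formats:

#include <wiredtiger.h>

static int
example_timestamped_commit(WT_SESSION *session, WT_CURSOR *cursor)
{
    int ret;

    if ((ret = session->begin_transaction(session, "isolation=snapshot")) != 0)
        return (ret);
    cursor->set_key(cursor, "some-key");
    cursor->set_value(cursor, "some-value");
    if ((ret = cursor->update(cursor)) != 0)
        goto err;

    /* Timestamps are hexadecimal strings; set one before committing. */
    if ((ret = session->timestamp_transaction(session, "commit_timestamp=2a")) != 0)
        goto err;
    return (session->commit_transaction(session, NULL));

err:
    (void)session->rollback_transaction(session, NULL);
    return (ret);
}
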
@@ -1811,6 +1824,8 @@ struct __wt_session {
/*!
* Query the session's transaction timestamp state.
*
+ * The WT_SESSION.query_timestamp method can only be used at snapshot isolation.
+ *
* @param session the session handle
* @param[out] hex_timestamp a buffer that will be set to the
* hexadecimal encoding of the timestamp being queried. Must be large
@@ -4778,35 +4793,49 @@ struct __wt_storage_source {
* @param storage_source the WT_STORAGE_SOURCE
* @param session the current WiredTiger session
* @param bucket_name the name of the bucket. Use of '/' is implementation dependent.
- * @param prefix a prefix for each file. If used, the prefix will be added to the
- * name of each object created or otherwise accessed in the bucket. Also, only
- * objects with this prefix will be visible, and the prefix will be removed when
- * listed. Prefixes may contain '/' as a separator.
* @param auth_token the authorization identifier.
- * @param config additional configuration, currently must be NULL.
+ * @param config additional configuration. The only allowable value is \c cache_directory,
+ * the name of a directory holding cached objects. Its default is
+ * \c "<home>/cache-<bucket>" with \c <home> replaced by the @ref home, and
+ * \c <bucket> replaced by the bucket_name.
* @param[out] file_system the customized file system returned
*/
int (*ss_customize_file_system)(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session,
- const char *bucket_name, const char *prefix, const char *auth_token, const char *config,
+ const char *bucket_name, const char *auth_token, const char *config,
WT_FILE_SYSTEM **file_system);
/*!
- * Flush any existing objects that match the location and name from
- * local storage to shared object storage. The implementation guarantees
- * that all objects that are in a created state (see WT_STORAGE_SOURCE::ss_open_object)
- * at the beginning of this call have been transferred when this call returns.
+ * Copy a file from the default file system to an object name in shared object storage.
*
* @errors
*
* @param storage_source the WT_STORAGE_SOURCE
* @param session the current WiredTiger session
- * @param file_system if NULL, all objects are considered, otherwise only objects
- * managed by the given file system.
- * @param name the name of the object to flush (or NULL for all)
+ * @param file_system the destination bucket and credentials
+ * @param source the name of the source input file
+ * @param object the name of the destination object
* @param config additional configuration, currently must be NULL
*/
int (*ss_flush)(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session,
- WT_FILE_SYSTEM *file_system, const char *name, const char *config);
+ WT_FILE_SYSTEM *file_system, const char *source, const char *object,
+ const char *config);
+
+ /*!
+ * After a flush, rename the source file from the default file system to be cached in
+ * the shared object storage.
+ *
+ * @errors
+ *
+ * @param storage_source the WT_STORAGE_SOURCE
+ * @param session the current WiredTiger session
+ * @param file_system the destination bucket and credentials
+ * @param source the name of the source input file
+ * @param object the name of the destination object
+ * @param config additional configuration, currently must be NULL
+ */
+ int (*ss_flush_finish)(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session,
+ WT_FILE_SYSTEM *file_system, const char *source, const char *object,
+ const char *config);
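
The revised WT-7499 interface splits flushing into two steps: ss_flush copies a local file into the bucket as an object, and ss_flush_finish then lets the storage source take over the local copy as its cached version. A hedged sketch of the call sequence using only the methods documented above; the bucket, token, and object names are placeholders and the storage source handle is assumed to have been obtained already:

#include <wiredtiger.h>

static int
example_flush_object(WT_STORAGE_SOURCE *ss, WT_SESSION *session)
{
    WT_FILE_SYSTEM *fs;
    int ret;

    /* Bind a file system to a bucket; cache_directory is left at its default. */
    if ((ret = ss->ss_customize_file_system(
           ss, session, "bucket-dir", "example-token", NULL, &fs)) != 0)
        return (ret);

    /* Step 1: copy the local file into the bucket as an object. */
    if ((ret = ss->ss_flush(
           ss, session, fs, "example-0001.wtobj", "example-0001.wtobj", NULL)) != 0)
        goto err;
    /* Step 2: hand the local copy over to the storage source's cache. */
    ret = ss->ss_flush_finish(
      ss, session, fs, "example-0001.wtobj", "example-0001.wtobj", NULL);

err:
    (void)fs->terminate(fs, session);
    return (ret);
}
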
/*!
* A callback performed when the storage source is closed and will no
@@ -5619,445 +5648,451 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
/*! thread-yield: page reconciliation yielded due to child modification */
#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1310
/*! transaction: Number of prepared updates */
-#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1311
+#define WT_STAT_CONN_TXN_PREPARED_UPDATES 1311
+/*! transaction: Number of prepared updates committed */
+#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COMMITTED 1312
+/*! transaction: Number of prepared updates repeated on the same key */
+#define WT_STAT_CONN_TXN_PREPARED_UPDATES_KEY_REPEATED 1313
+/*! transaction: Number of prepared updates rolled back */
+#define WT_STAT_CONN_TXN_PREPARED_UPDATES_ROLLEDBACK 1314
/*! transaction: prepared transactions */
-#define WT_STAT_CONN_TXN_PREPARE 1312
+#define WT_STAT_CONN_TXN_PREPARE 1315
/*! transaction: prepared transactions committed */
-#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1313
+#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1316
/*! transaction: prepared transactions currently active */
-#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1314
+#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1317
/*! transaction: prepared transactions rolled back */
-#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1315
+#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1318
/*! transaction: query timestamp calls */
-#define WT_STAT_CONN_TXN_QUERY_TS 1316
+#define WT_STAT_CONN_TXN_QUERY_TS 1319
/*! transaction: rollback to stable calls */
-#define WT_STAT_CONN_TXN_RTS 1317
+#define WT_STAT_CONN_TXN_RTS 1320
/*! transaction: rollback to stable pages visited */
-#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1318
+#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1321
/*! transaction: rollback to stable tree walk skipping pages */
-#define WT_STAT_CONN_TXN_RTS_TREE_WALK_SKIP_PAGES 1319
+#define WT_STAT_CONN_TXN_RTS_TREE_WALK_SKIP_PAGES 1322
/*! transaction: rollback to stable updates aborted */
-#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1320
+#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1323
/*! transaction: sessions scanned in each walk of concurrent sessions */
-#define WT_STAT_CONN_TXN_SESSIONS_WALKED 1321
+#define WT_STAT_CONN_TXN_SESSIONS_WALKED 1324
/*! transaction: set timestamp calls */
-#define WT_STAT_CONN_TXN_SET_TS 1322
+#define WT_STAT_CONN_TXN_SET_TS 1325
/*! transaction: set timestamp durable calls */
-#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1323
+#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1326
/*! transaction: set timestamp durable updates */
-#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1324
+#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1327
/*! transaction: set timestamp oldest calls */
-#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1325
+#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1328
/*! transaction: set timestamp oldest updates */
-#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1326
+#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1329
/*! transaction: set timestamp stable calls */
-#define WT_STAT_CONN_TXN_SET_TS_STABLE 1327
+#define WT_STAT_CONN_TXN_SET_TS_STABLE 1330
/*! transaction: set timestamp stable updates */
-#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1328
+#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1331
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1329
+#define WT_STAT_CONN_TXN_BEGIN 1332
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1330
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1333
/*!
* transaction: transaction checkpoint currently running for history
* store file
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING_HS 1331
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING_HS 1334
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1332
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1335
/*!
* transaction: transaction checkpoint history store file duration
* (usecs)
*/
-#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1333
+#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1336
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1334
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1337
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1335
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1338
/*!
* transaction: transaction checkpoint most recent duration for gathering
* all handles (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION 1336
+#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION 1339
/*!
* transaction: transaction checkpoint most recent duration for gathering
* applied handles (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_APPLY 1337
+#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_APPLY 1340
/*!
* transaction: transaction checkpoint most recent duration for gathering
* skipped handles (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_SKIP 1338
+#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_SKIP 1341
/*! transaction: transaction checkpoint most recent handles applied */
-#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_APPLIED 1339
+#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_APPLIED 1342
/*! transaction: transaction checkpoint most recent handles skipped */
-#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_SKIPPED 1340
+#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_SKIPPED 1343
/*! transaction: transaction checkpoint most recent handles walked */
-#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_WALKED 1341
+#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_WALKED 1344
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1342
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1345
/*! transaction: transaction checkpoint prepare currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1343
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1346
/*! transaction: transaction checkpoint prepare max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1344
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1347
/*! transaction: transaction checkpoint prepare min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1345
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1348
/*! transaction: transaction checkpoint prepare most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1346
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1349
/*! transaction: transaction checkpoint prepare total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1347
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1350
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1348
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1351
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1349
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1352
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1350
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1353
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1351
+#define WT_STAT_CONN_TXN_CHECKPOINT 1354
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1352
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1355
/*! transaction: transaction failures due to history store */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1353
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1356
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1354
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1357
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1355
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1358
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1356
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1359
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1357
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1360
/*! transaction: transaction range of timestamps currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1358
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1361
/*! transaction: transaction range of timestamps pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1359
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1362
/*!
* transaction: transaction range of timestamps pinned by the oldest
* active read timestamp
*/
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1360
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1363
/*!
* transaction: transaction range of timestamps pinned by the oldest
* timestamp
*/
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1361
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1364
/*! transaction: transaction read timestamp of the oldest active reader */
-#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1362
+#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1365
/*! transaction: transaction rollback to stable currently running */
-#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE_RUNNING 1363
+#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE_RUNNING 1366
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1364
+#define WT_STAT_CONN_TXN_SYNC 1367
/*! transaction: transaction walk of concurrent sessions */
-#define WT_STAT_CONN_TXN_WALK_SESSIONS 1365
+#define WT_STAT_CONN_TXN_WALK_SESSIONS 1368
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1366
+#define WT_STAT_CONN_TXN_COMMIT 1369
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1367
+#define WT_STAT_CONN_TXN_ROLLBACK 1370
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1368
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1371
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1369
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1372
/*! cache: bytes currently in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_INUSE 1370
+#define WT_STAT_CONN_CACHE_BYTES_INUSE 1373
/*! cache: bytes dirty in the cache cumulative */
-#define WT_STAT_CONN_CACHE_BYTES_DIRTY_TOTAL 1371
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY_TOTAL 1374
/*! cache: bytes read into cache */
-#define WT_STAT_CONN_CACHE_BYTES_READ 1372
+#define WT_STAT_CONN_CACHE_BYTES_READ 1375
/*! cache: bytes written from cache */
-#define WT_STAT_CONN_CACHE_BYTES_WRITE 1373
+#define WT_STAT_CONN_CACHE_BYTES_WRITE 1376
/*! cache: checkpoint blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1374
+#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1377
/*!
* cache: checkpoint of history store file blocked non-history store page
* eviction
*/
-#define WT_STAT_CONN_CACHE_EVICTION_BLOCKED_CHECKPOINT_HS 1375
+#define WT_STAT_CONN_CACHE_EVICTION_BLOCKED_CHECKPOINT_HS 1378
/*! cache: eviction walk target pages histogram - 0-9 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1376
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1379
/*! cache: eviction walk target pages histogram - 10-31 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1377
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1380
/*! cache: eviction walk target pages histogram - 128 and higher */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1378
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1381
/*! cache: eviction walk target pages histogram - 32-63 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1379
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1382
/*! cache: eviction walk target pages histogram - 64-128 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1380
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1383
/*!
* cache: eviction walk target pages reduced due to history store cache
* pressure
*/
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_REDUCED 1381
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_REDUCED 1384
/*! cache: eviction walks abandoned */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1382
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1385
/*! cache: eviction walks gave up because they restarted their walk twice */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1383
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1386
/*!
* cache: eviction walks gave up because they saw too many pages and
* found no candidates
*/
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1384
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1387
/*!
* cache: eviction walks gave up because they saw too many pages and
* found too few candidates
*/
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1385
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1388
/*! cache: eviction walks reached end of tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1386
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1389
/*! cache: eviction walks restarted */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_RESTART 1387
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_RESTART 1390
/*! cache: eviction walks started from root of tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1388
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1391
/*! cache: eviction walks started from saved location in tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1389
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1392
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1390
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1393
/*! cache: history store table insert calls */
-#define WT_STAT_CONN_CACHE_HS_INSERT 1391
+#define WT_STAT_CONN_CACHE_HS_INSERT 1394
/*! cache: history store table insert calls that returned restart */
-#define WT_STAT_CONN_CACHE_HS_INSERT_RESTART 1392
+#define WT_STAT_CONN_CACHE_HS_INSERT_RESTART 1395
/*!
* cache: history store table out-of-order resolved updates that lose
* their durable timestamp
*/
-#define WT_STAT_CONN_CACHE_HS_ORDER_LOSE_DURABLE_TIMESTAMP 1393
+#define WT_STAT_CONN_CACHE_HS_ORDER_LOSE_DURABLE_TIMESTAMP 1396
/*!
* cache: history store table out-of-order updates that were fixed up by
* reinserting with the fixed timestamp
*/
-#define WT_STAT_CONN_CACHE_HS_ORDER_REINSERT 1394
+#define WT_STAT_CONN_CACHE_HS_ORDER_REINSERT 1397
/*! cache: history store table reads */
-#define WT_STAT_CONN_CACHE_HS_READ 1395
+#define WT_STAT_CONN_CACHE_HS_READ 1398
/*! cache: history store table reads missed */
-#define WT_STAT_CONN_CACHE_HS_READ_MISS 1396
+#define WT_STAT_CONN_CACHE_HS_READ_MISS 1399
/*! cache: history store table reads requiring squashed modifies */
-#define WT_STAT_CONN_CACHE_HS_READ_SQUASH 1397
+#define WT_STAT_CONN_CACHE_HS_READ_SQUASH 1400
/*!
* cache: history store table truncation by rollback to stable to remove
* an unstable update
*/
-#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS_UNSTABLE 1398
+#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS_UNSTABLE 1401
/*!
* cache: history store table truncation by rollback to stable to remove
* an update
*/
-#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS 1399
+#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS 1402
/*! cache: history store table truncation to remove an update */
-#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE 1400
+#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE 1403
/*!
* cache: history store table truncation to remove range of updates due
* to key being removed from the data page during reconciliation
*/
-#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_ONPAGE_REMOVAL 1401
+#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_ONPAGE_REMOVAL 1404
/*!
* cache: history store table truncation to remove range of updates due
* to out-of-order timestamp update on data page
*/
-#define WT_STAT_CONN_CACHE_HS_ORDER_REMOVE 1402
+#define WT_STAT_CONN_CACHE_HS_ORDER_REMOVE 1405
/*! cache: history store table writes requiring squashed modifies */
-#define WT_STAT_CONN_CACHE_HS_WRITE_SQUASH 1403
+#define WT_STAT_CONN_CACHE_HS_WRITE_SQUASH 1406
/*! cache: in-memory page passed criteria to be split */
-#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1404
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1407
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1405
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1408
/*! cache: internal pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1406
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1409
/*! cache: internal pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1407
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1410
/*! cache: leaf pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1408
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1411
/*! cache: modified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1409
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1412
/*! cache: overflow pages read into cache */
-#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1410
+#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1413
/*! cache: page split during eviction deepened the tree */
-#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1411
+#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1414
/*! cache: page written requiring history store records */
-#define WT_STAT_CONN_CACHE_WRITE_HS 1412
+#define WT_STAT_CONN_CACHE_WRITE_HS 1415
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1413
+#define WT_STAT_CONN_CACHE_READ 1416
/*! cache: pages read into cache after truncate */
-#define WT_STAT_CONN_CACHE_READ_DELETED 1414
+#define WT_STAT_CONN_CACHE_READ_DELETED 1417
/*! cache: pages read into cache after truncate in prepare state */
-#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1415
+#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1418
/*! cache: pages requested from the cache */
-#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1416
+#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1419
/*! cache: pages seen by eviction walk */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1417
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1420
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1418
+#define WT_STAT_CONN_CACHE_WRITE 1421
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1419
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1422
/*! cache: tracked dirty bytes in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1420
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1423
/*! cache: unmodified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1421
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1424
/*! checkpoint-cleanup: pages added for eviction */
-#define WT_STAT_CONN_CC_PAGES_EVICT 1422
+#define WT_STAT_CONN_CC_PAGES_EVICT 1425
/*! checkpoint-cleanup: pages removed */
-#define WT_STAT_CONN_CC_PAGES_REMOVED 1423
+#define WT_STAT_CONN_CC_PAGES_REMOVED 1426
/*! checkpoint-cleanup: pages skipped during tree walk */
-#define WT_STAT_CONN_CC_PAGES_WALK_SKIPPED 1424
+#define WT_STAT_CONN_CC_PAGES_WALK_SKIPPED 1427
/*! checkpoint-cleanup: pages visited */
-#define WT_STAT_CONN_CC_PAGES_VISITED 1425
+#define WT_STAT_CONN_CC_PAGES_VISITED 1428
/*! cursor: Total number of entries skipped by cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT_SKIP_TOTAL 1426
+#define WT_STAT_CONN_CURSOR_NEXT_SKIP_TOTAL 1429
/*! cursor: Total number of entries skipped by cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV_SKIP_TOTAL 1427
+#define WT_STAT_CONN_CURSOR_PREV_SKIP_TOTAL 1430
/*!
* cursor: Total number of entries skipped to position the history store
* cursor
*/
-#define WT_STAT_CONN_CURSOR_SKIP_HS_CUR_POSITION 1428
+#define WT_STAT_CONN_CURSOR_SKIP_HS_CUR_POSITION 1431
/*!
* cursor: Total number of times a search near has exited due to prefix
* config
*/
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR_PREFIX_FAST_PATHS 1429
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR_PREFIX_FAST_PATHS 1432
/*!
* cursor: cursor next calls that skip due to a globally visible history
* store tombstone
*/
-#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1430
+#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1433
/*!
* cursor: cursor next calls that skip greater than or equal to 100
* entries
*/
-#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1431
+#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1434
/*! cursor: cursor next calls that skip less than 100 entries */
-#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1432
+#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1435
/*!
* cursor: cursor prev calls that skip due to a globally visible history
* store tombstone
*/
-#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1433
+#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1436
/*!
* cursor: cursor prev calls that skip greater than or equal to 100
* entries
*/
-#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1434
+#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1437
/*! cursor: cursor prev calls that skip less than 100 entries */
-#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1435
+#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1438
/*! cursor: open cursor count */
-#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1436
+#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1439
/*! reconciliation: approximate byte size of timestamps in pages written */
-#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1437
+#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1440
/*!
* reconciliation: approximate byte size of transaction IDs in pages
* written
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1438
+#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1441
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1439
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1442
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1440
+#define WT_STAT_CONN_REC_PAGES 1443
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1441
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1444
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1442
+#define WT_STAT_CONN_REC_PAGE_DELETE 1445
/*!
* reconciliation: pages written including an aggregated newest start
* durable timestamp
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1443
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1446
/*!
* reconciliation: pages written including an aggregated newest stop
* durable timestamp
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1444
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1447
/*!
* reconciliation: pages written including an aggregated newest stop
* timestamp
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1445
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1448
/*!
* reconciliation: pages written including an aggregated newest stop
* transaction ID
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1446
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1449
/*!
* reconciliation: pages written including an aggregated newest
* transaction ID
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1447
+#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1450
/*!
* reconciliation: pages written including an aggregated oldest start
* timestamp
*/
-#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1448
+#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1451
/*! reconciliation: pages written including an aggregated prepare */
-#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1449
+#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1452
/*!
* reconciliation: pages written including at least one start durable
* timestamp
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1450
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1453
/*!
* reconciliation: pages written including at least one start transaction
* ID
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1451
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1454
/*!
* reconciliation: pages written including at least one stop durable
* timestamp
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1452
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1455
/*! reconciliation: pages written including at least one stop timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1453
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1456
/*!
* reconciliation: pages written including at least one stop transaction
* ID
*/
-#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1454
+#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1457
/*! reconciliation: records written including a start durable timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1455
+#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1458
/*! reconciliation: records written including a start timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1456
+#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1459
/*! reconciliation: records written including a start transaction ID */
-#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1457
+#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1460
/*! reconciliation: records written including a stop durable timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1458
+#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1461
/*! reconciliation: records written including a stop timestamp */
-#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1459
+#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1462
/*! reconciliation: records written including a stop transaction ID */
-#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1460
+#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1463
/*! session: tiered storage local retention time (secs) */
-#define WT_STAT_CONN_TIERED_RETENTION 1461
+#define WT_STAT_CONN_TIERED_RETENTION 1464
/*! session: tiered storage object size */
-#define WT_STAT_CONN_TIERED_OBJECT_SIZE 1462
+#define WT_STAT_CONN_TIERED_OBJECT_SIZE 1465
/*! transaction: race to read prepared update retry */
-#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1463
+#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1466
/*!
* transaction: rollback to stable history store records with stop
* timestamps older than newer records
*/
-#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1464
+#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1467
/*! transaction: rollback to stable inconsistent checkpoint */
-#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1465
+#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1468
/*! transaction: rollback to stable keys removed */
-#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1466
+#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1469
/*! transaction: rollback to stable keys restored */
-#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1467
+#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1470
/*! transaction: rollback to stable restored tombstones from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1468
+#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1471
/*! transaction: rollback to stable restored updates from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1469
+#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1472
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1470
+#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1473
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1471
+#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1474
/*! transaction: transaction checkpoints due to obsolete pages */
-#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1472
+#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1475
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1473
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1476
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index d2093a26ecb..4c3aa2a3204 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -1281,10 +1281,9 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
}
/*
- * Prefer larger cursors. There are two reasons: (1) we expect
- * prefix searches to be a common case (as in our own indices);
- * and (2) we need a way to unambiguously know we have the
- * "closest" result.
+ * Prefer larger cursors. There are two reasons: (1) we expect prefix searches to be a
+ * common case (as in our own indices); and (2) we need a way to unambiguously know we have
+ * the "closest" result.
*/
if (cmp < 0) {
if ((ret = c->next(c)) == WT_NOTFOUND) {
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index 96ed8cb72a4..b5d0ae0b7a1 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -14,7 +14,7 @@ static int __ckpt_load(WT_SESSION_IMPL *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT
static int __ckpt_named(WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *);
static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *, bool);
static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *);
-
+static int __meta_blk_mods_load(WT_SESSION_IMPL *, const char *, WT_CKPT *, WT_CKPT *, bool);
/*
* __ckpt_load_blk_mods --
* Load the block information from the config string.
@@ -472,18 +472,52 @@ __ckpt_valid_blk_mods(WT_SESSION_IMPL *session, WT_CKPT *ckpt, bool rename)
}
/*
- * __wt_meta_blk_mods_load --
- * Load the block mods for a given checkpoint and set up all the information to store.
+ * __ckpt_copy_blk_mods --
+ * Copy the block mods from a source checkpoint to the destination checkpoint.
*/
-int
-__wt_meta_blk_mods_load(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt, bool rename)
+static int
+__ckpt_copy_blk_mods(WT_SESSION_IMPL *session, WT_CKPT *src_ckpt, WT_CKPT *dst_ckpt)
+{
+ uint64_t i;
+
+ for (i = 0; i < WT_BLKINCR_MAX; ++i) {
+ WT_RET(__wt_strdup(
+ session, src_ckpt->backup_blocks[i].id_str, &dst_ckpt->backup_blocks[i].id_str));
+ WT_RET(__wt_buf_set(session, &dst_ckpt->backup_blocks[i].bitstring,
+ src_ckpt->backup_blocks[i].bitstring.data, src_ckpt->backup_blocks[i].bitstring.size));
+ dst_ckpt->backup_blocks[i].nbits = src_ckpt->backup_blocks[i].nbits;
+ dst_ckpt->backup_blocks[i].offset = src_ckpt->backup_blocks[i].offset;
+ dst_ckpt->backup_blocks[i].granularity = src_ckpt->backup_blocks[i].granularity;
+ dst_ckpt->backup_blocks[i].flags = src_ckpt->backup_blocks[i].flags;
+ }
+
+ return (0);
+}
+
+/*
+ * __meta_blk_mods_load --
+ * Load the block mods for a given checkpoint and set up all the information to store. Load from
+ * either the metadata or from a base checkpoint.
+ */
+static int
+__meta_blk_mods_load(
+ WT_SESSION_IMPL *session, const char *config, WT_CKPT *base_ckpt, WT_CKPT *ckpt, bool rename)
{
/*
- * Load most recent checkpoint backup blocks to this checkpoint.
+ * Load most recent checkpoint backup blocks to this checkpoint, either from metadata or from a
+ * previous checkpoint.
*/
- WT_RET(__ckpt_load_blk_mods(session, config, ckpt));
-
- WT_RET(__wt_meta_block_metadata(session, config, ckpt));
+ if (config != NULL) {
+ /* Load from metadata. */
+ WT_RET(__ckpt_load_blk_mods(session, config, ckpt));
+ WT_RET(__wt_meta_block_metadata(session, config, ckpt));
+ } else {
+ /* Load from an existing base checkpoint. */
+ WT_ASSERT(session, base_ckpt != NULL);
+ WT_RET(__ckpt_copy_blk_mods(session, base_ckpt, ckpt));
+ WT_RET(__wt_strndup(session, base_ckpt->block_metadata, strlen(base_ckpt->block_metadata),
+ &ckpt->block_metadata));
+ }
/*
* Set the add-a-checkpoint flag, and if we're doing incremental backups, request a list of the
@@ -503,7 +537,7 @@ __wt_meta_blk_mods_load(WT_SESSION_IMPL *session, const char *config, WT_CKPT *c
*/
int
__wt_meta_ckptlist_get(
- WT_SESSION_IMPL *session, const char *fname, bool update, WT_CKPT **ckptbasep)
+ WT_SESSION_IMPL *session, const char *fname, bool update, WT_CKPT **ckptbasep, size_t *allocated)
{
WT_DECL_RET;
char *config;
@@ -511,34 +545,216 @@ __wt_meta_ckptlist_get(
config = NULL;
WT_ERR(__wt_metadata_search(session, fname, &config));
- WT_ERR(__wt_meta_ckptlist_get_from_config(session, update, ckptbasep, config));
+ WT_ERR(__wt_meta_ckptlist_get_from_config(session, update, ckptbasep, allocated, config));
err:
__wt_free(session, config);
return (ret);
}
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __assert_ckpt_matches --
+ * Assert that given two checkpoints match.
+ */
+static void
+__assert_ckpt_matches(WT_SESSION_IMPL *session, WT_CKPT *ckpt_a, WT_CKPT *ckpt_b)
+{
+ /*
+ * We are not checking checkpoint time, because there could be a minute difference depending
+ * upon when the checkpoint information was generated. This is acceptable.
+ */
+ WT_ASSERT(session,
+ (ckpt_a->name == NULL && ckpt_b->name == NULL) ||
+ (ckpt_a->name != NULL && ckpt_b->name != NULL && strcmp(ckpt_a->name, ckpt_b->name) == 0));
+ WT_ASSERT(session, ckpt_a->order == ckpt_b->order);
+ WT_ASSERT(session, ckpt_a->size == ckpt_b->size);
+ WT_ASSERT(session, ckpt_a->write_gen == ckpt_b->write_gen);
+ WT_ASSERT(session, ckpt_a->run_write_gen == ckpt_b->run_write_gen);
+ WT_ASSERT(session,
+ ckpt_a->ta.newest_start_durable_ts == ckpt_b->ta.newest_start_durable_ts &&
+ ckpt_a->ta.newest_stop_durable_ts == ckpt_b->ta.newest_stop_durable_ts &&
+ ckpt_a->ta.oldest_start_ts == ckpt_b->ta.oldest_start_ts &&
+ ckpt_a->ta.newest_txn == ckpt_b->ta.newest_txn &&
+ ckpt_a->ta.newest_stop_ts == ckpt_b->ta.newest_stop_ts &&
+ ckpt_a->ta.newest_stop_txn == ckpt_b->ta.newest_stop_txn &&
+ ckpt_a->ta.prepare == ckpt_b->ta.prepare);
+ /*
+ * The two WT_CKPT structures are created through different paths, specifically in one path the
+ * WT_CKPT.addr and WT_CKPT.raw fields are taken from a configuration file as strings including
+     * a trailing nul byte. Use the minimum size of the data to ignore that nul byte. Passing nul
+ * pointers to memcmp is undefined, so handle that separately.
+ */
+ WT_ASSERT(session,
+ (ckpt_a->addr.data == NULL && ckpt_b->addr.data == NULL) ||
+ (ckpt_a->addr.data != NULL && ckpt_b->addr.data != NULL &&
+ memcmp(ckpt_a->addr.data, ckpt_b->addr.data,
+ WT_MIN(ckpt_a->addr.size, ckpt_b->addr.size)) == 0));
+ WT_ASSERT(session,
+ (ckpt_a->raw.data == NULL && ckpt_b->raw.data == NULL) ||
+ (ckpt_a->raw.data != NULL && ckpt_b->raw.data != NULL &&
+ memcmp(ckpt_a->raw.data, ckpt_b->raw.data, WT_MIN(ckpt_a->raw.size, ckpt_b->raw.size)) ==
+ 0));
+ WT_ASSERT(session, ckpt_a->bpriv == NULL && ckpt_b->bpriv == NULL);
+ WT_ASSERT(session, ckpt_a->flags == ckpt_b->flags);
+}
+
+/*
+ * __assert_checkpoint_list_matches --
+ * Assert that two given checkpoint lists match.
+ */
+static void
+__assert_checkpoint_list_matches(WT_SESSION_IMPL *session, WT_CKPT *saved_list, WT_CKPT *new_list)
+{
+ WT_CKPT *ckpt_saved, *ckpt_new;
+
+ for (ckpt_saved = saved_list, ckpt_new = new_list;
+ ckpt_saved != NULL && ckpt_saved->order != 0 && ckpt_new != NULL && ckpt_new->order != 0;
+ ckpt_saved++, ckpt_new++)
+ __assert_ckpt_matches(session, ckpt_saved, ckpt_new);
+
+ WT_ASSERT(session,
+ (ckpt_saved == NULL && ckpt_new == NULL) ||
+ ((ckpt_saved != NULL && ckpt_saved->order == 0) &&
+ (ckpt_new != NULL && ckpt_new->order == 0)));
+}
+#endif
+
+/*
+ * __meta_ckptlist_allocate_new_ckpt --
+ * Provided a checkpoint list, allocate a new checkpoint. Either use the last checkpoint in the
+ * list or the file metadata to initialize this new checkpoint.
+ */
+static int
+__meta_ckptlist_allocate_new_ckpt(
+ WT_SESSION_IMPL *session, WT_CKPT **ckptbasep, size_t *allocated, const char *config)
+{
+ WT_CKPT *ckptbase, *ckpt;
+ WT_CONNECTION_IMPL *conn;
+ size_t slot;
+ uint64_t most_recent;
+
+ ckptbase = *ckptbasep;
+ conn = S2C(session);
+ slot = 0;
+
+ if (ckptbase != NULL)
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ slot++;
+
+ /* Either we have a configuration or an existing checkpoint to initialize with. */
+ WT_ASSERT(session, config != NULL || slot != 0);
+
+ /*
+ * If we are using an existing checkpoint, we must have the associated metadata. Otherwise we
+     * will have to take the slow path and read the metadata.
+ */
+ if (config == NULL && ckptbase[slot - 1].block_metadata == NULL)
+ return (WT_NOTFOUND);
+
+ /*
+ * This isn't clean, but there's necessary cooperation between the schema layer (that maintains
+ * the list of checkpoints), the btree layer (that knows when the root page is written, creating
+ * a new checkpoint), and the block manager (which actually creates the checkpoint). All of that
+ * cooperation is handled in the array of checkpoint structures referenced from the WT_BTREE
+ * structure.
+ *
+ * Allocate a slot for a new value, plus a slot to mark the end.
+ */
+ WT_RET(__wt_realloc_def(session, allocated, slot + 2, &ckptbase));
+ *ckptbasep = ckptbase;
+
+ ckpt = &ckptbase[slot];
+ ckpt->order = (slot == 0) ? 1 : ckptbase[slot - 1].order + 1;
+ __wt_seconds(session, &ckpt->sec);
+ /*
+ * Update time value for most recent checkpoint, not letting it move backwards. It is possible
+ * to race here, so use atomic CAS. This code relies on the fact that anyone we race with will
+ * only increase (never decrease) the most recent checkpoint time value.
+ */
+ for (;;) {
+ WT_ORDERED_READ(most_recent, conn->ckpt_most_recent);
+ if (ckpt->sec <= most_recent ||
+ __wt_atomic_cas64(&conn->ckpt_most_recent, most_recent, ckpt->sec))
+ break;
+ }
+
+ /* Either load block mods from the config, or from the previous checkpoint. */
+ WT_RET(
+ __meta_blk_mods_load(session, config, (slot == 0 ? NULL : &ckptbase[slot - 1]), ckpt, false));
+ WT_ASSERT(session, ckpt->block_metadata != NULL);
+
+ return (0);
+}
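
The loop above implements a lock-free "monotonic maximum": the most-recent checkpoint time only ever moves forward, so a failed compare-and-swap simply re-reads a larger value and either gives up or retries. A standalone C11 sketch of the same pattern, using <stdatomic.h> rather than WiredTiger's __wt_atomic_cas64 wrapper:

#include <stdatomic.h>
#include <stdint.h>

/* Advance *most_recentp to "now" unless a larger value is already stored. */
static void
ex_advance_most_recent(_Atomic uint64_t *most_recentp, uint64_t now)
{
    uint64_t cur = atomic_load(most_recentp);

    /* A failed CAS reloads cur; racers only ever increase the value. */
    while (now > cur && !atomic_compare_exchange_weak(most_recentp, &cur, now))
        ;
}
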
+
+/*
+ * __wt_meta_saved_ckptlist_get --
+ *     Append a new checkpoint to the saved checkpoint list and return the list.
+ */
+int
+__wt_meta_saved_ckptlist_get(WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep)
+{
+ WT_BTREE *btree;
+#ifdef HAVE_DIAGNOSTIC
+ WT_CKPT *ckptbase_comp;
+#endif
+ WT_DECL_RET;
+
+ *ckptbasep = NULL;
+
+ btree = S2BT(session);
+
+ /* If we do not have a saved ckptlist, return not found. */
+ if (btree->ckpt == NULL)
+ return (WT_NOTFOUND);
+
+ WT_ERR(
+ __meta_ckptlist_allocate_new_ckpt(session, &btree->ckpt, &btree->ckpt_bytes_allocated, NULL));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Sanity check: Let's compare to a list generated from metadata. There should be no
+ * differences.
+ */
+ if ((ret = __wt_meta_ckptlist_get(session, fname, true, &ckptbase_comp, NULL)) == 0)
+ __assert_checkpoint_list_matches(session, btree->ckpt, ckptbase_comp);
+ __wt_meta_ckptlist_free(session, &ckptbase_comp);
+ WT_ERR(ret);
+#else
+ WT_UNUSED(fname);
+#endif
+
+ /* Return the array to our caller. */
+ *ckptbasep = btree->ckpt;
+
+ if (0) {
+err:
+ __wt_meta_saved_ckptlist_free(session);
+ }
+
+ return (ret);
+}
+
/*
* __wt_meta_ckptlist_get_from_config --
* Provided a metadata config, load all available checkpoint information for a file.
*/
int
-__wt_meta_ckptlist_get_from_config(
- WT_SESSION_IMPL *session, bool update, WT_CKPT **ckptbasep, const char *config)
+__wt_meta_ckptlist_get_from_config(WT_SESSION_IMPL *session, bool update, WT_CKPT **ckptbasep,
+ size_t *allocatedp, const char *config)
{
WT_CKPT *ckpt, *ckptbase;
WT_CONFIG ckptconf;
WT_CONFIG_ITEM k, v;
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
size_t allocated, slot;
- uint64_t most_recent;
*ckptbasep = NULL;
+ if (allocatedp != NULL)
+ *allocatedp = 0;
ckptbase = NULL;
allocated = slot = 0;
- conn = S2C(session);
/* Load any existing checkpoints into the array. */
if ((ret = __wt_config_getones(session, config, "checkpoint", &v)) == 0) {
@@ -560,38 +776,14 @@ __wt_meta_ckptlist_get_from_config(
/* Sort in creation-order. */
__wt_qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order);
- if (update) {
- /*
- * This isn't clean, but there's necessary cooperation between the schema layer (that
- * maintains the list of checkpoints), the btree layer (that knows when the root page is
- * written, creating a new checkpoint), and the block manager (which actually creates the
- * checkpoint). All of that cooperation is handled in the array of checkpoint structures
- * referenced from the WT_BTREE structure.
- *
- * Allocate a slot for a new value, plus a slot to mark the end.
- */
- WT_ERR(__wt_realloc_def(session, &allocated, slot + 2, &ckptbase));
-
- /* The caller may be adding a value, initialize it. */
- ckpt = &ckptbase[slot];
- ckpt->order = (slot == 0) ? 1 : ckptbase[slot - 1].order + 1;
- __wt_seconds(session, &ckpt->sec);
- /*
- * Update time value for most recent checkpoint, not letting it move backwards. It is
- * possible to race here, so use atomic CAS. This code relies on the fact that anyone we
- * race with will only increase (never decrease) the most recent checkpoint time value.
- */
- for (;;) {
- WT_ORDERED_READ(most_recent, conn->ckpt_most_recent);
- if (ckpt->sec <= most_recent ||
- __wt_atomic_cas64(&conn->ckpt_most_recent, most_recent, ckpt->sec))
- break;
- }
- WT_ERR(__wt_meta_blk_mods_load(session, config, ckpt, false));
- }
+ /* The caller might be asking for a new checkpoint to be allocated. */
+ if (update)
+ WT_ERR(__meta_ckptlist_allocate_new_ckpt(session, &ckptbase, &allocated, config));
/* Return the array to our caller. */
*ckptbasep = ckptbase;
+ if (allocatedp != NULL)
+ *allocatedp = allocated;
if (0) {
err:
@@ -932,7 +1124,6 @@ __wt_meta_ckptlist_set(
bool has_lsn;
WT_RET(__wt_scr_alloc(session, 1024, &buf));
-
WT_ERR(__wt_meta_ckptlist_to_meta(session, ckptbase, buf));
/* Add backup block modifications for any added checkpoint. */
WT_CKPT_FOREACH (ckptbase, ckpt)
@@ -963,12 +1154,31 @@ __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT **ckptbasep)
if ((ckptbase = *ckptbasep) == NULL)
return;
- WT_CKPT_FOREACH (ckptbase, ckpt)
+ /*
+ * Sometimes the checkpoint list has a checkpoint which has not been named yet, but carries an
+ * order number.
+ */
+ WT_CKPT_FOREACH_NAME_OR_ORDER (ckptbase, ckpt)
__wt_meta_checkpoint_free(session, ckpt);
__wt_free(session, *ckptbasep);
}
/*
+ * __wt_meta_saved_ckptlist_free --
+ * Discard the saved checkpoint list.
+ */
+void
+__wt_meta_saved_ckptlist_free(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ __wt_meta_ckptlist_free(session, &btree->ckpt);
+ btree->ckpt_bytes_allocated = 0;
+}
+
+/*
* __wt_meta_checkpoint_free --
* Clean up a single checkpoint structure.
*/
@@ -1112,3 +1322,28 @@ __ckpt_version_chk(WT_SESSION_IMPL *session, const char *fname, const char *conf
WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX);
return (0);
}
+
+/*
+ * __wt_reset_blkmod --
+ * Reset the incremental backup information and recreate it to indicate copying the entire
+ * file.
+ */
+int
+__wt_reset_blkmod(WT_SESSION_IMPL *session, const char *orig_config, WT_ITEM *buf)
+{
+ WT_CKPT ckpt;
+ WT_DECL_RET;
+
+ WT_CLEAR(ckpt);
+ /*
+ * Replace the old file entries with new file entries. We need to recreate the incremental
+ * backup information to indicate copying the entire file in its bitmap.
+ */
+ /* First load any existing backup information into a temp checkpoint structure. */
+ WT_RET(__meta_blk_mods_load(session, orig_config, NULL, &ckpt, true));
+
+ /* Take the checkpoint structure and generate the metadata string. */
+ ret = __wt_ckpt_blkmod_to_meta(session, buf, &ckpt);
+ __wt_meta_checkpoint_free(session, &ckpt);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ext.c b/src/third_party/wiredtiger/src/meta/meta_ext.c
index 11e97ad9c45..aa9b6954683 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ext.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ext.c
@@ -88,7 +88,7 @@ int
__wt_metadata_get_ckptlist(WT_SESSION *session, const char *name, WT_CKPT **ckptbasep)
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
- return (__wt_meta_ckptlist_get((WT_SESSION_IMPL *)session, name, false, ckptbasep));
+ return (__wt_meta_ckptlist_get((WT_SESSION_IMPL *)session, name, false, ckptbasep, NULL));
}
/*
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
index 0739175c3a1..88186deff68 100644
--- a/src/third_party/wiredtiger/src/meta/meta_track.c
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -426,8 +426,8 @@ __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
WT_ERR(__wt_strdup(session, key, &trk->a));
/*
- * If there was a previous value, keep it around -- if not, then this
- * "update" is really an insert.
+ * If there was a previous value, keep it around -- if not, then this "update" is really an
+ * insert.
*/
if ((ret = __wt_metadata_search(session, key, &trk->b)) == WT_NOTFOUND) {
trk->op = WT_ST_REMOVE;
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c
index cc9d2c08ca1..1ae6259e5d8 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c
@@ -49,18 +49,15 @@ __posix_sync(WT_SESSION_IMPL *session, int fd, const char *name, const char *fun
#if defined(F_FULLFSYNC)
/*
- * OS X fsync documentation:
- * "Note that while fsync() will flush all data from the host to the
- * drive (i.e. the "permanent storage device"), the drive itself may
- * not physically write the data to the platters for quite some time
- * and it may be written in an out-of-order sequence. For applications
- * that require tighter guarantees about the integrity of their data,
- * Mac OS X provides the F_FULLFSYNC fcntl. The F_FULLFSYNC fcntl asks
- * the drive to flush all buffered data to permanent storage."
+ * OS X fsync documentation: "Note that while fsync() will flush all data from the host to the
+ * drive (i.e. the "permanent storage device"), the drive itself may not physically write the
+ * data to the platters for quite some time and it may be written in an out-of-order sequence.
+ * For applications that require tighter guarantees about the integrity of their data, Mac OS X
+ * provides the F_FULLFSYNC fcntl. The F_FULLFSYNC fcntl asks the drive to flush all buffered
+ * data to permanent storage."
*
- * OS X F_FULLFSYNC fcntl documentation:
- * "This is currently implemented on HFS, MS-DOS (FAT), and Universal
- * Disk Format (UDF) file systems."
+ * OS X F_FULLFSYNC fcntl documentation: "This is currently implemented on HFS, MS-DOS (FAT),
+ * and Universal Disk Format (UDF) file systems."
*
* See comment in __posix_sync(): sync cannot be retried or fail.
*/
@@ -1032,11 +1029,10 @@ __wt_map_file(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
* file while others might be reading or writing it:
*
* Every time someone reads or writes from the mapped region, they increment the "use" count via
- * cas. If someone wants to change the file size, they set the "stop" flag. If a session sees
- * the stop flag, it does not read via mmap, but resorts to the regular syscall. The session
- * that set the stop flag spin-waits until the "use" count goes to zero. Then it changes the
- * file size and remaps the region without synchronization. Once all that is done, it resets the
- * "stop" flag.
+ * cas. If someone wants to change the file size, they set the "stop" flag. If a session sees the
+ * stop flag, it does not read via mmap, but resorts to the regular syscall. The session that set
+ * the stop flag spin-waits until the "use" count goes to zero. Then it changes the file size and
+ * remaps the region without synchronization. Once all that is done, it resets the "stop" flag.
*/
/*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
index db4bb56c976..bc4174e084a 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_row.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -18,29 +18,24 @@ __rec_key_state_update(WT_RECONCILE *r, bool ovfl_key)
WT_ITEM *a;
/*
- * If writing an overflow key onto the page, don't update the "last key"
- * value, and leave the state of prefix compression alone. (If we are
- * currently doing prefix compression, we have a key state which will
- * continue to work, we're just skipping the key just created because
- * it's an overflow key and doesn't participate in prefix compression.
- * If we are not currently doing prefix compression, we can't start, an
- * overflow key doesn't give us any state.)
+ * If writing an overflow key onto the page, don't update the "last key" value, and leave the
+ * state of prefix compression alone. (If we are currently doing prefix compression, we have a
+ * key state which will continue to work, we're just skipping the key just created because it's
+ * an overflow key and doesn't participate in prefix compression. If we are not currently doing
+ * prefix compression, we can't start, an overflow key doesn't give us any state.)
*
- * Additionally, if we wrote an overflow key onto the page, turn off the
- * suffix compression of row-store internal node keys. (When we split,
- * "last key" is the largest key on the previous page, and "cur key" is
- * the first key on the next page, which is being promoted. In some
- * cases we can discard bytes from the "cur key" that are not needed to
- * distinguish between the "last key" and "cur key", compressing the
- * size of keys on internal nodes. If we just built an overflow key,
- * we're not going to update the "last key", making suffix compression
- * impossible for the next key. Alternatively, we could remember where
- * the last key was on the page, detect it's an overflow key, read it
- * from disk and do suffix compression, but that's too much work for an
- * unlikely event.)
+ * Additionally, if we wrote an overflow key onto the page, turn off the suffix compression of
+ * row-store internal node keys. (When we split, "last key" is the largest key on the previous
+ * page, and "cur key" is the first key on the next page, which is being promoted. In some cases
+ * we can discard bytes from the "cur key" that are not needed to distinguish between the "last
+ * key" and "cur key", compressing the size of keys on internal nodes. If we just built an
+ * overflow key, we're not going to update the "last key", making suffix compression impossible
+ * for the next key. Alternatively, we could remember where the last key was on the page, detect
+ * it's an overflow key, read it from disk and do suffix compression, but that's too much work
+ * for an unlikely event.)
*
- * If we're not writing an overflow key on the page, update the last-key
- * value and turn on both prefix and suffix compression.
+ * If we're not writing an overflow key on the page, update the last-key value and turn on both
+ * prefix and suffix compression.
*/
if (ovfl_key)
r->key_sfx_compress = false;
@@ -143,18 +138,27 @@ __rec_cell_build_leaf_key(
break;
/*
- * Prefix compression may cost us CPU and memory when the page is re-loaded, don't do it
- * unless there's reasonable gain.
+ * Prefix compression costs CPU and memory when the page is re-loaded, so skip it unless
+ * there's a reasonable gain. Also, if the previous key was prefix compressed, don't
+ * increase the prefix compression unless we're getting a reasonable gain. (Groups of
+ * keys with the same prefix can be built quickly without rolling forward through
+ * intermediate keys or allocating memory, so they can be built faster in the future; for
+ * that reason, try to create big groups of keys with the same prefix.)
*/
if (pfx < btree->prefix_compression_min)
pfx = 0;
- else
+ else if (r->key_pfx_last != 0 && pfx > r->key_pfx_last &&
+ pfx < r->key_pfx_last + WT_KEY_PREFIX_PREVIOUS_MINIMUM)
+ pfx = r->key_pfx_last;
+
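+                /* Count the prefix bytes we compressed away against the data-source statistics. */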
+ if (pfx != 0)
WT_STAT_DATA_INCRV(session, rec_prefix_compression, pfx);
}
/* Copy the non-prefix bytes into the key buffer. */
WT_RET(__wt_buf_set(session, &key->buf, (uint8_t *)data + pfx, size - pfx));
}
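+    /* Remember the prefix used for this key; the next key limits how much it grows the prefix based on it. */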
+ r->key_pfx_last = pfx;
/* Create an overflow object if the data won't fit. */
if (key->buf.size > btree->maxleafkey) {
@@ -214,6 +218,7 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
*/
if (r->key_pfx_compress_conf) {
r->key_pfx_compress = false;
+ r->key_pfx_last = 0;
if (!ovfl_key)
WT_RET(__rec_cell_build_leaf_key(session, r, NULL, 0, &ovfl_key));
}
@@ -582,6 +587,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
* Turn off prefix and suffix compression until a full key is written into the new page.
*/
r->key_pfx_compress = r->key_sfx_compress = false;
+ r->key_pfx_last = 0;
continue;
}
@@ -633,6 +639,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
*/
if (r->key_pfx_compress_conf) {
r->key_pfx_compress = false;
+ r->key_pfx_last = 0;
if (!ovfl_key)
WT_RET(__rec_cell_build_leaf_key(session, r, NULL, 0, &ovfl_key));
}
@@ -715,10 +722,13 @@ __wt_rec_row_leaf(
WT_TIME_WINDOW tw;
WT_UPDATE *upd;
WT_UPDATE_SELECT upd_select;
+ size_t key_size;
uint64_t slvg_skip;
uint32_t i;
+ uint8_t key_prefix;
bool dictionary, key_onpage_ovfl, ovfl_key;
void *copy;
+ const void *key_data;
btree = S2BT(session);
hs_cursor = NULL;
@@ -764,20 +774,19 @@ __wt_rec_row_leaf(
dictionary = false;
/*
- * Figure out the key: set any cell reference (and unpack it), set any instantiated key
- * reference.
+ * Figure out if the key is an overflow key, and in that case unpack the cell; we'll need it
+ * later.
*/
copy = WT_ROW_KEY_COPY(rip);
- WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, &ikey, &cell, NULL, NULL));
- if (cell == NULL)
- kpack = NULL;
- else {
+ __wt_row_leaf_key_info(page, copy, &ikey, &cell, &key_data, &key_size, &key_prefix);
+ kpack = NULL;
+ if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) {
kpack = &_kpack;
__wt_cell_unpack_kv(session, page->dsk, cell, kpack);
}
/* Unpack the on-page value cell. */
- __wt_row_leaf_value_cell(session, page, rip, NULL, vpack);
+ __wt_row_leaf_value_cell(session, page, rip, vpack);
/* Look for an update. */
WT_ERR(__wt_rec_upd_select(session, r, NULL, rip, vpack, &upd_select));
@@ -890,7 +899,7 @@ __wt_rec_row_leaf(
/*
* Keys are part of the name-space, we can't remove them from the in-memory
* tree; if an overflow key was deleted without being instantiated (for example,
- * cursor-based truncation), do it now.
+ * cursor-based truncation), instantiate it now.
*/
if (ikey == NULL)
WT_ERR(__wt_row_leaf_key(session, page, rip, tmpkey, true));
@@ -972,25 +981,41 @@ __wt_rec_row_leaf(
* previous key (it's a fast path for simple, prefix-compressed keys), or by building
* the key from scratch.
*/
- if (__wt_row_leaf_key_info(page, copy, NULL, &cell, &tmpkey->data, &tmpkey->size))
+ __wt_row_leaf_key_info(page, copy, NULL, &cell, &key_data, &key_size, &key_prefix);
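+            /* If the key's data wasn't returned, unpack the cell to get it; overflow keys take the slow path. */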
+ if (key_data == NULL) {
+ if (__wt_cell_type(cell) != WT_CELL_KEY)
+ goto slow;
+ kpack = &_kpack;
+ __wt_cell_unpack_kv(session, page->dsk, cell, kpack);
+ key_data = kpack->data;
+ key_size = kpack->size;
+ key_prefix = kpack->prefix;
+ }
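+            /* If the key isn't prefix compressed, reference the on-page key directly and avoid a copy. */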
+ if (key_prefix == 0) {
+ tmpkey->data = key_data;
+ tmpkey->size = key_size;
goto build;
+ }
- kpack = &_kpack;
- __wt_cell_unpack_kv(session, page->dsk, cell, kpack);
- if (kpack->type == WT_CELL_KEY && tmpkey->size >= kpack->prefix && tmpkey->size != 0) {
- /*
- * Grow the buffer as necessary, ensuring data data has been copied into local
- * buffer space, then append the suffix to the prefix already in the buffer.
- *
- * Don't grow the buffer unnecessarily or copy data we don't need, truncate the
- * item's data length to the prefix bytes.
- */
- tmpkey->size = kpack->prefix;
- WT_ERR(__wt_buf_grow(session, tmpkey, tmpkey->size + kpack->size));
- memcpy((uint8_t *)tmpkey->mem + tmpkey->size, kpack->data, kpack->size);
- tmpkey->size += kpack->size;
- } else
+ if (tmpkey->size == 0 || tmpkey->size < key_prefix)
+ goto slow;
+
+ /*
+ * Grow the buffer as necessary and ensure the data has been copied into local buffer
+ * space, then append the suffix to the prefix already in the buffer. Don't grow the
+ * buffer unnecessarily or copy data we don't need: truncate the item's current data
+ * length to the prefix bytes before growing the buffer.
+ */
+ tmpkey->size = key_prefix;
+ WT_ERR(__wt_buf_grow(session, tmpkey, key_prefix + key_size));
+ memcpy((uint8_t *)tmpkey->mem + key_prefix, key_data, key_size);
+ tmpkey->size = key_prefix + key_size;
+
+ if (0) {
+slow:
WT_ERR(__wt_row_leaf_key_copy(session, page, rip, tmpkey));
+ }
+
build:
WT_ERR(__rec_cell_build_leaf_key(session, r, tmpkey->data, tmpkey->size, &ovfl_key));
}
@@ -1012,6 +1037,7 @@ build:
*/
if (r->key_pfx_compress_conf) {
r->key_pfx_compress = false;
+ r->key_pfx_last = 0;
if (!ovfl_key)
WT_ERR(__rec_cell_build_leaf_key(session, r, NULL, 0, &ovfl_key));
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 97743d7c3ee..24fe6bb252d 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -1217,15 +1217,13 @@ __wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len, bool
done:
/*
- * Overflow values can be larger than the maximum page size but still be
- * "on-page". If the next key/value pair is larger than space available
- * after a split has happened (in other words, larger than the maximum
- * page size), create a page sized to hold that one key/value pair. This
- * generally splits the page into key/value pairs before a large object,
- * the object, and key/value pairs after the object. It's possible other
- * key/value pairs will also be aggregated onto the bigger page before
- * or after, if the page happens to hold them, but it won't necessarily
- * happen that way.
+ * Overflow values can be larger than the maximum page size but still be "on-page". If the next
+ * key/value pair is larger than space available after a split has happened (in other words,
+ * larger than the maximum page size), create a page sized to hold that one key/value pair. This
+ * generally splits the page into key/value pairs before a large object, the object, and
+ * key/value pairs after the object. It's possible other key/value pairs will also be aggregated
+ * onto the bigger page before or after, if the page happens to hold them, but it won't
+ * necessarily happen that way.
*/
if (r->space_avail < next_len)
WT_RET(__rec_split_grow(session, r, next_len));
diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c
index bc76391db43..c814a32c935 100644
--- a/src/third_party/wiredtiger/src/schema/schema_create.c
+++ b/src/third_party/wiredtiger/src/schema/schema_create.c
@@ -62,7 +62,8 @@ __check_imported_ts(WT_SESSION_IMPL *session, const char *uri, const char *confi
ckptbase = NULL;
txn_global = &S2C(session)->txn_global;
- WT_ERR_NOTFOUND_OK(__wt_meta_ckptlist_get_from_config(session, false, &ckptbase, config), true);
+ WT_ERR_NOTFOUND_OK(
+ __wt_meta_ckptlist_get_from_config(session, false, &ckptbase, NULL, config), true);
if (ret == WT_NOTFOUND)
WT_ERR_MSG(session, EINVAL,
"%s: import could not find any checkpoint information in supplied metadata", uri);
@@ -128,10 +129,11 @@ __create_file(
WT_SESSION_IMPL *session, const char *uri, bool exclusive, bool import, const char *config)
{
WT_CONFIG_ITEM cval;
+ WT_DECL_ITEM(buf);
WT_DECL_ITEM(val);
WT_DECL_RET;
const char *filename, **p,
- *filecfg[] = {WT_CONFIG_BASE(session, file_meta), config, NULL, NULL, NULL};
+ *filecfg[] = {WT_CONFIG_BASE(session, file_meta), config, NULL, NULL, NULL, NULL};
char *fileconf, *filemeta;
uint32_t allocsize;
bool exists, import_repair, is_metadata;
@@ -140,6 +142,7 @@ __create_file(
import_repair = false;
is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;
+ WT_ERR(__wt_scr_alloc(session, 1024, &buf));
filename = uri;
WT_PREFIX_SKIP_REQUIRED(session, filename, "file:");
@@ -200,6 +203,12 @@ __create_file(
}
WT_ERR(__wt_strndup(session, cval.str, cval.len, &filemeta));
filecfg[2] = filemeta;
+ /*
+ * If file metadata is provided, reconstruct the incremental backup
+ * information, as the imported file was not part of any backup.
+ */
+ WT_ERR(__wt_reset_blkmod(session, config, buf));
+ filecfg[3] = buf->mem;
} else {
/*
* If there is no file metadata provided, the user should be specifying a "repair".
@@ -217,14 +226,15 @@ __create_file(
WT_ERR(__create_file_block_manager(session, uri, filename, allocsize));
/*
- * If creating an ordinary file, update the file ID and current version numbers and strip the
- * incremental backup information and checkpoint LSN from the extracted metadata.
+ * If creating an ordinary file, update the file ID and current version numbers and strip
+ * checkpoint LSN from the extracted metadata. If importing an existing file, incremental backup
+ * information is reconstructed inside import repair or when grabbing file metadata.
*/
if (!is_metadata) {
if (!import_repair) {
WT_ERR(__wt_scr_alloc(session, 0, &val));
WT_ERR(__wt_buf_fmt(session, val,
- "id=%" PRIu32 ",version=(major=%d,minor=%d),checkpoint_backup_info=,checkpoint_lsn=",
+ "id=%" PRIu32 ",version=(major=%d,minor=%d),checkpoint_lsn=",
++S2C(session)->next_file_id, WT_BTREE_MAJOR_VERSION_MAX,
WT_BTREE_MINOR_VERSION_MAX));
for (p = filecfg; *p != NULL; ++p)
@@ -260,6 +270,7 @@ __create_file(
WT_ERR(__wt_session_release_dhandle(session));
err:
+ __wt_scr_free(session, &buf);
__wt_scr_free(session, &val);
__wt_free(session, fileconf);
__wt_free(session, filemeta);
diff --git a/src/third_party/wiredtiger/src/schema/schema_plan.c b/src/third_party/wiredtiger/src/schema/schema_plan.c
index bd2ae7fa45c..8bc330abac7 100644
--- a/src/third_party/wiredtiger/src/schema/schema_plan.c
+++ b/src/third_party/wiredtiger/src/schema/schema_plan.c
@@ -214,10 +214,9 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns,
WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_SKIP));
}
/*
- * Now copy the value in / out. In the common case,
- * where each value is used in one column, we do a
- * "next" operation. If the value is used again, we do
- * a "reuse" operation to avoid making another copy.
+ * Now copy the value in / out. In the common case, where each value is used in one
+ * column, we do a "next" operation. If the value is used again, we do a "reuse"
+ * operation to avoid making another copy.
*/
if (!have_it) {
WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_NEXT));
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
index 7145661420f..b0ce8fc3ed7 100644
--- a/src/third_party/wiredtiger/src/schema/schema_rename.c
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -9,30 +9,6 @@
#include "wt_internal.h"
/*
- * __rename_blkmod --
- * Reset the incremental backup information for a rename.
- */
-static int
-__rename_blkmod(WT_SESSION_IMPL *session, const char *oldvalue, WT_ITEM *buf)
-{
- WT_CKPT ckpt;
- WT_DECL_RET;
-
- WT_CLEAR(ckpt);
- /*
- * Replace the old file entries with new file entries. We need to recreate the incremental
- * backup information to indicate copying the entire file in its bitmap.
- */
- /* First load any existing backup information into a temp checkpoint structure. */
- WT_RET(__wt_meta_blk_mods_load(session, oldvalue, &ckpt, true));
-
- /* Take the checkpoint structure and generate the metadata string. */
- ret = __wt_ckpt_blkmod_to_meta(session, buf, &ckpt);
- __wt_meta_checkpoint_free(session, &ckpt);
- return (ret);
-}
-
-/*
* __rename_file --
* WT_SESSION::rename for a file.
*/
@@ -89,7 +65,7 @@ __rename_file(WT_SESSION_IMPL *session, const char *uri, const char *newuri)
WT_ERR(__wt_metadata_remove(session, uri));
filecfg[0] = oldvalue;
if (F_ISSET(S2C(session), WT_CONN_INCR_BACKUP)) {
- WT_ERR(__rename_blkmod(session, oldvalue, buf));
+ WT_ERR(__wt_reset_blkmod(session, oldvalue, buf));
filecfg[1] = buf->mem;
} else
filecfg[1] = NULL;
@@ -135,8 +111,7 @@ __rename_tree(WT_SESSION_IMPL *session, WT_TABLE *table, const char *newuri, con
/*
* Create the new data source URI and update the schema value.
*
- * 'name' has the format (colgroup|index):<tablename>[:<suffix>];
- * we need the suffix.
+ * 'name' has the format (colgroup|index):<tablename>[:<suffix>]; we need the suffix.
*/
is_colgroup = WT_PREFIX_MATCH(name, "colgroup:");
if (!is_colgroup && !WT_PREFIX_MATCH(name, "index:"))
diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c
index eb342f32a5a..215d8713507 100644
--- a/src/third_party/wiredtiger/src/schema/schema_util.c
+++ b/src/third_party/wiredtiger/src/schema/schema_util.c
@@ -141,9 +141,9 @@ __wt_str_name_check(WT_SESSION_IMPL *session, const char *str)
bool skip;
/*
- * Check if name is somewhere in the WiredTiger name space: it would be
- * "bad" if the application truncated the metadata file. Skip any
- * leading URI prefix if needed, check and then skip over a table name.
+ * Check if name is somewhere in the WiredTiger name space: it would be "bad" if the application
+ * truncated the metadata file. Skip any leading URI prefix if needed, check and then skip over
+ * a table name.
*/
name = str;
skip = false;
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 338df444cd2..a6796946c99 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -1397,7 +1397,7 @@ __session_truncate(
WT_ERR(__wt_session_range_truncate(session, uri, start, stop));
err:
- TXN_API_END_RETRY(session, ret, 0);
+ TXN_API_END(session, ret, false);
if (ret != 0)
WT_STAT_CONN_INCR(session, session_table_truncate_fail);
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
index 057e032a1f1..b4064eb1f69 100644
--- a/src/third_party/wiredtiger/src/session/session_compact.c
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -213,7 +213,8 @@ __compact_checkpoint(WT_SESSION_IMPL *session)
/* Checkpoints take a lot of time, check if we've run out. */
WT_RET(__wt_session_compact_check_timeout(session));
- if ((ret = __wt_txn_checkpoint(session, checkpoint_cfg, false)) == 0)
+ ret = __wt_txn_checkpoint(session, checkpoint_cfg, false);
+ if (ret == 0)
return (0);
WT_RET_BUSY_OK(ret);
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c
index d7099586fc3..f4c20e02746 100644
--- a/src/third_party/wiredtiger/src/support/err.c
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -472,8 +472,8 @@ __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_ATTRIBUTE((
handler = session->event_handler;
ret = handler->handle_message(handler, wt_session, buf->data);
+err:
__wt_scr_free(session, &buf);
-
return (ret);
}
@@ -501,8 +501,8 @@ __wt_ext_msg_printf(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char
handler = session->event_handler;
ret = handler->handle_message(handler, wt_session, buf->data);
+err:
__wt_scr_free(session, &buf);
-
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c
index c4644bb0aba..33c80a3a0ed 100644
--- a/src/third_party/wiredtiger/src/support/scratch.c
+++ b/src/third_party/wiredtiger/src/support/scratch.c
@@ -21,15 +21,23 @@ __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
/*
* Maintain the existing data: there are 3 cases:
- * No existing data: allocate the required memory, and initialize
- * the data to reference it.
- * Existing data local to the buffer: set the data to the same
- * offset in the re-allocated memory.
- * Existing data not-local to the buffer: copy the data into the
- * buffer and set the data to reference it.
+ *
+ * 1. No existing data: allocate the required memory, and initialize the data to reference it.
+ * 2. Existing data local to the buffer: set the data to the same offset in the re-allocated
+ *    memory. The offset in this case is likely a read of an overflow item; the data pointer
+ *    is offset in the buffer in order to skip over the leading data block page header. For
+ *    the same reason, take any offset in the buffer into account when calculating the size
+ *    to allocate: it saves our callers complex calculations to decide if the buffer is large
+ * enough in the case of buffers with offset data pointers.
+ * 3. Existing data not-local to the buffer: copy the data into the buffer and set the data to
+ * reference it.
+ *
+ * Take the offset of the data pointer in the buffer into account when calculating the size
+ * needed; overflow items use the data pointer to skip the leading data block page header.
*/
if (WT_DATA_IN_ITEM(buf)) {
offset = WT_PTRDIFF(buf->data, buf->mem);
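+        /* Include the data offset in the allocation size so callers don't have to account for it. */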
+ size += offset;
copy_data = false;
} else {
offset = 0;
@@ -51,8 +59,14 @@ __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
buf->data = buf->mem;
buf->size = 0;
} else {
- if (copy_data)
+ if (copy_data) {
+ /*
+ * It's easy to corrupt memory by passing in the wrong final buffer size, and that is much
+ * harder to debug than this assert.
+ */
+ WT_ASSERT(session, buf->size <= buf->memsize);
memcpy(buf->mem, buf->data, buf->size);
+ }
buf->data = (uint8_t *)buf->mem + offset;
}
@@ -67,9 +81,12 @@ int
__wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
WT_GCC_FUNC_ATTRIBUTE((format(printf, 3, 4))) WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
+ WT_DECL_RET;
+
WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, false);
- return (0);
+err:
+ return (ret);
}
/*
@@ -80,6 +97,8 @@ int
__wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
WT_GCC_FUNC_ATTRIBUTE((format(printf, 3, 4))) WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
+ WT_DECL_RET;
+
/*
* If we're appending data to an existing buffer, any data field should point into the allocated
* memory. (It wouldn't be insane to copy any previously existing data at this point, if data
@@ -89,7 +108,8 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, true);
- return (0);
+err:
+ return (ret);
}
/*
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index d622d44589e..fb9f7870a54 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -1280,6 +1280,9 @@ static const char *const __stats_connection_desc[] = {
"thread-yield: page delete rollback time sleeping for state change (usecs)",
"thread-yield: page reconciliation yielded due to child modification",
"transaction: Number of prepared updates",
+ "transaction: Number of prepared updates committed",
+ "transaction: Number of prepared updates repeated on the same key",
+ "transaction: Number of prepared updates rolled back",
"transaction: prepared transactions",
"transaction: prepared transactions committed",
"transaction: prepared transactions currently active",
@@ -1798,7 +1801,10 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->page_sleep = 0;
stats->page_del_rollback_blocked = 0;
stats->child_modify_blocked_page = 0;
- stats->txn_prepared_updates_count = 0;
+ stats->txn_prepared_updates = 0;
+ stats->txn_prepared_updates_committed = 0;
+ stats->txn_prepared_updates_key_repeated = 0;
+ stats->txn_prepared_updates_rolledback = 0;
stats->txn_prepare = 0;
stats->txn_prepare_commit = 0;
stats->txn_prepare_active = 0;
@@ -2309,7 +2315,10 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->page_sleep += WT_STAT_READ(from, page_sleep);
to->page_del_rollback_blocked += WT_STAT_READ(from, page_del_rollback_blocked);
to->child_modify_blocked_page += WT_STAT_READ(from, child_modify_blocked_page);
- to->txn_prepared_updates_count += WT_STAT_READ(from, txn_prepared_updates_count);
+ to->txn_prepared_updates += WT_STAT_READ(from, txn_prepared_updates);
+ to->txn_prepared_updates_committed += WT_STAT_READ(from, txn_prepared_updates_committed);
+ to->txn_prepared_updates_key_repeated += WT_STAT_READ(from, txn_prepared_updates_key_repeated);
+ to->txn_prepared_updates_rolledback += WT_STAT_READ(from, txn_prepared_updates_rolledback);
to->txn_prepare += WT_STAT_READ(from, txn_prepare);
to->txn_prepare_commit += WT_STAT_READ(from, txn_prepare_commit);
to->txn_prepare_active += WT_STAT_READ(from, txn_prepare_active);
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_config.c b/src/third_party/wiredtiger/src/tiered/tiered_config.c
index 23eb24131cc..6971ec4b7b5 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_config.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_config.c
@@ -115,8 +115,8 @@ __wt_tiered_bucket_config(
WT_ERR(__wt_strndup(session, prefix.str, prefix.len, &new->bucket_prefix));
storage = nstorage->storage_source;
- WT_ERR(storage->ss_customize_file_system(storage, &session->iface, new->bucket,
- new->bucket_prefix, new->auth_token, NULL, &new->file_system));
+ WT_ERR(storage->ss_customize_file_system(
+ storage, &session->iface, new->bucket, new->auth_token, NULL, &new->file_system));
new->storage_source = storage;
/* If we're creating a new bucket storage, parse the other settings into it. */
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
index db45db54f9e..c913f9b33ca 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
@@ -696,10 +696,9 @@ __curtiered_search_near(WT_CURSOR *cursor, int *exactp)
}
/*
- * Prefer larger cursors. There are two reasons: (1) we expect
- * prefix searches to be a common case (as in our own indices);
- * and (2) we need a way to unambiguously know we have the
- * "closest" result.
+ * Prefer larger cursors. There are two reasons: (1) we expect prefix searches to be a
+ * common case (as in our own indices); and (2) we need a way to unambiguously know we have
+ * the "closest" result.
*/
if (cmp < 0) {
if ((ret = c->next(c)) == WT_NOTFOUND) {
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_handle.c b/src/third_party/wiredtiger/src/tiered/tiered_handle.c
index a1bb6bc37a6..363a9c97140 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_handle.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_handle.c
@@ -146,15 +146,14 @@ __tiered_create_object(WT_SESSION_IMPL *session, WT_TIERED *tiered)
WT_ERR(ret);
}
/*
- * Create the name and metadata of the new shared object of the current local object.
- * The data structure keeps this id so that we don't have to parse and manipulate strings.
- * I.e. if we have file:example-000000002.wt we want object:example-000000002.wtobj.
+ * Create the name and metadata of the new shared object of the current local object. The data
+ * structure keeps this id so that we don't have to parse and manipulate strings.
*/
WT_ERR(
__wt_tiered_name(session, &tiered->iface, tiered->current_id, WT_TIERED_NAME_OBJECT, &name));
cfg[0] = WT_CONFIG_BASE(session, object_meta);
cfg[1] = tiered->obj_config;
- cfg[2] = "readonly=true";
+ cfg[2] = "flush=0,readonly=true";
WT_ASSERT(session, tiered->obj_config != NULL);
WT_ERR(__wt_config_merge(session, cfg, NULL, (const char **)&config));
__wt_verbose(
@@ -162,6 +161,13 @@ __tiered_create_object(WT_SESSION_IMPL *session, WT_TIERED *tiered)
/* Create the new shared object. */
WT_ERR(__wt_schema_create(session, name, config));
+#if 0
+ /*
+ * If we get here we have successfully created the object. It is ready to be fully flushed to
+ * the cloud. Push a work element to let the internal thread do that here.
+ */
+#endif
+
err:
__wt_free(session, config);
__wt_free(session, name);
@@ -307,6 +313,10 @@ static int
__tiered_switch(WT_SESSION_IMPL *session, const char *config)
{
WT_DECL_RET;
+#if 0
+ WT_FILE_SYSTEM *fs;
+ WT_STORAGE_SOURCE *storage_source;
+#endif
WT_TIERED *tiered;
bool need_object, need_tree, tracking;
@@ -357,10 +367,31 @@ __tiered_switch(WT_SESSION_IMPL *session, const char *config)
/* We always need to create a local object. */
WT_ERR(__tiered_create_local(session, tiered));
+#if 0
/*
- * Note that removal of overlapping local objects is not in the purview of this function. Some
- * other mechanism will remove outdated tiers. Here's where it could be done though.
+ * We expect this part to be done asynchronously in its own thread. First flush the contents of
+ * the data file to the new cloud object.
*/
+ storage_source = tiered->bstorage->storage_source;
+ fs = tiered->bucket_storage->file_system;
+ WT_ASSERT(session, storage_source != NULL);
+
+    /* This call may take a while, and may fail due to a network timeout. */
+ WT_ERR(storage_source->ss_flush(storage_source, &session->iface,
+ fs, old_filename, object_name, NULL));
+
+ /*
+ * The metadata for the old local object will be initialized with "flush=0". When the flush call
+ * completes, it can be marked as "flush=1". When that's done, we can finish the flush. The
+ * flush finish call moves the file from the home directory to the extension's cache. Then the
+ * extension will own it.
+ *
+ * We may need a way to restart flushes that were not completed (after a crash) or that failed
+ * (due to a previous network outage).
+ */
+ WT_ERR(storage_source->ss_flush_finish(storage_source, &session->iface,
+ fs, old_filename, object_name, NULL));
+#endif
/* Update the tiered: metadata to new object number and tiered array. */
WT_ERR(__tiered_update_metadata(session, tiered, config));
@@ -424,7 +455,7 @@ __wt_tiered_name(
if (LF_ISSET(WT_TIERED_NAME_PREFIX))
WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-", name));
else
- WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%010" PRIu64 ".wt", name, id));
+ WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%010" PRIu64 ".wtobj", name, id));
} else if (LF_ISSET(WT_TIERED_NAME_OBJECT)) {
if (LF_ISSET(WT_TIERED_NAME_PREFIX))
WT_ERR(__wt_buf_fmt(session, tmp, "object:%s-", name));
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 9aa2f085386..dabce04d12f 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -1054,6 +1054,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
#endif
size_t not_used;
uint32_t hs_btree_id;
+ char ts_string[3][WT_TS_INT_STRING_SIZE];
bool upd_appended;
hs_cursor = NULL;
@@ -1063,9 +1064,18 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
WT_RET(__txn_search_prepared_op(session, op, cursorp, &upd));
- __wt_verbose(session, WT_VERB_TRANSACTION,
- "resolving prepared op for txnid: %" PRIu64 " that %s", txn->id,
- commit ? "committed" : "roll backed");
+ if (commit)
+ __wt_verbose(session, WT_VERB_TRANSACTION,
+ "commit resolving prepared transaction with txnid: %" PRIu64
+ "and timestamp: %s to commit and durable timestamps: %s,%s",
+ txn->id, __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[0]),
+ __wt_timestamp_to_string(txn->commit_timestamp, ts_string[1]),
+ __wt_timestamp_to_string(txn->durable_timestamp, ts_string[2]));
+ else
+ __wt_verbose(session, WT_VERB_TRANSACTION,
+ "rollback resolving prepared transaction with txnid: %" PRIu64 "and timestamp:%s",
+ txn->id, __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[0]));
+
/*
* Aborted updates can exist in the update chain of our transaction. Generally this will occur
* due to a reserved update. As such we should skip over these updates.
@@ -1082,7 +1092,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
* we rolled back all associated updates in the previous iteration of this function.
*/
if (upd == NULL || upd->prepare_state != WT_PREPARE_INPROGRESS)
- return (0);
+ goto prepare_verify;
WT_ERR(__txn_commit_timestamps_usage_check(session, op, upd));
/*
@@ -1092,9 +1102,14 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
* updates first, the history search logic may race with other sessions modifying the same key
* and checkpoint moving the new updates to the history store.
*
- * For prepared delete, we don't need to fix the history store.
+ * For a prepared delete commit, we don't need to fix the history store. For a rollback,
+ * however, if the update is also from the same prepared transaction, restore the update from
+ * the history store or remove the key.
*/
- if (F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS) && upd->type != WT_UPDATE_TOMBSTONE) {
+ if (F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS) &&
+ (upd->type != WT_UPDATE_TOMBSTONE ||
+ (!commit && upd->next != NULL && upd->durable_ts == upd->next->durable_ts &&
+ upd->txnid == upd->next->txnid && upd->start_ts == upd->next->start_ts))) {
cbt = (WT_CURSOR_BTREE *)(*cursorp);
hs_btree_id = S2BT(session)->id;
/* Open a history store table cursor. */
@@ -1140,6 +1155,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
if (!commit) {
upd->txnid = WT_TXN_ABORTED;
+ WT_STAT_CONN_INCR(session, txn_prepared_updates_rolledback);
continue;
}
@@ -1172,6 +1188,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
* Resolve the prepared update to be committed update.
*/
__txn_resolve_prepared_update(session, upd);
+ WT_STAT_CONN_INCR(session, txn_prepared_updates_committed);
}
/*
@@ -1183,14 +1200,16 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
if (fix_upd != NULL)
WT_ERR(__txn_fixup_prepared_update(session, hs_cursor, fix_upd, commit));
+prepare_verify:
#ifdef HAVE_DIAGNOSTIC
for (; head_upd != NULL; head_upd = head_upd->next) {
/*
- * Assert if we still have an update from the current transaction that hasn't been aborted.
- * Only perform this check if aborting the prepared transaction.
+ * Assert if we still have an update from the current transaction that hasn't been resolved
+ * or aborted.
*/
- WT_ASSERT(
- session, commit || head_upd->txnid == WT_TXN_ABORTED || head_upd->txnid != txn->id);
+ WT_ASSERT(session,
+ head_upd->txnid == WT_TXN_ABORTED || head_upd->prepare_state == WT_PREPARE_RESOLVED ||
+ head_upd->txnid != txn->id);
if (head_upd->txnid == WT_TXN_ABORTED)
continue;
@@ -1407,12 +1426,18 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
wt_timestamp_t candidate_durable_timestamp, prev_durable_timestamp;
uint32_t fileid;
u_int i;
+#ifdef HAVE_DIAGNOSTIC
+ u_int prepare_count;
+#endif
bool locked, prepare, readonly, update_durable_ts;
txn = session->txn;
conn = S2C(session);
cursor = NULL;
txn_global = &conn->txn_global;
+#ifdef HAVE_DIAGNOSTIC
+ prepare_count = 0;
+#endif
locked = false;
prepare = F_ISSET(txn, WT_TXN_PREPARE);
readonly = txn->mod_count == 0;
@@ -1470,15 +1495,13 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));
/*
- * If the user chose the default setting, check whether sync is enabled
- * for this transaction (either inherited or via begin_transaction).
- * If sync is disabled, clear the field to avoid the log write being
- * flushed.
+ * If the user chose the default setting, check whether sync is enabled for this transaction
+ * (either inherited or via begin_transaction). If sync is disabled, clear the field to avoid
+ * the log write being flushed.
*
- * Otherwise check for specific settings. We don't need to check for
- * "on" because that is the default inherited from the connection. If
- * the user set anything in begin_transaction, we only override with an
- * explicit setting.
+ * Otherwise check for specific settings. We don't need to check for "on" because that is the
+ * default inherited from the connection. If the user set anything in begin_transaction, we only
+ * override with an explicit setting.
*/
if (cval.len == 0) {
if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && !F_ISSET(txn, WT_TXN_SYNC_SET))
@@ -1572,6 +1595,9 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
*/
if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED))
WT_ERR(__txn_resolve_prepared_op(session, op, true, &cursor));
+#ifdef HAVE_DIAGNOSTIC
+ ++prepare_count;
+#endif
}
break;
case WT_TXN_OP_REF_DELETE:
@@ -1589,6 +1615,10 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_CLEAR(cursor->key);
}
txn->mod_count = 0;
+#ifdef HAVE_DIAGNOSTIC
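+    /* The number of prepared operations seen at commit must match the count taken at prepare time. */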
+ WT_ASSERT(session, txn->prepare_count == prepare_count);
+ txn->prepare_count = 0;
+#endif
if (cursor != NULL) {
WT_ERR(cursor->close(cursor));
@@ -1680,11 +1710,10 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
WT_TXN_OP *op;
WT_UPDATE *upd, *tmp;
- int64_t txn_prepared_updates_count;
- u_int i;
+ u_int i, prepared_updates, prepared_updates_key_repeated;
txn = session->txn;
- txn_prepared_updates_count = 0;
+ prepared_updates = prepared_updates_key_repeated = 0;
WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR));
@@ -1749,7 +1778,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
break;
}
- ++txn_prepared_updates_count;
+ ++prepared_updates;
/* Set prepare timestamp. */
upd->start_ts = txn->prepare_timestamp;
@@ -1776,6 +1805,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
if (tmp->type != WT_UPDATE_RESERVE &&
!F_ISSET(tmp, WT_UPDATE_RESTORED_FAST_TRUNCATE)) {
F_SET(op, WT_TXN_OP_KEY_REPEATED);
+ ++prepared_updates_key_repeated;
break;
}
break;
@@ -1788,7 +1818,11 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
break;
}
}
- WT_STAT_CONN_INCR(session, txn_prepared_updates_count);
+ WT_STAT_CONN_INCRV(session, txn_prepared_updates, prepared_updates);
+ WT_STAT_CONN_INCRV(session, txn_prepared_updates_key_repeated, prepared_updates_key_repeated);
+#ifdef HAVE_DIAGNOSTIC
+ txn->prepare_count = prepared_updates;
+#endif
/* Set transaction state to prepare. */
F_SET(session->txn, WT_TXN_PREPARE);
@@ -1819,10 +1853,16 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN_OP *op;
WT_UPDATE *upd;
u_int i;
+#ifdef HAVE_DIAGNOSTIC
+ u_int prepare_count;
+#endif
bool prepare, readonly;
cursor = NULL;
txn = session->txn;
+#ifdef HAVE_DIAGNOSTIC
+ prepare_count = 0;
+#endif
prepare = F_ISSET(txn, WT_TXN_PREPARE);
readonly = txn->mod_count == 0;
@@ -1874,6 +1914,9 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
*/
if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED))
WT_TRET(__txn_resolve_prepared_op(session, op, false, &cursor));
+#ifdef HAVE_DIAGNOSTIC
+ ++prepare_count;
+#endif
}
break;
case WT_TXN_OP_REF_DELETE:
@@ -1895,6 +1938,10 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
WT_CLEAR(cursor->key);
}
txn->mod_count = 0;
+#ifdef HAVE_DIAGNOSTIC
+ WT_ASSERT(session, txn->prepare_count == prepare_count);
+ txn->prepare_count = 0;
+#endif
if (cursor != NULL) {
WT_TRET(cursor->close(cursor));
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index dba739792a2..61720a8adaa 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -301,7 +301,6 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg));
WT_RET(ret);
if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) {
- WT_ASSERT(session, btree->ckpt == NULL);
__checkpoint_update_generation(session);
return (0);
}
@@ -1367,14 +1366,16 @@ __checkpoint_lock_dirty_tree(
WT_CONFIG_ITEM cval, k, v;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ size_t ckpt_bytes_allocated;
uint64_t now;
char *name_alloc;
const char *name;
- bool is_drop, is_wt_ckpt, skip_ckpt;
+ bool is_drop, is_wt_ckpt, seen_ckpt_add, skip_ckpt;
btree = S2BT(session);
ckpt = ckptbase = NULL;
dhandle = session->dhandle;
+ ckpt_bytes_allocated = 0;
name_alloc = NULL;
/*
@@ -1439,12 +1440,30 @@ __checkpoint_lock_dirty_tree(
}
}
+ /*
+     * Discard the saved list of checkpoints, forcing the slow path, if this is not a WiredTiger
+     * checkpoint or if checkpoint drops are involved. Also, if we do not have a checkpoint array
+     * size, the regular checkpoint process did not create the array; it is safer to discard the
+     * array in such a case.
+ */
+ if (!is_wt_ckpt || is_drop || btree->ckpt_bytes_allocated == 0)
+ __wt_meta_saved_ckptlist_free(session);
+
/* If we have to process this btree for any reason, reset the timer and obsolete pages flag. */
WT_BTREE_CLEAN_CKPT(session, btree, 0);
F_CLR(btree, WT_BTREE_OBSOLETE_PAGES);
- /* Get the list of checkpoints for this file. */
- WT_ERR(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase));
+ /*
+     * Get the list of checkpoints for this file: we try to cache the ckptlist between checkpoints,
+     * but there might not be one, as there are operations that can invalidate a ckptlist. Use a
+     * cached ckptlist if there is one, otherwise go through the slow path of re-generating the
+     * ckptlist by reading the metadata. Also, we avoid using a cached checkpoint list for the
+     * metadata file.
+ */
+ if (WT_IS_METADATA(dhandle) ||
+ __wt_meta_saved_ckptlist_get(session, dhandle->name, &ckptbase) != 0)
+ WT_ERR(
+ __wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase, &ckpt_bytes_allocated));
/* We may be dropping specific checkpoints, check the configuration. */
if (cfg != NULL) {
@@ -1488,19 +1507,36 @@ __checkpoint_lock_dirty_tree(
WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session,
ret = __checkpoint_lock_dirty_tree_int(session, is_checkpoint, force, btree, ckpt, ckptbase));
WT_ERR(ret);
- if (F_ISSET(btree, WT_BTREE_SKIP_CKPT))
- goto err;
- WT_ASSERT(session, btree->ckpt == NULL && !F_ISSET(btree, WT_BTREE_SKIP_CKPT));
- btree->ckpt = ckptbase;
+ /*
+ * If we decided to skip checkpointing, we need to remove the new checkpoint entry we might have
+ * appended to the list.
+ */
+ seen_ckpt_add = false;
+ if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) {
+ WT_CKPT_FOREACH_NAME_OR_ORDER (ckptbase, ckpt) {
+ /* Checkpoint(s) to be added are always at the end of the list. */
+ WT_ASSERT(session, !seen_ckpt_add || F_ISSET(ckpt, WT_CKPT_ADD));
+ if (F_ISSET(ckpt, WT_CKPT_ADD)) {
+ seen_ckpt_add = true;
+ __wt_meta_checkpoint_free(session, ckpt);
+ }
+ }
+ }
- if (0) {
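+    /* Save the checkpoint list on the btree only if it contains at least one named checkpoint. */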
+ if (ckptbase->name != NULL) {
+ btree->ckpt = ckptbase;
+ btree->ckpt_bytes_allocated = ckpt_bytes_allocated;
+ } else {
+ /* It is possible that we do not have any checkpoint in the list. */
err:
__wt_meta_ckptlist_free(session, &ckptbase);
+ __wt_meta_saved_ckptlist_free(session);
}
skip:
__wt_free(session, name_alloc);
+ WT_UNUSED(seen_ckpt_add);
return (ret);
}
@@ -1642,6 +1678,64 @@ __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, WT_TIME_AGGREGAT
}
/*
+ * __checkpoint_save_ckptlist --
+ *     Post-process the ckptlist to carry forward a cached list for the next checkpoint.
+ */
+static int
+__checkpoint_save_ckptlist(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt, *ckpt_itr;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ ckpt_itr = ckptbase;
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ /* Remove any deleted checkpoints, by shifting the array. */
+ if (F_ISSET(ckpt, WT_CKPT_DELETE)) {
+ __wt_meta_checkpoint_free(session, ckpt);
+ continue;
+ }
+
+ /* Clean up block manager information. */
+ __wt_free(session, ckpt->bpriv);
+ ckpt->bpriv = NULL;
+
+ /* Update the internal checkpoints to their full names, with the generation count suffix. */
+ if (strcmp(ckpt->name, WT_CHECKPOINT) == 0) {
+ WT_ERR(__wt_buf_fmt(session, tmp, "%s.%" PRId64, WT_CHECKPOINT, ckpt->order));
+ __wt_free(session, ckpt->name);
+ WT_ERR(__wt_strdup(session, tmp->mem, &ckpt->name));
+ }
+
+ /* Reset the flags, and mark a checkpoint fake if there is no address. */
+ ckpt->flags = 0;
+ if (ckpt->addr.size == 0) {
+ WT_ASSERT(session, ckpt->addr.data == NULL);
+ F_SET(ckpt, WT_CKPT_FAKE);
+ }
+
+ /* Shift the valid checkpoints, if there are deleted checkpoints in the list. */
+ if (ckpt_itr != ckpt) {
+ *ckpt_itr = *ckpt;
+ WT_CLEAR(*ckpt);
+ }
+ ckpt_itr++;
+ }
+
+ /*
+ * Confirm that the last checkpoint has a metadata entry that we can use to base a new
+ * checkpoint on.
+ */
+ ckpt_itr--;
+ WT_ASSERT(session, ckpt_itr->block_metadata != NULL);
+
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
+}
+
+/*
* __checkpoint_tree --
* Checkpoint a single tree. Assumes all necessary locks have been acquired by the caller.
*/
@@ -1785,7 +1879,15 @@ err:
conn->modified = true;
}
- __wt_meta_ckptlist_free(session, &btree->ckpt);
+    /* For a successful checkpoint, post-process the ckptlist to keep a cached copy around. */
+ if (ret != 0 || WT_IS_METADATA(session->dhandle) || F_ISSET(conn, WT_CONN_CLOSING))
+ __wt_meta_saved_ckptlist_free(session);
+ else {
+ ret = __checkpoint_save_ckptlist(session, btree->ckpt);
+ /* Discard the saved checkpoint list if processing the list did not work. */
+ if (ret != 0)
+ __wt_meta_saved_ckptlist_free(session);
+ }
return (ret);
}
@@ -1880,13 +1982,20 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
!WT_IS_METADATA(session->dhandle) ||
FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_METADATA));
+ /* Discard the cached checkpoint list when checkpointing a single file by itself. */
+ __wt_meta_saved_ckptlist_free(session);
+
WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
force = cval.val != 0;
WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg));
- WT_RET(ret);
- if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT))
- return (0);
- return (__checkpoint_tree(session, true, cfg));
+ if (ret != 0 || F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT))
+ goto done;
+ ret = __checkpoint_tree(session, true, cfg);
+
+done:
+ /* Do not store the cached checkpoint list when checkpointing a single file alone. */
+ __wt_meta_saved_ckptlist_free(session);
+ return (ret);
}
/*
@@ -1948,6 +2057,9 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
(!F_ISSET(S2C(session), WT_CONN_FILE_CLOSE_SYNC) && !metadata)))
return (__wt_set_return(session, EBUSY));
+ /* Discard the cached checkpoint list when checkpointing a single file by itself. */
+ __wt_meta_saved_ckptlist_free(session);
+
/*
* Make sure there isn't a potential race between backup copying the metadata and a checkpoint
* changing the metadata. Backup holds both the checkpoint and schema locks. Checkpoint should
@@ -1975,6 +2087,9 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT))
ret = __checkpoint_tree(session, false, NULL);
+ /* Do not store the cached checkpoint list when checkpointing a single file alone. */
+ __wt_meta_saved_ckptlist_free(session);
+
if (need_tracking)
WT_TRET(__wt_meta_track_off(session, true, ret != 0));
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index de2ff910072..5b5482524f9 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -300,7 +300,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page
/* Get the full update value from the data store. */
unpack = &_unpack;
- __wt_row_leaf_value_cell(session, page, rip, NULL, unpack);
+ __wt_row_leaf_value_cell(session, page, rip, unpack);
} else {
/* Unpack a column cell. */
WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
@@ -588,7 +588,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_COL *cip, W
WT_ASSERT(session, (rip != NULL && cip == NULL) || (rip == NULL && cip != NULL));
if (rip != NULL)
- __wt_row_leaf_value_cell(session, page, rip, NULL, vpack);
+ __wt_row_leaf_value_cell(session, page, rip, vpack);
else {
kcell = WT_COL_PTR(page, cip);
__wt_cell_unpack_kv(session, page->dsk, kcell, vpack);
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c
index 36eea69648a..ba0b019453a 100644
--- a/src/third_party/wiredtiger/src/utilities/util_load.c
+++ b/src/third_party/wiredtiger/src/utilities/util_load.c
@@ -440,12 +440,12 @@ config_update(WT_SESSION *session, char **list)
return (util_err(session, errno, NULL));
/*
- * For each match, rewrite the dump configuration as described by any
- * command-line configuration arguments.
+ * For each match, rewrite the dump configuration as described by any command-line configuration
+ * arguments.
*
- * New filenames will be chosen as part of the table load, remove all
- * "filename=", "source=" and other configurations that foil loading
- * from the values; we call an unpublished API to do the work.
+ * New filenames will be chosen as part of the table load, remove all "filename=", "source=" and
+ * other configurations that foil loading from the values; we call an unpublished API to do the
+ * work.
*/
for (listp = list; *listp != NULL; listp += 2) {
cnt = 0;
diff --git a/src/third_party/wiredtiger/src/utilities/util_load_json.c b/src/third_party/wiredtiger/src/utilities/util_load_json.c
index 38801769e63..d1a92944a43 100644
--- a/src/third_party/wiredtiger/src/utilities/util_load_json.c
+++ b/src/third_party/wiredtiger/src/utilities/util_load_json.c
@@ -382,11 +382,9 @@ json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags)
}
/*
- * Allow any ordering of 'config', 'colgroups',
- * 'indices' before 'data', which must appear last.
- * The non-'data' items build up a list of entries
- * that created in our session before the data is
- * inserted.
+ * Allow any ordering of 'config', 'colgroups', 'indices' before 'data', which must appear
+ * last. The non-'data' items build up a list of entries that are created in our session
+ * before the data is inserted.
*/
for (;;) {
if (json_skip(session, ins, json_markers) != 0)
diff --git a/src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt b/src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt
index b46fed225eb..f5d5c916bdc 100644
--- a/src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt
+++ b/src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt
@@ -1,48 +1,4 @@
-# Same parameters as config_poc_test_default
-duration_seconds=10,
-cache_size_mb=1000,
-enable_logging=true,
-runtime_monitor=
-(
- op_count=3,
- interval=s,
- stat_cache_size=
- (
- enabled=true,
- limit=100
- )
-),
-timestamp_manager=
-(
- enabled=true,
- oldest_lag=1,
- stable_lag=1
-),
-workload_generator=
-(
- collection_count=2,
- key_count=5,
- key_size=1,
- ops_per_transaction=
- (
- min=5,
- max=50
- ),
- read_threads=1,
- update_threads=1,
- value_size=10,
- update_config=
- (
- op_count=1,
- interval=s
- ),
- insert_config=
- (
- op_count=1,
- interval=s
- )
-),
-workload_tracking=
-(
- enabled=true
-)
+# Example configuration file. As defaults are added automatically, only non-default
+# configurations need to be defined.
+duration_seconds=5,
+cache_size_mb=250
diff --git a/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt b/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt
index c677142234d..6caaa4d4456 100644
--- a/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt
+++ b/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt
@@ -3,23 +3,14 @@
# Used as a basic test for the framework.
duration_seconds=10,
cache_size_mb=1000,
-enable_logging=true,
runtime_monitor=
(
- op_count=3,
- interval=s,
stat_cache_size=
(
enabled=true,
limit=100
)
),
-timestamp_manager=
-(
- enabled=true,
- oldest_lag=1,
- stable_lag=1
-),
workload_generator=
(
collection_count=2,
@@ -31,20 +22,5 @@ workload_generator=
max=50
),
read_threads=1,
- update_threads=1,
- update_config=
- (
- op_count=1,
- interval=s
- ),
- insert_config=
- (
- op_count=1,
- interval=s
- ),
- value_size=10
+ update_threads=1
),
-workload_tracking=
-(
- enabled=true
-)
diff --git a/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt b/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt
index 6067bea3983..6eeda0ab7c0 100644
--- a/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt
+++ b/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt
@@ -6,19 +6,12 @@ cache_size_mb=5000,
enable_logging=true,
runtime_monitor=
(
- rate_per_second=3,
stat_cache_size=
(
enabled=true,
limit=100
)
),
-timestamp_manager=
-(
- enabled=true,
- oldest_lag=1,
- stable_lag=1
-),
workload_generator=
(
collection_count=2,
@@ -32,8 +25,4 @@ workload_generator=
read_threads=1,
update_threads=1,
value_size=2000
-),
-workload_tracking=
-(
- enabled=true
-)
+), \ No newline at end of file
diff --git a/src/third_party/wiredtiger/test/cppsuite/create_test.sh b/src/third_party/wiredtiger/test/cppsuite/create_test.sh
new file mode 100755
index 00000000000..91f506e39e9
--- /dev/null
+++ b/src/third_party/wiredtiger/test/cppsuite/create_test.sh
@@ -0,0 +1,81 @@
+#! /bin/bash
+
+# First argument needs to be the name of the test.
+if [ $# -eq 0 ]
+ then
+ echo "Please give a name to your test i.e ./s_new_test my_test"
+ exit 128
+fi
+
+# Check the test name
+if [[ $1 =~ ^[0-9a-zA-Z_-]+$ ]];then
+ echo "Generating test: $1..."
+else
+ echo "Invalid test name. Only alphanumeric characters are allowed. \"_\" and \"-\" can be used too."
+ exit 128
+fi
+
+# Check if the test already exists.
+FILE=tests/$1.cxx
+if test -f "$FILE"; then
+ echo "$FILE cannot be created as it already exists."
+ exit 1
+fi
+
+# Check if the default configuration associated with the test already exists.
+CONFIG=configs/config_$1_default.txt
+if test -f "$CONFIG"; then
+ echo "$CONFIG cannot be created as it already exists."
+ exit 1
+fi
+
+# Copy the default template.
+cp tests/example_test.cxx $FILE
+echo "Created $FILE."
+cp configs/config_example_test_default.txt $CONFIG
+echo "Created $CONFIG."
+
+# Replace example_test with the new test name.
+SEARCH="example_test"
+sed -i "s/$SEARCH/$1/" $FILE
+echo "Updated $FILE."
+
+# Replace the first line of the configuration file.
+REPLACE="# Configuration for $1."
+sed -i "1s/.*/$REPLACE/" $CONFIG
+echo "Updated $CONFIG."
+
+# Include the new test in run.cxx
+FILE=tests/run.cxx
+SEARCH="#include \"example_test.cxx\""
+VALUE="#include \"$1.cxx\""
+sed -i "/$SEARCH/a $VALUE" $FILE
+
+# Add the new test to the run_test() method
+SEARCH="example_test(config, test_name).run()"
+LINE_1="\ else if (test_name == \"$1\")\n"
+LINE_2="\ $1(config, test_name).run();"
+sed -i "/$SEARCH/a $LINE_1$LINE_2" $FILE
+
+# Add the new test to all existing tests.
+SEARCH="all_tests = {\"example_test\""
+REPLACE="$SEARCH, \"$1\""
+sed -i "s/$SEARCH/$REPLACE/" $FILE
+echo "Updated $FILE."
+
+# Add the new test to test_data.py
+FILE=../../dist/test_data.py
+SEARCH="example_test"
+LINE_1="\ '$1' : Method(test_config),"
+sed -i "/$SEARCH/a $LINE_1" $FILE
+echo "Updated $FILE."
+
+# Trigger s_all
+echo "Running s_all.."
+cd ../../dist
+./s_all
+
+# Last changes to be done manually
+echo "Follow the next steps to execute your new test:"
+echo "1. Start editing $1.cxx"
+echo "2. Compile your changes, go to build_posix/test/cppsuite and run your test with ./run -t $1"
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h
index 91b165d8f29..341932a0236 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h
@@ -88,7 +88,7 @@ class component {
}
bool
- is_enabled() const
+ enabled() const
{
return _enabled;
}
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h
index c2b9494487f..7eaa96214cb 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h
@@ -29,7 +29,9 @@
#ifndef CONFIGURATION_H
#define CONFIGURATION_H
+#include <algorithm>
#include <string>
+#include <stack>
extern "C" {
#include "test_util.h"
@@ -40,14 +42,22 @@ enum class types { BOOL, INT, STRING, STRUCT };
namespace test_harness {
class configuration {
public:
- configuration(const std::string &test_config_name, const std::string &config) : _config(config)
+ configuration(const std::string &test_config_name, const std::string &config)
{
+ const auto *config_entry = __wt_test_config_match(test_config_name.c_str());
+ if (config_entry == nullptr)
+ testutil_die(EINVAL, "failed to match test config name");
+ std::string default_config = std::string(config_entry->base);
+ /* Merge in the default configuration. */
+ _config = merge_default_config(default_config, config);
+ debug_print("Running with enriched config: " + _config, DEBUG_INFO);
+
int ret = wiredtiger_test_config_validate(
- nullptr, nullptr, test_config_name.c_str(), config.c_str());
+ nullptr, nullptr, test_config_name.c_str(), _config.c_str());
if (ret != 0)
testutil_die(EINVAL, "failed to validate given config, ensure test config exists");
ret =
- wiredtiger_config_parser_open(nullptr, config.c_str(), config.size(), &_config_parser);
+ wiredtiger_config_parser_open(nullptr, _config.c_str(), _config.size(), &_config_parser);
if (ret != 0)
testutil_die(EINVAL, "failed to create configuration parser for provided config");
}
@@ -173,6 +183,104 @@ class configuration {
return func(value);
}
+ /*
+ * Merge together two configuration strings, the user one and the default one.
+ */
+ static std::string
+ merge_default_config(const std::string &default_config, const std::string &user_config)
+ {
+ std::string merged_config;
+ auto split_default_config = split_config(default_config);
+ auto split_user_config = split_config(user_config);
+ auto user_it = split_user_config.begin();
+ for (auto default_it = split_default_config.begin();
+ default_it != split_default_config.end(); ++default_it) {
+ if (user_it->first != default_it->first)
+ /* The default does not exist in the user configuration, add it. */
+ merged_config += default_it->first + "=" + default_it->second;
+ else {
+ /* If we have a sub config merge it in. */
+ if (user_it->second[0] == '(')
+ merged_config += default_it->first + "=(" +
+ merge_default_config(default_it->second, user_it->second) + ')';
+ else
+ /* Add the user configuration as it exists. */
+ merged_config += user_it->first + "=" + user_it->second;
+ ++user_it;
+ }
+ /* Add a comma after every item we add except the last one. */
+ if (split_default_config.end() - default_it != 1)
+ merged_config += ",";
+ }
+ return (merged_config);
+ }
+
+ /*
+ * Split a config string into keys and values, taking care to not split incorrectly when we have
+ * a sub config.
+ */
+ static std::vector<std::pair<std::string, std::string>>
+ split_config(const std::string &config)
+ {
+ std::string cut_config = config;
+ std::vector<std::pair<std::string, std::string>> split_config;
+ std::string key = "", value = "";
+ bool in_subconfig = false;
+ bool expect_value = false;
+ std::stack<char> subconfig_parens;
+
+ /* All configuration strings must be at least 2 characters. */
+ testutil_assert(config.size() > 1);
+
+ /* Remove prefix and trailing "()". */
+ if (config[0] == '(')
+ cut_config = config.substr(1, config.size() - 2);
+
+ size_t start = 0, len = 0;
+ for (size_t i = 0; i < cut_config.size(); ++i) {
+ if (cut_config[i] == '(') {
+ subconfig_parens.push(cut_config[i]);
+ in_subconfig = true;
+ }
+ if (cut_config[i] == ')') {
+ subconfig_parens.pop();
+ in_subconfig = !subconfig_parens.empty();
+ }
+ if (cut_config[i] == '=' && !in_subconfig) {
+ expect_value = true;
+ key = cut_config.substr(start, len);
+ start += len + 1;
+ len = 0;
+ continue;
+ }
+ if (cut_config[i] == ',' && !in_subconfig) {
+ expect_value = false;
+ if (start + len >= cut_config.size())
+ break;
+ value = cut_config.substr(start, len);
+ start += len + 1;
+ len = 0;
+ split_config.push_back(std::make_pair(key, value));
+ continue;
+ }
+ ++len;
+ }
+ if (expect_value) {
+ value = cut_config.substr(start, len);
+ split_config.push_back(std::make_pair(key, value));
+ }
+
+ /* We have to sort the config here otherwise we will match incorrectly while merging. */
+ std::sort(split_config.begin(), split_config.end(), comparator);
+ return (split_config);
+ }
+
+ static bool
+ comparator(std::pair<std::string, std::string> a, std::pair<std::string, std::string> b)
+ {
+ return (a.first < b.first);
+ }
+
std::string _config;
WT_CONFIG_PARSER *_config_parser = nullptr;
};
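For reference, the merge implemented above keeps a default key/value only when the user configuration does not supply that key, recursing into parenthesised sub-configs. Below is a minimal standalone sketch of the same idea for flat (non-nested) config strings, independent of the test harness; the helper names are illustrative, not framework APIs.

    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    /* Parse "k1=v1,k2=v2" into a sorted map; nested sub-configs are not handled in this sketch. */
    static std::map<std::string, std::string>
    parse_flat(const std::string &config)
    {
        std::map<std::string, std::string> out;
        std::stringstream ss(config);
        std::string item;
        while (std::getline(ss, item, ',')) {
            size_t eq = item.find('=');
            if (eq != std::string::npos)
                out[item.substr(0, eq)] = item.substr(eq + 1);
        }
        return (out);
    }

    int
    main()
    {
        /* Defaults first, then user overrides: the user's value wins when a key appears in both. */
        std::map<std::string, std::string> merged = parse_flat("cache_size_mb=1000,duration_seconds=10");
        for (const auto &kv : parse_flat("duration_seconds=5"))
            merged[kv.first] = kv.second;
        for (const auto &kv : merged)
            std::cout << kv.first << "=" << kv.second << std::endl;
        /* Prints cache_size_mb=1000 and duration_seconds=5. */
        return (0);
    }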
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h
index b7897eb39f1..bc559a03104 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h
@@ -64,7 +64,7 @@ class statistic {
virtual ~statistic() {}
bool
- is_enabled() const
+ enabled() const
{
return _enabled;
}
@@ -154,7 +154,7 @@ class runtime_monitor : public component {
do_work()
{
for (const auto &it : _stats) {
- if (it->is_enabled())
+ if (it->enabled())
it->check(_cursor);
}
}
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/test.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/test.h
index a753e131f0f..f5049df074d 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/test.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/test.h
@@ -102,7 +102,7 @@ class test : public database_operation {
run()
{
int64_t cache_size_mb, duration_seconds;
- bool enable_logging, is_success = true;
+ bool enable_logging;
/* Build the database creation config string. */
std::string db_create_config = CONNECTION_CREATE;
@@ -124,6 +124,10 @@ class test : public database_operation {
for (const auto &it : _components)
_thread_manager->add_thread(&component::run, it);
+ /* The initial population phase needs to be finished before starting the actual test. */
+ while (_workload_generator->enabled() && !_workload_generator->db_populated())
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
/* The test will run for the duration as defined in the config. */
duration_seconds = _config->get_int(DURATION_SECONDS);
testutil_assert(duration_seconds >= 0);
@@ -135,13 +139,13 @@ class test : public database_operation {
_thread_manager->join();
/* Validation stage. */
- if (_workload_tracking->is_enabled()) {
+ if (_workload_tracking->enabled()) {
workload_validation wv;
- is_success = wv.validate(_workload_tracking->get_operation_table_name(),
+ wv.validate(_workload_tracking->get_operation_table_name(),
_workload_tracking->get_schema_table_name(), _workload_generator->get_database());
}
- debug_print(is_success ? "SUCCESS" : "FAILED", DEBUG_INFO);
+ debug_print("SUCCESS", DEBUG_INFO);
connection_manager::instance().close();
}
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h
index da09a08c9d8..a2694f6987c 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h
@@ -34,7 +34,6 @@
/* Define helpful functions related to debugging. */
namespace test_harness {
-#define DEBUG_ABORT -1
#define DEBUG_ERROR 0
#define DEBUG_INFO 1
#define DEBUG_TRACE 2
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h
index 07e7c007ea7..c2a7ed9f6a6 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h
@@ -42,9 +42,6 @@ struct key_t {
bool exists;
};
-/* Iterator type used to iterate over keys that are stored in the data model. */
-typedef std::map<test_harness::key_value_t, test_harness::key_t>::const_iterator keys_iterator_t;
-
/* Representation of a value. */
struct value_t {
key_value_t value;
@@ -59,18 +56,6 @@ struct collection_t {
/* Representation of the collections in memory. */
class database {
public:
- const keys_iterator_t
- get_collection_keys_begin(const std::string &collection_name) const
- {
- return (collections.at(collection_name).keys.begin());
- }
-
- const keys_iterator_t
- get_collection_keys_end(const std::string &collection_name) const
- {
- return (collections.at(collection_name).keys.end());
- }
-
const std::vector<std::string>
get_collection_names() const
{
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h
index 7a88ed9b662..fc97c1e381c 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h
@@ -46,7 +46,7 @@ class database_operation {
* - Open a cursor on each collection.
* - Insert m key/value pairs in each collection. Values are random strings whose size is
* defined by the configuration.
- * - Store in memory the created collections and the generated keys that were inserted.
+ * - Store in memory the created collections.
*/
virtual void
populate(database &database, timestamp_manager *timestamp_manager, configuration *config,
@@ -58,7 +58,7 @@ class database_operation {
int64_t collection_count, key_count, key_cpt, key_size, value_size;
std::string collection_name, cfg, home;
key_value_t generated_key, generated_value;
- bool ts_enabled = timestamp_manager->is_enabled();
+ bool ts_enabled = timestamp_manager->enabled();
cursor = nullptr;
collection_count = key_count = key_size = value_size = 0;
@@ -67,13 +67,14 @@ class database_operation {
session = connection_manager::instance().create_session();
/* Create n collections as per the configuration and store each collection name. */
collection_count = config->get_int(COLLECTION_COUNT);
- for (int i = 0; i < collection_count; ++i) {
+ for (size_t i = 0; i < collection_count; ++i) {
collection_name = "table:collection" + std::to_string(i);
database.collections[collection_name] = {};
testutil_check(
session->create(session, collection_name.c_str(), DEFAULT_FRAMEWORK_SCHEMA));
ts = timestamp_manager->get_next_ts();
- testutil_check(tracking->save(tracking_operation::CREATE, collection_name, 0, "", ts));
+ tracking->save_schema_operation(
+ tracking_operation::CREATE_COLLECTION, collection_name, ts);
}
debug_print(std::to_string(collection_count) + " collections created", DEBUG_TRACE);
@@ -89,11 +90,13 @@ class database_operation {
for (const auto &it_collections : database.collections) {
collection_name = it_collections.first;
key_cpt = 0;
- /* WiredTiger lets you open a cursor on a collection using the same pointer. When a
- * session is closed, WiredTiger APIs close the cursors too. */
+ /*
+ * WiredTiger lets you open a cursor on a collection using the same pointer. When a
+ * session is closed, WiredTiger APIs close the cursors too.
+ */
testutil_check(
session->open_cursor(session, collection_name.c_str(), NULL, NULL, &cursor));
- for (size_t j = 0; j < key_count; ++j) {
+ for (size_t i = 0; i < key_count; ++i) {
/* Generation of a unique key. */
generated_key = number_to_string(key_size, key_cpt);
++key_cpt;
@@ -106,16 +109,12 @@ class database_operation {
ts = timestamp_manager->get_next_ts();
if (ts_enabled)
testutil_check(session->begin_transaction(session, ""));
- testutil_check(insert(cursor, tracking, collection_name, generated_key.c_str(),
- generated_value.c_str(), ts));
+ insert(cursor, tracking, collection_name, generated_key.c_str(),
+ generated_value.c_str(), ts);
if (ts_enabled) {
cfg = std::string(COMMIT_TS) + "=" + timestamp_manager->decimal_to_hex(ts);
testutil_check(session->commit_transaction(session, cfg.c_str()));
}
- /* Update the memory representation of the collections. */
- database.collections[collection_name].keys[generated_key].exists = true;
- /* Values are not stored here. */
- database.collections[collection_name].values = nullptr;
}
}
debug_print("Populate stage done", DEBUG_TRACE);
@@ -150,13 +149,15 @@ class database_operation {
virtual void
update_operation(thread_context &context, WT_SESSION *session)
{
+ WT_DECL_RET;
WT_CURSOR *cursor;
wt_timestamp_t ts;
std::vector<WT_CURSOR *> cursors;
- std::string collection_name;
std::vector<std::string> collection_names = context.get_collection_names();
- key_value_t generated_value, key;
- int64_t cpt, value_size = context.get_value_size();
+ key_value_t key, generated_value;
+ const char *key_tmp;
+ int64_t value_size = context.get_value_size();
+ uint64_t i;
testutil_assert(session != nullptr);
/* Get a cursor for each collection in collection_names. */
@@ -165,17 +166,31 @@ class database_operation {
cursors.push_back(cursor);
}
- cpt = 0;
- /* Walk each cursor. */
- for (const auto &it : cursors) {
- collection_name = collection_names[cpt];
- /* Walk each key. */
- for (keys_iterator_t iter_key = context.get_collection_keys_begin(collection_name);
- iter_key != context.get_collection_keys_end(collection_name); ++iter_key) {
- /* Do not process removed keys. */
- if (!iter_key->second.exists)
- continue;
-
+ /*
+ * Update each collection while the test is running.
+ */
+ i = 0;
+ while (context.is_running() && !collection_names.empty()) {
+ if (i >= collection_names.size())
+ i = 0;
+ ret = cursors[i]->next(cursors[i]);
+ /* If we have reached the end of the collection, reset. */
+ if (ret == WT_NOTFOUND) {
+ testutil_check(cursors[i]->reset(cursors[i]));
+ ++i;
+ } else if (ret != 0)
+ /* Stop updating in case of an error. */
+ testutil_die(DEBUG_ERROR, "update_operation: cursor->next() failed: %d", ret);
+ else {
+ testutil_check(cursors[i]->get_key(cursors[i], &key_tmp));
+ /*
+ * The retrieved key needs to be passed to the update function. However, the update API
+ * doesn't guarantee our buffer will still be valid once it is called, so we copy the
+ * buffer and then pass the copy into the API.
+ */
+ key = key_value_t(key_tmp);
+ generated_value =
+ random_generator::random_generator::instance().generate_string(value_size);
ts = context.get_timestamp_manager()->get_next_ts();
/* Start a transaction if possible. */
@@ -183,17 +198,15 @@ class database_operation {
context.begin_transaction(session, "");
context.set_commit_timestamp(session, ts);
}
- generated_value =
- random_generator::random_generator::instance().generate_string(value_size);
- testutil_check(update(context.get_tracking(), it, collection_name,
- iter_key->first.c_str(), generated_value.c_str(), ts));
+
+ update(context.get_tracking(), cursors[i], collection_names[i], key.c_str(),
+ generated_value.c_str(), ts);
/* Commit the current transaction if possible. */
context.increment_operation_count();
if (context.can_commit_transaction())
context.commit_transaction(session, "");
}
- ++cpt;
}
/*
@@ -211,48 +224,34 @@ class database_operation {
private:
/* WiredTiger APIs wrappers for single operations. */
template <typename K, typename V>
- int
+ void
insert(WT_CURSOR *cursor, workload_tracking *tracking, const std::string &collection_name,
const K &key, const V &value, wt_timestamp_t ts)
{
- int error_code;
-
testutil_assert(cursor != nullptr);
+
cursor->set_key(cursor, key);
cursor->set_value(cursor, value);
- error_code = cursor->insert(cursor);
+ testutil_check(cursor->insert(cursor));
+ debug_print("key/value inserted", DEBUG_TRACE);
- if (error_code == 0) {
- debug_print("key/value inserted", DEBUG_TRACE);
- error_code =
- tracking->save(tracking_operation::INSERT, collection_name, key, value, ts);
- } else
- debug_print("key/value insertion failed", DEBUG_ERROR);
-
- return (error_code);
+ tracking->save_operation(tracking_operation::INSERT, collection_name, key, value, ts);
}
template <typename K, typename V>
- static int
+ static void
update(workload_tracking *tracking, WT_CURSOR *cursor, const std::string &collection_name,
K key, V value, wt_timestamp_t ts)
{
- int error_code;
-
testutil_assert(tracking != nullptr);
testutil_assert(cursor != nullptr);
+
cursor->set_key(cursor, key);
cursor->set_value(cursor, value);
- error_code = cursor->update(cursor);
-
- if (error_code == 0) {
- debug_print("key/value update", DEBUG_TRACE);
- error_code =
- tracking->save(tracking_operation::UPDATE, collection_name, key, value, ts);
- } else
- debug_print("key/value update failed", DEBUG_ERROR);
+ testutil_check(cursor->update(cursor));
+ debug_print("key/value updated", DEBUG_TRACE);
- return (error_code);
+ tracking->save_operation(tracking_operation::UPDATE, collection_name, key, value, ts);
}
/*
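As the comment in update_operation() notes, the key returned by the cursor is only valid until the next call on that cursor, so it is copied before being reused. A minimal sketch of that copy-before-reuse pattern against the public WT_CURSOR API, assuming a table with string ("S") key and value formats; the function name is illustrative only.

    #include <string>

    extern "C" {
    #include "wiredtiger.h"
    }

    /* Copy the cursor's current key, then update that key with a new value. */
    static int
    overwrite_current_key(WT_CURSOR *cursor, const std::string &new_value)
    {
        const char *key_tmp;
        int ret;

        if ((ret = cursor->get_key(cursor, &key_tmp)) != 0)
            return (ret);
        std::string key(key_tmp); /* Copy before the underlying buffer can be reused. */

        cursor->set_key(cursor, key.c_str());
        cursor->set_value(cursor, new_value.c_str());
        return (cursor->update(cursor));
    }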
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h
index e5275bc7819..2cf20066504 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h
@@ -69,18 +69,6 @@ class thread_context {
return (_database.get_collection_names());
}
- const keys_iterator_t
- get_collection_keys_begin(const std::string &collection_name) const
- {
- return (_database.get_collection_keys_begin(collection_name));
- }
-
- const keys_iterator_t
- get_collection_keys_end(const std::string &collection_name) const
- {
- return (_database.get_collection_keys_end(collection_name));
- }
-
thread_operation
get_thread_operation() const
{
@@ -132,7 +120,7 @@ class thread_context {
void
begin_transaction(WT_SESSION *session, const std::string &config)
{
- if (!_in_txn && _timestamp_manager->is_enabled()) {
+ if (!_in_txn && _timestamp_manager->enabled()) {
testutil_check(
session->begin_transaction(session, config.empty() ? nullptr : config.c_str()));
/* This randomizes the number of operations to be executed in one transaction. */
@@ -154,7 +142,7 @@ class thread_context {
bool
can_commit_transaction() const
{
- return (_timestamp_manager->is_enabled() && _in_txn &&
+ return (_timestamp_manager->enabled() && _in_txn &&
(!_running || (_current_op_count > _max_op_count)));
}
@@ -180,7 +168,7 @@ class thread_context {
void
set_commit_timestamp(WT_SESSION *session, wt_timestamp_t ts)
{
- if (!_timestamp_manager->is_enabled())
+ if (!_timestamp_manager->enabled())
return;
std::string config = std::string(COMMIT_TS) + "=" + _timestamp_manager->decimal_to_hex(ts);
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h
index 4d1b2d755a8..41efadb440b 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h
@@ -49,7 +49,7 @@
namespace test_harness {
/* Tracking operations. */
-enum class tracking_operation { CREATE, DELETE_COLLECTION, DELETE_KEY, INSERT, UPDATE };
+enum class tracking_operation { CREATE_COLLECTION, DELETE_COLLECTION, DELETE_KEY, INSERT, UPDATE };
/* Class used to track operations performed on collections */
class workload_tracking : public component {
@@ -107,41 +107,49 @@ class workload_tracking : public component {
/* Does not do anything. */
}
- template <typename K, typename V>
- int
- save(const tracking_operation &operation, const std::string &collection_name, const K &key,
- const V &value, wt_timestamp_t ts)
+ void
+ save_schema_operation(
+ const tracking_operation &operation, const std::string &collection_name, wt_timestamp_t ts)
{
- WT_CURSOR *cursor;
- int error_code = 0;
+ std::string error_message;
if (!_enabled)
- return (error_code);
-
- /* Select the correct cursor to save in the collection associated to specific operations. */
- switch (operation) {
- case tracking_operation::CREATE:
- case tracking_operation::DELETE_COLLECTION:
- cursor = _cursor_schema;
- cursor->set_key(cursor, collection_name.c_str(), ts);
- cursor->set_value(cursor, static_cast<int>(operation));
- break;
-
- default:
- cursor = _cursor_operations;
- cursor->set_key(cursor, collection_name.c_str(), key, ts);
- cursor->set_value(cursor, static_cast<int>(operation), value);
- break;
+ return;
+
+ if (operation == tracking_operation::CREATE_COLLECTION ||
+ operation == tracking_operation::DELETE_COLLECTION) {
+ _cursor_schema->set_key(_cursor_schema, collection_name.c_str(), ts);
+ _cursor_schema->set_value(_cursor_schema, static_cast<int>(operation));
+ testutil_check(_cursor_schema->insert(_cursor_schema));
+ } else {
+ error_message = "save_schema_operation: invalid operation " +
+ std::to_string(static_cast<int>(operation));
+ testutil_die(EINVAL, error_message.c_str());
}
+ debug_print("save_schema_operation: workload tracking saved operation.", DEBUG_TRACE);
+ }
- error_code = cursor->insert(cursor);
+ template <typename K, typename V>
+ void
+ save_operation(const tracking_operation &operation, const std::string &collection_name,
+ const K &key, const V &value, wt_timestamp_t ts)
+ {
+ std::string error_message;
- if (error_code == 0)
- debug_print("Workload tracking saved operation.", DEBUG_TRACE);
- else
- debug_print("Workload tracking failed to save operation !", DEBUG_ERROR);
+ if (!_enabled)
+ return;
- return error_code;
+ if (operation == tracking_operation::CREATE_COLLECTION ||
+ operation == tracking_operation::DELETE_COLLECTION) {
+ error_message =
+ "save_operation: invalid operation " + std::to_string(static_cast<int>(operation));
+ testutil_die(EINVAL, error_message.c_str());
+ } else {
+ _cursor_operations->set_key(_cursor_operations, collection_name.c_str(), key, ts);
+ _cursor_operations->set_value(_cursor_operations, static_cast<int>(operation), value);
+ testutil_check(_cursor_operations->insert(_cursor_operations));
+ }
+ debug_print("save_operation: workload tracking saved operation.", DEBUG_TRACE);
}
private:
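The tracking component now writes schema events and key/value events through two separate cursors, keyed by (collection, timestamp) and (collection, key, timestamp) respectively. A hedged sketch of the key/value case using the public cursor API follows; the table URI and the "SSQ"/"iS" formats are assumptions for illustration, since the real tracking table formats are defined elsewhere in the harness and are not part of this diff.

    #include <string>

    extern "C" {
    #include "wiredtiger.h"
    }

    /* Record one tracked key/value operation, mirroring save_operation() above. */
    static int
    track_operation(WT_SESSION *session, const std::string &collection, const std::string &key,
      const std::string &value, int op_type, uint64_t ts)
    {
        WT_CURSOR *cursor;
        int ret;

        if ((ret = session->create(
               session, "table:tracking_ops", "key_format=SSQ,value_format=iS")) != 0)
            return (ret);
        if ((ret = session->open_cursor(session, "table:tracking_ops", NULL, NULL, &cursor)) != 0)
            return (ret);

        /* Key: collection name, record key, timestamp. Value: operation type, record value. */
        cursor->set_key(cursor, collection.c_str(), key.c_str(), ts);
        cursor->set_value(cursor, op_type, value.c_str());
        ret = cursor->insert(cursor);

        (void)cursor->close(cursor);
        return (ret);
    }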
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h
index 5ef7992e773..aaab9ad25a9 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h
@@ -45,95 +45,113 @@ namespace test_harness {
class workload_validation {
public:
/*
- * Validate the on disk data against what has been tracked during the test.
- * - The first step is to replay the tracked operations so a representation in memory of the
- * collections is created. This representation is then compared to what is on disk.
- * - The second step is to go through what has been saved on disk and make sure the memory
- * representation has the same data.
- * operation_table_name is the collection that contains all the operations about the key/value
- * pairs in the different collections used during the test. schema_table_name is the collection
- * that contains all the operations about the creation or deletion of collections during the
- * test.
+ * Validate the on disk data against what has been tracked during the test. This is done by
+ * replaying the tracked operations so a representation in memory of the collections is created.
+ * This representation is then compared to what is on disk. operation_table_name: collection
+ * that contains all the operations about the key/value pairs in the different collections used
+ * during the test. schema_table_name: collection that contains all the operations about the
+ * creation or deletion of collections during the test.
*/
- bool
+ void
validate(const std::string &operation_table_name, const std::string &schema_table_name,
database &database)
{
+ WT_DECL_RET;
+ WT_CURSOR *cursor;
WT_SESSION *session;
- std::string collection_name;
- /* Existing collections after the test. */
+ wt_timestamp_t key_timestamp;
std::vector<std::string> created_collections, deleted_collections;
- bool is_valid = true;
+ const char *key, *key_collection_name, *value;
+ int value_operation_type;
+ std::string collection_name;
session = connection_manager::instance().create_session();
/* Retrieve the collections that were created and deleted during the test. */
- collection_name = schema_table_name;
parse_schema_tracking_table(
- session, collection_name, created_collections, deleted_collections);
-
- /* Make sure they exist in memory. */
- for (auto const &it : created_collections) {
- if (database.collections.count(it) == 0) {
- debug_print("Collection missing in memory: " + it, DEBUG_ERROR);
- is_valid = false;
- break;
- }
- }
-
- if (!is_valid)
- return (is_valid);
+ session, schema_table_name, created_collections, deleted_collections);
- /* Make sure they don't exist in memory nor on disk. */
+ /*
+ * Make sure the deleted collections do not exist on disk. The created collections are
+ * checked in check_reference.
+ */
for (auto const &it : deleted_collections) {
- if (database.collections.count(it) > 0) {
- debug_print(
- "Collection present in memory while it has been tracked as deleted: " + it,
- DEBUG_ERROR);
- is_valid = false;
- break;
- }
- if (!verify_collection_state(session, it, false)) {
- debug_print(
- "Collection present on disk while it has been tracked as deleted: " + it,
- DEBUG_ERROR);
- is_valid = false;
- break;
- }
+ if (!verify_collection_state(session, it, false))
+ testutil_die(DEBUG_ERROR,
+ "validate: collection %s present on disk while it has been tracked as deleted.",
+ it.c_str());
}
- for (auto const &collection_name : database.get_collection_names()) {
- if (!is_valid)
- break;
-
- /* Get the values associated to the different keys in the current collection. */
- parse_operation_tracking_table(
- session, operation_table_name, collection_name, database);
- /* Check all tracked operations in memory against the database on disk. */
- if (!check_reference(session, collection_name, database)) {
- debug_print(
- "check_reference failed for collection " + collection_name, DEBUG_ERROR);
- is_valid = false;
- }
- /* Check what has been saved on disk against what has been tracked. */
- else if (!check_disk_state(session, collection_name, database)) {
- debug_print(
- "check_disk_state failed for collection " + collection_name, DEBUG_ERROR);
- is_valid = false;
+ /* Parse the tracking table. */
+ testutil_check(
+ session->open_cursor(session, operation_table_name.c_str(), NULL, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ testutil_check(cursor->get_key(cursor, &key_collection_name, &key, &key_timestamp));
+ testutil_check(cursor->get_value(cursor, &value_operation_type, &value));
+
+ debug_print("Collection name is " + std::string(key_collection_name), DEBUG_TRACE);
+ debug_print("Key is " + std::string(key), DEBUG_TRACE);
+ debug_print("Timestamp is " + std::to_string(key_timestamp), DEBUG_TRACE);
+ debug_print("Operation type is " + std::to_string(value_operation_type), DEBUG_TRACE);
+ debug_print("Value is " + std::string(value), DEBUG_TRACE);
+
+ /*
+ * If the cursor points to values from a collection that has been created during the
+ * test, update the data model.
+ */
+ if (std::find(created_collections.begin(), created_collections.end(),
+ key_collection_name) != created_collections.end())
+ update_data_model(static_cast<tracking_operation>(value_operation_type),
+ key_collection_name, key, value, database);
+ /*
+ * The collection should be part of the deleted collections if it has not been found in
+ * the created ones.
+ */
+ else if (std::find(deleted_collections.begin(), deleted_collections.end(),
+ key_collection_name) == deleted_collections.end())
+ testutil_die(DEBUG_ERROR,
+ "validate: The collection %s is not part of the created or deleted collections.",
+ key_collection_name);
+
+ if (collection_name.empty())
+ collection_name = key_collection_name;
+ else if (collection_name != key_collection_name) {
+ /*
+ * The data model is now fully updated for the last read collection. It can be
+ * checked.
+ */
+ check_reference(session, collection_name, database.collections.at(collection_name));
+ /* Clear memory. */
+ delete database.collections[collection_name].values;
+ database.collections[collection_name].values = nullptr;
+
+ collection_name = key_collection_name;
}
+ };
+
+ /* The value of ret should be WT_NOTFOUND once the cursor has read all rows. */
+ if (ret != WT_NOTFOUND)
+ testutil_die(DEBUG_ERROR, "validate: cursor->next() %d.", ret);
+
+ /*
+ * Once the cursor has read the entire table, the last parsed collection has not been
+ * checked yet. We still have to make sure collection_name has been updated. It will remain
+ * empty if there are no collections to check after the end of the test (no collections
+ * created or all deleted).
+ */
+ if (!collection_name.empty()) {
+ check_reference(session, collection_name, database.collections.at(collection_name));
/* Clear memory. */
delete database.collections[collection_name].values;
database.collections[collection_name].values = nullptr;
}
-
- return (is_valid);
}
private:
/*
* Read the tracking table to retrieve the created and deleted collections during the test.
- * collection_name is the collection that contains the operations on the different collections
- * during the test.
+ * collection_name: collection that contains the operations on the different collections during
+ * the test.
*/
void
parse_schema_tracking_table(WT_SESSION *session, const std::string &collection_name,
@@ -155,7 +173,7 @@ class workload_validation {
debug_print("Operation type is " + std::to_string(value_operation_type), DEBUG_TRACE);
if (static_cast<tracking_operation>(value_operation_type) ==
- tracking_operation::CREATE) {
+ tracking_operation::CREATE_COLLECTION) {
deleted_collections.erase(std::remove(deleted_collections.begin(),
deleted_collections.end(), key_collection_name),
deleted_collections.end());
@@ -170,211 +188,95 @@ class workload_validation {
}
}
- /*
- * Parse the tracked operations to build a representation in memory of the collections at the
- * end of the test. tracking_collection_name is the tracking collection used to save the
- * operations performed on the collections during the test. collection_name is the collection
- * that needs to be represented in memory.
- */
+ /* Update the data model. */
void
- parse_operation_tracking_table(WT_SESSION *session, const std::string &tracking_collection_name,
- const std::string &collection_name, database &database)
+ update_data_model(const tracking_operation &operation, const std::string &collection_name,
+ const char *key, const char *value, database &database)
{
- WT_CURSOR *cursor;
- wt_timestamp_t key_timestamp;
- int exact, value_operation_type;
- const char *key, *key_collection_name, *value;
- std::vector<key_value_t> collection_keys;
- std::string key_str;
-
- /* Retrieve all keys from the given collection. */
- for (auto const &it : database.collections.at(collection_name).keys)
- collection_keys.push_back(it.first);
- /* There must be at least a key. */
- testutil_assert(!collection_keys.empty());
- /* Sort keys. */
- std::sort(collection_keys.begin(), collection_keys.end());
- /* Use the first key as a parameter for search_near. */
- key_str = collection_keys[0];
-
- testutil_check(
- session->open_cursor(session, tracking_collection_name.c_str(), NULL, NULL, &cursor));
-
- cursor->set_key(cursor, collection_name.c_str(), key_str.c_str());
- testutil_check(cursor->search_near(cursor, &exact));
- /*
- * Since the timestamp which is part of the key is not provided, exact cannot be 0. If it is
- * -1, we need to go to the next key.
- */
- testutil_assert(exact != 0);
- if (exact < 0)
- testutil_check(cursor->next(cursor));
-
- do {
- testutil_check(cursor->get_key(cursor, &key_collection_name, &key, &key_timestamp));
- testutil_check(cursor->get_value(cursor, &value_operation_type, &value));
-
- debug_print("Collection name is " + std::string(key_collection_name), DEBUG_TRACE);
- debug_print("Key is " + std::string(key), DEBUG_TRACE);
- debug_print("Timestamp is " + std::to_string(key_timestamp), DEBUG_TRACE);
- debug_print("Operation type is " + std::to_string(value_operation_type), DEBUG_TRACE);
- debug_print("Value is " + std::string(value), DEBUG_TRACE);
-
+ switch (operation) {
+ case tracking_operation::DELETE_KEY:
/*
- * If the cursor is reading an operation for a different collection, we know all the
- * operations have been parsed for the collection we were interested in.
+ * Operations are parsed from the oldest to the most recent one. It is safe to assume
+ * the key has been inserted previously in an existing collection and can be safely
+ * deleted.
*/
- if (std::string(key_collection_name) != collection_name)
- break;
-
- /* Replay the current operation. */
- switch (static_cast<tracking_operation>(value_operation_type)) {
- case tracking_operation::DELETE_KEY:
- /*
- * Operations are parsed from the oldest to the most recent one. It is safe to
- * assume the key has been inserted previously in an existing collection and can be
- * safely deleted.
- */
- database.collections.at(key_collection_name).keys.at(std::string(key)).exists =
- false;
- delete database.collections.at(key_collection_name).values;
- database.collections.at(key_collection_name).values = nullptr;
- break;
- case tracking_operation::INSERT: {
- /* Keys are unique, it is safe to assume the key has not been encountered before. */
- database.collections[key_collection_name].keys[std::string(key)].exists = true;
- if (database.collections[key_collection_name].values == nullptr) {
- database.collections[key_collection_name].values =
- new std::map<key_value_t, value_t>();
- }
- value_t v;
- v.value = key_value_t(value);
- std::pair<key_value_t, value_t> pair(key_value_t(key), v);
- database.collections[key_collection_name].values->insert(pair);
- break;
- }
- case tracking_operation::UPDATE:
- database.collections[key_collection_name].values->at(key).value =
- key_value_t(value);
- break;
- default:
- testutil_die(DEBUG_ABORT, "Unexpected operation in the tracking table: %d",
- value_operation_type);
- break;
- }
-
- } while (cursor->next(cursor) == 0);
-
- if (cursor->reset(cursor) != 0)
- debug_print("Cursor could not be reset !", DEBUG_ERROR);
+ database.collections.at(collection_name).keys.at(key).exists = false;
+ delete database.collections.at(collection_name).values;
+ database.collections.at(collection_name).values = nullptr;
+ break;
+ case tracking_operation::INSERT: {
+ /*
+ * Keys are unique, so it is safe to assume the key has not been encountered before.
+ */
+ database.collections[collection_name].keys[key].exists = true;
+ if (database.collections[collection_name].values == nullptr)
+ database.collections[collection_name].values = new std::map<key_value_t, value_t>();
+ value_t v;
+ v.value = key_value_t(value);
+ std::pair<key_value_t, value_t> pair(key_value_t(key), v);
+ database.collections[collection_name].values->insert(pair);
+ break;
+ }
+ case tracking_operation::UPDATE:
+ database.collections[collection_name].values->at(key).value = key_value_t(value);
+ break;
+ default:
+ testutil_die(DEBUG_ERROR, "Unexpected operation in the tracking table: %d",
+ static_cast<tracking_operation>(operation));
+ break;
+ }
}
/*
- * Compare the tracked operations against what has been saved on disk. database is the
- * representation in memory of the collections after the test according to the tracking table.
+ * Compare the tracked operations against what has been saved on disk. collection:
+ * representation in memory of the collection values and keys according to the tracking table.
*/
- bool
+ void
check_reference(
- WT_SESSION *session, const std::string &collection_name, const database &database)
+ WT_SESSION *session, const std::string &collection_name, const collection_t &collection)
{
bool is_valid;
- collection_t collection;
key_t key;
key_value_t key_str;
/* Check the collection exists on disk. */
- is_valid = verify_collection_state(session, collection_name, true);
-
- if (is_valid) {
- collection = database.collections.at(collection_name);
- /* Walk through each key/value pair of the current collection. */
- for (const auto &keys : collection.keys) {
- key_str = keys.first;
- key = keys.second;
- /* The key/value pair exists. */
- if (key.exists)
- is_valid = (is_key_present(session, collection_name, key_str.c_str()) == true);
- /* The key has been deleted. */
- else
- is_valid = (is_key_present(session, collection_name, key_str.c_str()) == false);
-
- /* Check the associated value is valid. */
- if (is_valid && key.exists) {
- testutil_assert(collection.values != nullptr);
- is_valid = verify_value(session, collection_name, key_str.c_str(),
- collection.values->at(key_str).value);
- }
-
- if (!is_valid) {
- debug_print("check_reference failed for key " + key_str, DEBUG_ERROR);
- break;
- }
- }
- }
-
- if (!is_valid)
- debug_print("check_reference failed for collection " + collection_name, DEBUG_ERROR);
-
- return (is_valid);
- }
-
- /* Check what is present on disk against what has been tracked. */
- bool
- check_disk_state(
- WT_SESSION *session, const std::string &collection_name, const database &database)
- {
- WT_CURSOR *cursor;
- collection_t collection;
- bool is_valid = true;
- /* Key/value pairs on disk. */
- const char *key_on_disk, *value_on_disk;
- key_value_t key_str, value_str;
-
- testutil_check(session->open_cursor(session, collection_name.c_str(), NULL, NULL, &cursor));
-
- collection = database.collections.at(collection_name);
-
- /* Read the collection on disk. */
- while (is_valid && (cursor->next(cursor) == 0)) {
- testutil_check(cursor->get_key(cursor, &key_on_disk));
- testutil_check(cursor->get_value(cursor, &value_on_disk));
+ if (!verify_collection_state(session, collection_name, true))
+ testutil_die(DEBUG_ERROR,
+ "check_reference: collection %s not present on disk while it has been tracked as "
+ "created.",
+ collection_name.c_str());
+
+ /* Walk through each key/value pair of the current collection. */
+ for (const auto &keys : collection.keys) {
+ key_str = keys.first;
+ key = keys.second;
+ /* The key/value pair exists. */
+ if (key.exists)
+ is_valid = (is_key_present(session, collection_name, key_str.c_str()) == true);
+ /* The key has been deleted. */
+ else
+ is_valid = (is_key_present(session, collection_name, key_str.c_str()) == false);
- key_str = std::string(key_on_disk);
-
- debug_print("Key on disk is " + key_str, DEBUG_TRACE);
- debug_print("Value on disk is " + std::string(value_on_disk), DEBUG_TRACE);
+ if (!is_valid)
+ testutil_die(DEBUG_ERROR, "check_reference: failed for key %s in collection %s.",
+ key_str.c_str(), collection_name.c_str());
- /* Check the key on disk has been saved in memory too. */
- if ((collection.keys.count(key_str) > 0) && collection.keys.at(key_str).exists) {
- /* Memory should be allocated for values. */
+ /* Check the associated value is valid. */
+ if (key.exists) {
testutil_assert(collection.values != nullptr);
- value_str = collection.values->at(key_str).value;
- /*
- * Check the key/value pair on disk matches the one in memory from the tracked
- * operations.
- */
- is_valid = (value_str == key_value_t(value_on_disk));
- if (!is_valid)
- debug_print(" Key/Value pair mismatch.\n Disk key: " + key_str +
- "\n Disk value: " + std ::string(value_on_disk) +
- "\n Tracking table key: " + key_str + "\n Tracking table value exists: " +
- std::to_string(collection.keys.at(key_str).exists) +
- "\n Tracking table value: " + value_str,
- DEBUG_ERROR);
- } else {
- is_valid = false;
- debug_print(
- "The key " + std::string(key_on_disk) + " present on disk has not been tracked",
- DEBUG_ERROR);
+ if (!verify_value(session, collection_name, key_str.c_str(),
+ collection.values->at(key_str).value))
+ testutil_die(DEBUG_ERROR,
+ "check_reference: failed for key %s / value %s in collection %s.",
+ key_str.c_str(), collection.values->at(key_str).value.c_str(),
+ collection_name.c_str());
}
}
-
- return (is_valid);
}
/*
- * Check whether a collection exists on disk. collection_name is the collection to check. exists
- * needs to be set to true if the collection is expected to be existing, false otherwise.
+ * Check whether a collection exists on disk. exists: needs to be set to true if the collection
+ * is expected to exist, false otherwise.
*/
bool
verify_collection_state(
@@ -385,6 +287,7 @@ class workload_validation {
return (exists ? (ret == 0) : (ret != 0));
}
+ /* Check whether a key exists in a collection on disk. */
template <typename K>
bool
is_key_present(WT_SESSION *session, const std::string &collection_name, const K &key)
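Validation now makes a single ordered pass over the operations tracking table, replays each event into the in-memory model and, whenever the collection name changes, compares the completed model against the on-disk collection. A small standalone sketch of that replay-then-compare idea, with plain maps standing in for the tracking table and the on-disk data; the type and function names are illustrative, not framework APIs.

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    enum class op { insert_op, update_op, delete_op };
    struct event {
        std::string key, value;
        op type;
    };

    /* Replay tracked events into an expected model, then compare it with the on-disk state. */
    static bool
    replay_and_compare(
      const std::vector<event> &tracked, const std::map<std::string, std::string> &on_disk)
    {
        std::map<std::string, std::string> expected;
        for (const auto &e : tracked) {
            switch (e.type) {
            case op::insert_op:
            case op::update_op:
                expected[e.key] = e.value;
                break;
            case op::delete_op:
                expected.erase(e.key);
                break;
            }
        }
        return (expected == on_disk);
    }

    int
    main()
    {
        std::vector<event> tracked = {{"key1", "a", op::insert_op}, {"key1", "b", op::update_op},
          {"key2", "c", op::insert_op}};
        std::map<std::string, std::string> on_disk = {{"key1", "b"}, {"key2", "c"}};
        assert(replay_and_compare(tracked, on_disk));
        return (0);
    }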
diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h
index 9413834ba31..5e084229123 100644
--- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h
+++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h
@@ -72,6 +72,7 @@ class workload_generator : public component {
/* Populate the database. */
_database_operation->populate(_database, _timestamp_manager, _config, _tracking);
+ _db_populated = true;
/* Retrieve useful parameters from the test configuration. */
transaction_config = _config->get_subconfig(OPS_PER_TRANSACTION);
@@ -87,7 +88,7 @@ class workload_generator : public component {
testutil_assert(value_size >= 0);
/* Generate threads to execute read operations on the collections. */
- for (int i = 0; i < read_threads; ++i) {
+ for (size_t i = 0; i < read_threads && _running; ++i) {
thread_context *tc = new thread_context(_timestamp_manager, _tracking, _database,
thread_operation::READ, max_operation_per_transaction, min_operation_per_transaction,
value_size, throttle());
@@ -96,7 +97,7 @@ class workload_generator : public component {
}
/* Generate threads to execute update operations on the collections. */
- for (int i = 0; i < update_threads; ++i) {
+ for (size_t i = 0; i < update_threads && _running; ++i) {
thread_context *tc = new thread_context(_timestamp_manager, _tracking, _database,
thread_operation::UPDATE, max_operation_per_transaction,
min_operation_per_transaction, value_size, throttle(update_config));
@@ -123,7 +124,13 @@ class workload_generator : public component {
database &
get_database()
{
- return _database;
+ return (_database);
+ }
+
+ bool
+ db_populated() const
+ {
+ return (_db_populated);
}
/* Workload threaded operations. */
@@ -148,7 +155,7 @@ class workload_generator : public component {
db_operation.update_operation(context, session);
break;
default:
- testutil_die(DEBUG_ABORT, "system: thread_operation is unknown : %d",
+ testutil_die(DEBUG_ERROR, "system: thread_operation is unknown : %d",
static_cast<int>(context.get_thread_operation()));
break;
}
@@ -161,6 +168,7 @@ class workload_generator : public component {
timestamp_manager *_timestamp_manager;
workload_tracking *_tracking;
std::vector<thread_context *> _workers;
+ bool _db_populated = false;
};
} // namespace test_harness
diff --git a/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx b/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx
index 5fe6641cc3b..67d77116cf1 100755
--- a/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx
+++ b/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx
@@ -140,7 +140,7 @@ main(int argc, char *argv[])
* -l : Trace level.
* -t : Test to run. All tests are run if not specified.
*/
- for (int i = 1; (i < argc) && (error_code == 0); ++i) {
+ for (size_t i = 1; (i < argc) && (error_code == 0); ++i) {
if (std::string(argv[i]) == "-h") {
print_help();
return 0;
diff --git a/src/third_party/wiredtiger/test/csuite/incr_backup/main.c b/src/third_party/wiredtiger/test/csuite/incr_backup/main.c
index 9b535d7bc54..b09e1b44da4 100644
--- a/src/third_party/wiredtiger/test/csuite/incr_backup/main.c
+++ b/src/third_party/wiredtiger/test/csuite/incr_backup/main.c
@@ -74,8 +74,8 @@ static bool do_rename = true;
} while (0)
/*
- * We keep an array of tables, each one may or may not be in use.
- * "In use" means it has been created, and will be updated from time to time.
+ * We keep an array of tables, each one may or may not be in use. "In use" means it has been
+ * created, and will be updated from time to time.
*/
typedef struct {
char *name; /* non-null entries represent tables in use */
@@ -189,8 +189,7 @@ key_value(uint64_t change_count, char *key, size_t key_size, WT_ITEM *item, OPER
* is inserted, it is all the letter 'a'. When the value is updated, it is mostly 'b', with some
* 'c' mixed in. When the value is to be modified, we'll end up with a value with mostly 'b' and
* 'M' mixed in, in different spots. Thus the modify operation will have both additions ('M')
- * and
- * subtractions ('c') from the previous version.
+ * and subtractions ('c') from the previous version.
*/
if (op_type == INSERT)
ch = 'a';
diff --git a/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c b/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c
index 5434aa191ef..27d1a58ccbd 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c
@@ -183,9 +183,8 @@ main(int argc, char *argv[])
/*
* A linear pass through the list, adding random elements.
*
- * WiredTiger configurations are usually "the last one set wins", but
- * "shared_cache" and "cache_set" options aren't allowed in the same
- * configuration string.
+ * WiredTiger configurations are usually "the last one set wins", but "shared_cache" and
+ * "cache_set" options aren't allowed in the same configuration string.
*/
for (i = 0; i < WT_ELEMENTS(list); ++i) {
p = list[i];
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index b4ab0507399..6e8c3f01d11 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -2061,10 +2061,6 @@ tasks:
- func: "format test"
vars:
extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row compression=zlib huffman_value=1
- # FIXME-WT-6668: temporarily disable lower isolation level test
- # - func: "format test"
- # vars:
- # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row isolation=random transaction_timestamps=0
- func: "format test"
vars:
extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row data_source=lsm bloom=1
diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c
index 5af036d1495..c1decf24b6f 100644
--- a/src/third_party/wiredtiger/test/format/bulk.c
+++ b/src/third_party/wiredtiger/test/format/bulk.c
@@ -38,7 +38,8 @@ bulk_begin_transaction(WT_SESSION *session)
uint64_t ts;
char buf[64];
- wiredtiger_begin_transaction(session, "isolation=snapshot");
+ /* Writes require snapshot isolation. */
+ wiredtiger_begin_transaction(session, NULL);
ts = __wt_atomic_addv64(&g.timestamp, 1);
testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, ts));
testutil_check(session->timestamp_transaction(session, buf));
@@ -113,7 +114,6 @@ wts_load(void)
bulk_begin_transaction(session);
for (committed_keyno = keyno = 0; ++keyno <= g.c_rows;) {
- key_gen(&key, keyno);
val_gen(NULL, &value, keyno);
switch (g.type) {
@@ -132,6 +132,7 @@ wts_load(void)
trace_msg("bulk %" PRIu32 " {%.*s}", keyno, (int)value.size, (char *)value.data);
break;
case ROW:
+ key_gen(&key, keyno);
cursor->set_key(cursor, &key);
cursor->set_value(cursor, &value);
if (g.trace_all)
@@ -188,22 +189,22 @@ wts_load(void)
}
}
+ if (g.c_txn_timestamps)
+ bulk_commit_transaction(session);
+
/*
* Ideally, the insert loop runs until the number of rows plus one, in which case row counts are
* correct. If the loop exited early, reset the counters and rewrite the CONFIG file (so reopens
* aren't surprised).
*/
if (keyno != g.c_rows + 1) {
- testutil_assert(committed_keyno > 0);
+ g.c_rows = g.c_txn_timestamps ? committed_keyno : (keyno - 1);
+ testutil_assert(g.c_rows > 0);
+ g.rows = g.c_rows;
- g.rows = committed_keyno;
- g.c_rows = (uint32_t)committed_keyno;
config_print(false);
}
- if (g.c_txn_timestamps)
- bulk_commit_transaction(session);
-
testutil_check(cursor->close(cursor));
trace_msg("%s", "=============== bulk load stop");
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 20431b3f1ab..f7321f77c99 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -51,8 +51,8 @@ static void config_map_checksum(const char *, u_int *);
static void config_map_compression(const char *, u_int *);
static void config_map_encryption(const char *, u_int *);
static void config_map_file_type(const char *, u_int *);
-static void config_map_isolation(const char *, u_int *);
static void config_pct(void);
+static void config_prefix(void);
static void config_reset(void);
static void config_transaction(void);
@@ -202,6 +202,7 @@ config_run(void)
config_compression("btree.compression");
config_compression("logging.compression");
config_encryption();
+ config_prefix();
/* Configuration based on the configuration already chosen. */
config_directio();
@@ -876,23 +877,6 @@ config_pct(void)
}
/*
- * Cursor modify isn't possible for anything besides snapshot isolation transactions. If both
- * forced, it's an error. The run-time operations code converts modify operations into updates
- * if we're in some other transaction type, but if we're never going to be able to do a modify,
- * turn it off in the CONFIG output to avoid misleading debuggers.
- */
- if (g.c_isolation_flag == ISOLATION_READ_COMMITTED ||
- g.c_isolation_flag == ISOLATION_READ_UNCOMMITTED) {
- if (config_is_perm("transaction.isolation") && config_is_perm("ops.pct.modify") &&
- g.c_modify_pct != 0)
- testutil_die(
- EINVAL, "WT_CURSOR.modify only supported with snapshot isolation transactions");
-
- list[CONFIG_MODIFY_ENTRY].order = 0;
- *list[CONFIG_MODIFY_ENTRY].vp = 0;
- }
-
- /*
* Walk the list, allocating random numbers of operations in a random order.
*
* If the "order" field is non-zero, we need to create a value for this operation. Find the
@@ -924,116 +908,70 @@ config_pct(void)
}
/*
+ * config_prefix --
+ * Prefix configuration.
+ */
+static void
+config_prefix(void)
+{
+ /* Add prefix compression if prefixes are configured and no explicit choice was made. */
+ if (g.c_prefix != 0 && g.c_prefix_compression == 0 &&
+ !config_is_perm("btree.prefix_compression"))
+ config_single("btree.prefix_compression=on", false);
+}
+
+/*
* config_transaction --
* Transaction configuration.
*/
static void
config_transaction(void)
{
- /*
- * WiredTiger cannot support relaxed isolation levels. Turn off everything but timestamps with
- * snapshot isolation.
- */
- if ((!g.c_txn_timestamps && config_is_perm("transaction.timestamps")) ||
- (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")))
- testutil_die(EINVAL, "format limited to timestamp and snapshot-isolation testing");
- if (!g.c_txn_timestamps)
- config_single("transaction.timestamps=on", false);
- if (g.c_isolation_flag != ISOLATION_SNAPSHOT)
- config_single("transaction.isolation=snapshot", false);
-
- /*
- * Check the permanent configuration. We can't prepare a transaction if logging is configured or
- * timestamps aren't configured. For repeatable reads to work in timestamp testing, all updates
- * must be done in a snapshot isolation transaction.
- */
+ /* Transaction prepare requires timestamps and is incompatible with logging. */
if (g.c_prepare && config_is_perm("ops.prepare")) {
if (g.c_logging && config_is_perm("logging"))
testutil_die(EINVAL, "prepare is incompatible with logging");
if (!g.c_txn_timestamps && config_is_perm("transaction.timestamps"))
testutil_die(EINVAL, "prepare requires transaction timestamps");
- if (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation"))
- testutil_die(EINVAL, "prepare requires snapshot isolation");
- if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency"))
- testutil_die(EINVAL, "prepare requires transaction frequency set to 100");
}
+
+ /* Transaction timestamps are incompatible with implicit transactions. */
if (g.c_txn_timestamps && config_is_perm("transaction.timestamps")) {
- if (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation"))
- testutil_die(EINVAL, "timestamps require snapshot isolation");
- if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency"))
- testutil_die(EINVAL, "timestamps require transaction frequency set to 100");
- }
- if (g.c_logging && config_is_perm("logging") && g.c_prepare)
- config_single("ops.prepare=off", false);
+ if (g.c_txn_implicit && config_is_perm("transaction.implicit"))
+ testutil_die(
+ EINVAL, "transaction.timestamps is incompatible with implicit transactions");
- /* FIXME-WT-6431: temporarily disable salvage with timestamps. */
- if (g.c_txn_timestamps && g.c_salvage) {
- if (config_is_perm("ops.salvage"))
- testutil_die(EINVAL, "salvage cannot run with timestamps");
- config_single("ops.salvage=off", false);
+ /* FIXME-WT-6431: temporarily disable salvage with timestamps. */
+ if (g.c_salvage && config_is_perm("ops.salvage"))
+ testutil_die(EINVAL, "transaction.timestamps is incompatible with salvage");
}
- if (g.c_isolation_flag == ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")) {
- if (!g.c_txn_timestamps && config_is_perm("transaction.timestamps"))
- testutil_die(EINVAL, "snapshot isolation requires timestamps");
- if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency"))
- testutil_die(EINVAL, "snapshot isolation requires transaction frequency set to 100");
- }
- if (g.c_txn_rollback_to_stable && config_is_perm("transaction.rollback_to_stable") &&
- g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation"))
- testutil_die(EINVAL, "rollback to stable requires snapshot isolation");
/*
- * The permanent configuration has no incompatible settings, adjust the temporary configuration
- * as necessary. Prepare overrides timestamps, overrides isolation, for no reason other than
- * prepare is the least configured and timestamps are the option we want to test the most.
+     * Incompatible permanent configurations have been checked; now turn off any incompatible
+     * flags. The choices favor prepare (it's only rarely configured), then timestamps. Any of the
+     * options may still be required by the run, so keep checking until we run out of combinations
+     * (for example, prepare turns off logging, so by the time we check logging, logging must have
+     * been required by the run if both logging and prepare are still set, and we can just turn off
+     * prepare in that case).
*/
if (g.c_prepare) {
- if (g.c_logging)
+ if (!config_is_perm("logging"))
config_single("logging=off", false);
- if (!g.c_txn_timestamps)
- config_single("transaction.timestamps=on", false);
- if (g.c_isolation_flag != ISOLATION_SNAPSHOT)
- config_single("transaction.isolation=snapshot", false);
- if (g.c_txn_freq != 100)
- config_single("transaction.frequency=100", false);
- }
- if (g.c_txn_rollback_to_stable) {
- if (!g.c_txn_timestamps)
+ if (!config_is_perm("transaction.timestamps"))
config_single("transaction.timestamps=on", false);
}
if (g.c_txn_timestamps) {
- if (g.c_isolation_flag != ISOLATION_SNAPSHOT)
- config_single("transaction.isolation=snapshot", false);
- if (g.c_txn_freq != 100)
- config_single("transaction.frequency=100", false);
- }
- if (g.c_isolation_flag == ISOLATION_NOT_SET) {
- switch (mmrand(NULL, 1, 20)) {
- case 1: /* 5% */
- config_single("transaction.isolation=random", false);
- break;
- case 2: /* 5% */
- config_single("transaction.isolation=read-uncommitted", false);
- break;
- case 3: /* 5% */
- config_single("transaction.isolation=read-committed", false);
- break;
- default: /* 85% */
- config_single("transaction.isolation=snapshot", false);
- break;
- }
- if (g.c_isolation_flag == ISOLATION_SNAPSHOT) {
- if (!g.c_txn_timestamps)
- config_single("transaction.timestamps=on", false);
- if (g.c_txn_freq != 100)
- config_single("transaction.frequency=100", false);
- } else {
- if (g.c_prepare)
- config_single("ops.prepare=off", false);
- if (g.c_txn_timestamps)
- config_single("transaction.timestamps=off", false);
- }
+ if (!config_is_perm("transaction.implicit"))
+ config_single("transaction.implicit=0", false);
+ if (!config_is_perm("ops.salvage"))
+ config_single("ops.salvage=off", false);
}
+ if (g.c_logging)
+ config_single("ops.prepare=off", false);
+ if (g.c_txn_implicit)
+ config_single("transaction.timestamps=off", false);
+ if (g.c_salvage)
+ config_single("transaction.timestamps=off", false);
}
/*
@@ -1175,9 +1113,6 @@ config_reset(void)
{
CONFIG *cp;
- if (!config_is_perm("transaction.isolation"))
- g.c_isolation_flag = ISOLATION_NOT_SET;
-
/* Clear temporary allocated configuration data. */
for (cp = c; cp->name != NULL; ++cp) {
F_CLR(cp, C_TEMP);
@@ -1289,9 +1224,6 @@ config_single(const char *s, bool perm)
} else if (strncmp(s, "runs.type", strlen("runs.type")) == 0) {
config_map_file_type(equalp, &g.type);
*cp->vstr = dstrdup(config_file_type(g.type));
- } else if (strncmp(s, "transaction.isolation", strlen("transaction.isolation")) == 0) {
- config_map_isolation(equalp, &g.c_isolation_flag);
- *cp->vstr = dstrdup(equalp);
} else if (strncmp(s, "logging.compression", strlen("logging.compression")) == 0) {
config_map_compression(equalp, &g.c_logging_compression_flag);
*cp->vstr = dstrdup(equalp);
@@ -1475,25 +1407,6 @@ config_map_encryption(const char *s, u_int *vp)
}
/*
- * config_map_isolation --
- * Map an isolation configuration to a flag.
- */
-static void
-config_map_isolation(const char *s, u_int *vp)
-{
- if (strcmp(s, "random") == 0)
- *vp = ISOLATION_RANDOM;
- else if (strcmp(s, "read-uncommitted") == 0)
- *vp = ISOLATION_READ_UNCOMMITTED;
- else if (strcmp(s, "read-committed") == 0)
- *vp = ISOLATION_READ_COMMITTED;
- else if (strcmp(s, "snapshot") == 0)
- *vp = ISOLATION_SNAPSHOT;
- else
- testutil_die(EINVAL, "illegal isolation configuration: %s", s);
-}
-
-/*
* config_is_perm
* Return if a specific configuration entry was permanently set.
*/
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h
index a06509b0dba..0feb22f202c 100644
--- a/src/third_party/wiredtiger/test/format/config.h
+++ b/src/third_party/wiredtiger/test/format/config.h
@@ -60,14 +60,14 @@ typedef struct {
#define COMPRESSION_LIST " (none | lz4 | snappy | zlib | zstd)"
static CONFIG c[] = {
- /* 5% */
- {"assert.commit_timestamp", "assert commit_timestamp", C_BOOL, 5, 0, 0,
- &g.c_assert_commit_timestamp, NULL},
-
- /* 5% */
- {"assert.read_timestamp", "assert read_timestamp", C_BOOL, 5, 0, 0, &g.c_assert_read_timestamp,
+ /* 2% */
+ {"assert.read_timestamp", "assert read_timestamp", C_BOOL, 2, 0, 0, &g.c_assert_read_timestamp,
NULL},
+ /* 2% */
+ {"assert.write_timestamp", "set write_timestamp_usage and assert write_timestamp", C_BOOL, 2, 0,
+ 0, &g.c_assert_write_timestamp, NULL},
+
/* 20% */
{"backup", "configure backups", C_BOOL, 20, 0, 0, &g.c_backups, NULL},
@@ -98,8 +98,6 @@ static CONFIG c[] = {
{"btree.internal_page_max", "btree internal node maximum size", 0x0, 9, 17, 27,
&g.c_intl_page_max, NULL},
- {"btree.key_gap", "btree page instantiated key gap", 0x0, 0, 20, 20, &g.c_key_gap, NULL},
-
{"btree.key_max", "maximum key size", 0x0, 20, 128, MEGABYTE(10), &g.c_key_max, NULL},
/*
@@ -113,6 +111,8 @@ static CONFIG c[] = {
{"btree.memory_page_max", "maximum cache page size", 0x0, 1, 10, 128, &g.c_memory_page_max, NULL},
+ {"btree.prefix", "common key prefix", C_BOOL, 3, 0, 0, &g.c_prefix, NULL},
+
/* 80% */
{"btree.prefix_compression", "configure prefix compressed keys", C_BOOL, 80, 0, 0,
&g.c_prefix_compression, NULL},
@@ -184,8 +184,8 @@ static CONFIG c[] = {
/*
* 0%
- * FIXME-WT-7418 and FIXME-WT-7416: Temporarily disable import until WT_ROLLBACK error and
- * interaction with backup thread is fixed. Should be 20%
+     * FIXME-WT-7418 and FIXME-WT-7510: Temporarily disable import until the WT_ROLLBACK error and
+     * wt_copy_and_sync error are fixed. It should be (C_BOOL, 20, 0, 0).
*/
{"import", "import table from newly created database", C_BOOL, 0, 0, 0, &g.c_import, NULL},
@@ -340,19 +340,11 @@ static CONFIG c[] = {
/* 2% */
{"stress.split_8", "stress splits (#8)", C_BOOL, 2, 0, 0, &g.c_timing_stress_split_8, NULL},
- {"transaction.frequency", "operations inside an explicit transaction (percentage)", 0x0, 1, 100,
- 100, &g.c_txn_freq, NULL},
-
- {"transaction.isolation",
- "isolation level (random | read-uncommitted | read-committed | snapshot)", C_IGNORE | C_STRING,
- 0, 0, 0, NULL, &g.c_isolation},
-
- /* 0% - By default, turned off until fallout has been debugged. */
- {"transaction.rollback_to_stable", "configure rollback_to_stable", C_BOOL, 0, 0, 0,
- &g.c_txn_rollback_to_stable, NULL},
+ {"transaction.implicit", "implicit, without timestamps, transactions (percentage)", 0x0, 0, 100,
+ 100, &g.c_txn_implicit, NULL},
/* 70% */
- {"transaction.timestamps", "configure transaction timestamps", C_BOOL, 70, 0, 0,
+ {"transaction.timestamps", "all transactions (or none), have timestamps", C_BOOL, 80, 0, 0,
&g.c_txn_timestamps, NULL},
{"wiredtiger.config", "wiredtiger_open API configuration string", C_IGNORE | C_STRING, 0, 0, 0,
diff --git a/src/third_party/wiredtiger/test/format/config_compat.c b/src/third_party/wiredtiger/test/format/config_compat.c
index 4a5789bf854..2926d54ca4b 100644
--- a/src/third_party/wiredtiger/test/format/config_compat.c
+++ b/src/third_party/wiredtiger/test/format/config_compat.c
@@ -101,10 +101,6 @@ static const char *list[] = {
"btree.internal_key_truncation",
"internal_page_max=",
"btree.internal_page_max",
- "isolation=",
- "transaction.isolation",
- "key_gap=",
- "btree.key_gap",
"key_max=",
"btree.key_max",
"key_min=",
diff --git a/src/third_party/wiredtiger/test/format/config_compat.sed b/src/third_party/wiredtiger/test/format/config_compat.sed
index 0f43b19fc6a..b90b21332e8 100644
--- a/src/third_party/wiredtiger/test/format/config_compat.sed
+++ b/src/third_party/wiredtiger/test/format/config_compat.sed
@@ -8,7 +8,6 @@ s/^btree.dictionary=/dictionary=/
s/^btree.huffman_value=/huffman_value=/
s/^btree.internal_key_truncation=/internal_key_truncation=/
s/^btree.internal_page_max=/internal_page_max=/
-s/^btree.key_gap=/key_gap=/
s/^btree.key_max=/key_max=/
s/^btree.key_min=/key_min=/
s/^btree.leaf_page_max=/leaf_page_max=/
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 7aefc071396..cd46f43a781 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -140,8 +140,8 @@ typedef struct {
uint32_t c_abort; /* Config values */
uint32_t c_alter;
- uint32_t c_assert_commit_timestamp;
uint32_t c_assert_read_timestamp;
+ uint32_t c_assert_write_timestamp;
uint32_t c_auto_throttle;
char *c_backup_incremental;
uint32_t c_backup_incr_granularity;
@@ -178,8 +178,6 @@ typedef struct {
uint32_t c_insert_pct;
uint32_t c_internal_key_truncation;
uint32_t c_intl_page_max;
- char *c_isolation;
- uint32_t c_key_gap;
uint32_t c_key_max;
uint32_t c_key_min;
uint32_t c_leaf_page_max;
@@ -197,6 +195,7 @@ typedef struct {
uint32_t c_mmap_all;
uint32_t c_modify_pct;
uint32_t c_ops;
+ uint32_t c_prefix;
uint32_t c_prefix_compression;
uint32_t c_prefix_compression_min;
uint32_t c_prepare;
@@ -228,8 +227,7 @@ typedef struct {
uint32_t c_timing_stress_split_7;
uint32_t c_timing_stress_split_8;
uint32_t c_truncate;
- uint32_t c_txn_freq;
- uint32_t c_txn_rollback_to_stable;
+ uint32_t c_txn_implicit;
uint32_t c_txn_timestamps;
uint32_t c_value_max;
uint32_t c_value_min;
@@ -270,13 +268,6 @@ typedef struct {
#define ENCRYPT_ROTN_7 2
u_int c_encryption_flag; /* Encryption flag value */
-#define ISOLATION_NOT_SET 0
-#define ISOLATION_RANDOM 1
-#define ISOLATION_READ_UNCOMMITTED 2
-#define ISOLATION_READ_COMMITTED 3
-#define ISOLATION_SNAPSHOT 4
- u_int c_isolation_flag; /* Isolation flag value */
-
/* The page must be a multiple of the allocation size, and 512 always works. */
#define BLOCK_ALLOCATION_SIZE 512
uint32_t intl_page_max; /* Maximum page sizes */
@@ -284,6 +275,7 @@ typedef struct {
uint64_t rows; /* Total rows */
+ uint32_t prefix_len; /* Common key prefix length */
uint32_t key_rand_len[1031]; /* Key lengths */
} GLOBAL;
extern GLOBAL g;
diff --git a/src/third_party/wiredtiger/test/format/format.sh b/src/third_party/wiredtiger/test/format/format.sh
index 9d462aed0df..a2fcc71c93e 100755
--- a/src/third_party/wiredtiger/test/format/format.sh
+++ b/src/third_party/wiredtiger/test/format/format.sh
@@ -256,6 +256,37 @@ skip_known_errors()
return 1
}
+# Categorize the failures
+# $1 Log file
+categorize_failure()
+{
+ log=$1
+
+    # Add any important configs here; they are picked out of the failed run's detailed configuration.
+ configs=("backup=" "runs.source" "runs.type" "transaction.isolation" "transaction.rollback_to_stable"
+ "ops.prepare" "transaction.timestamps")
+ count=${#configs[@]}
+
+ search_string=""
+
+ # now loop through the config array
+ for ((i=0; i<$count; i++))
+ do
+ if [ $i == $(($count - 1)) ]
+ then
+ search_string+=${configs[i]}
+ else
+ search_string+="${configs[i]}|"
+ fi
+ done
+
+ echo "############################################"
+ echo "test/format run configuration highlights"
+ echo "############################################"
+ grep -E "$search_string" $log
+ echo "############################################"
+}
+
# Report a failure.
# $1 directory name
report_failure()
@@ -288,6 +319,8 @@ report_failure()
echo "$name: $dir/CONFIG:"
sed 's/^/ /' < $dir/CONFIG
+ categorize_failure $log
+
echo "$name: failure status reported" > $dir/$status
}
diff --git a/src/third_party/wiredtiger/test/format/kv.c b/src/third_party/wiredtiger/test/format/kv.c
index 04e9e0fc46c..32788b86ffb 100644
--- a/src/third_party/wiredtiger/test/format/kv.c
+++ b/src/third_party/wiredtiger/test/format/kv.c
@@ -75,6 +75,10 @@ key_init(void)
for (i = 0; i < WT_ELEMENTS(g.key_rand_len); ++i)
fprintf(fp, "%" PRIu32 "\n", g.key_rand_len[i]);
fclose_and_clear(&fp);
+
+ /* Fill in the common key prefix length (which is added to the key min/max). */
+ if (g.c_prefix != 0)
+ g.prefix_len = mmrand(NULL, 15, 80);
}
/*
@@ -87,7 +91,7 @@ key_gen_init(WT_ITEM *key)
size_t i, len;
char *p;
- len = WT_MAX(KILOBYTE(100), g.c_key_max);
+ len = WT_MAX(KILOBYTE(100), g.c_key_max + g.prefix_len);
p = dmalloc(len);
for (i = 0; i < len; ++i)
p[i] = "abcdefghijklmnopqrstuvwxyz"[i % 26];
@@ -111,45 +115,62 @@ key_gen_teardown(WT_ITEM *key)
/*
* key_gen_common --
- * Key generation code shared between normal and insert key generation.
+ * Row-store key generation code shared between normal and insert key generation.
*/
void
key_gen_common(WT_ITEM *key, uint64_t keyno, const char *const suffix)
{
- int len;
+ uint64_t n;
char *p;
+ const char *bucket;
+
+ testutil_assert(g.type == ROW);
p = key->mem;
/*
- * The key always starts with a 10-digit string (the specified row) followed by two digits, a
- * random number between 1 and 15 if it's an insert, otherwise 00.
+     * The workload we're trying to mimic with a prefix is a long common prefix followed by a record
+     * number; the tricks are creating a prefix that won't re-order keys and changing the prefix
+     * with some regularity to test prefix boundaries. Split the key space into power-of-2 buckets:
+ * that results in tiny runs of prefix strings at the beginning of the tree, and increasingly
+ * large common prefixes as the tree grows (with a testing sweet spot in the middle). After the
+ * bucket value, append a string of common bytes. The standard, zero-padded key itself sorts
+ * lexicographically, meaning the common key prefix will grow and shrink by a few bytes as the
+ * number increments, which is a good thing for testing.
+ */
+ if (g.prefix_len > 0) {
+ bucket = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+ for (n = keyno; n > 0; n >>= 1) {
+ if (*bucket == 'z')
+ break;
+ ++bucket;
+ }
+ p[0] = *bucket;
+ memset(p + 1, 'C', g.prefix_len - 1);
+ p += g.prefix_len;
+ }
+
+ /*
+ * After any common prefix, the key starts with a 10-digit string (the specified row) followed
+ * by two digits (a random number between 1 and 15 if it's an insert, otherwise 00).
*/
- u64_to_string_zf(keyno, key->mem, 11);
+ u64_to_string_zf(keyno, p, 11);
p[10] = '.';
p[11] = suffix[0];
p[12] = suffix[1];
- len = 13;
+ p[13] = '/';
/*
- * In a column-store, the key isn't used, it doesn't need a random length.
+     * Because we're doing table lookup for key sizes, we can't set overflow key sizes in the table;
+     * the table isn't big enough to keep our hash from selecting too many big keys and blowing out
+     * the cache. Handle that here: use a really big key 1 in 2500 times.
*/
- if (g.type == ROW) {
- p[len] = '/';
-
- /*
- * Because we're doing table lookup for key sizes, we weren't able to set really big keys
- * sizes in the table, the table isn't big enough to keep our hash from selecting too many
- * big keys and blowing out the cache. Handle that here, use a really big key 1 in 2500
- * times.
- */
- len = keyno % 2500 == 0 && g.c_key_max < KILOBYTE(80) ?
- KILOBYTE(80) :
- (int)g.key_rand_len[keyno % WT_ELEMENTS(g.key_rand_len)];
- }
-
key->data = key->mem;
- key->size = (size_t)len;
+ key->size = g.prefix_len;
+ key->size += keyno % 2500 == 0 && g.c_key_max < KILOBYTE(80) ?
+ KILOBYTE(80) :
+ g.key_rand_len[keyno % WT_ELEMENTS(g.key_rand_len)];
+ testutil_assert(key->size <= key->memsize);
}
static char *val_base; /* Base/original value */
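As a rough sketch of the bucketed-prefix key layout key_gen_common builds above (the keyno of 12345 and the prefix length of 8 are arbitrary, and the snprintf call stands in for the test's u64_to_string_zf and random key-length table):

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char *bucket = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    uint64_t keyno, n;
    size_t prefix_len;
    char key[64];

    keyno = 12345;
    prefix_len = 8;

    /* Larger record numbers change bucket less often, so prefix runs lengthen as the tree grows. */
    for (n = keyno; n > 0; n >>= 1) {
        if (*bucket == 'z')
            break;
        ++bucket;
    }
    key[0] = *bucket;
    memset(key + 1, 'C', prefix_len - 1);
    snprintf(key + prefix_len, sizeof(key) - prefix_len, "%010" PRIu64 ".00/", keyno);
    printf("%s\n", key); /* prints OCCCCCCC0000012345.00/ */
    return (0);
}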
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index 0e5f8a30422..3fd5706efad 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -189,35 +189,20 @@ tinfo_teardown(void)
}
/*
- * Command used before rollback to stable to save the interesting files so we can replay the command
- * as necessary.
- *
- * Redirect the "cd" command to /dev/null so chatty cd implementations don't add the new working
- * directory to our output.
- */
-#define ROLLBACK_STABLE_COPY_CMD \
- "cd %s > /dev/null && " \
- "rm -rf ROLLBACK.copy && mkdir ROLLBACK.copy && " \
- "cp WiredTiger* wt* ROLLBACK.copy/"
-
-/*
- * tinfo_rollback_to_stable_and_check --
- * Do a rollback to stable, then check that changes are correct from what we know in the worker
- * thread structures.
+ * tinfo_rollback_to_stable --
+ * Do a rollback to stable and verify operations.
*/
static void
-tinfo_rollback_to_stable_and_check(WT_SESSION *session)
+tinfo_rollback_to_stable(WT_SESSION *session)
{
WT_CURSOR *cursor;
- WT_DECL_RET;
- char cmd[512];
- testutil_check(__wt_snprintf(cmd, sizeof(cmd), ROLLBACK_STABLE_COPY_CMD, g.home));
- if ((ret = system(cmd)) != 0)
- testutil_die(ret, "rollback to stable copy (\"%s\") failed", cmd);
- trace_msg("%-10s ts=%" PRIu64, "rts", g.stable_timestamp);
+ /* Rollback-to-stable only makes sense for timestamps and on-disk stores. */
+ if (g.c_txn_timestamps == 0 || g.c_in_memory != 0)
+ return;
- g.wts_conn->rollback_to_stable(g.wts_conn, NULL);
+ trace_msg("%-10s ts=%" PRIu64, "rts", g.stable_timestamp);
+ testutil_check(g.wts_conn->rollback_to_stable(g.wts_conn, NULL));
/* Check the saved snap operations for consistency. */
testutil_check(session->open_cursor(session, g.uri, NULL, NULL, &cursor));
@@ -402,8 +387,13 @@ operations(u_int ops_seconds, bool lastrun)
trace_msg("%s", "=============== thread ops stop");
- if (g.c_txn_rollback_to_stable)
- tinfo_rollback_to_stable_and_check(session);
+ /*
+ * The system should be quiescent at this point, call rollback to stable. Generally, we expect
+ * applications to do rollback-to-stable as part of the database open, but calling it outside of
+ * the open path is expected in the case of applications that are "restarting" but skipping the
+ * close/re-open pair.
+ */
+ tinfo_rollback_to_stable(session);
if (lastrun) {
tinfo_teardown();
@@ -418,20 +408,16 @@ operations(u_int ops_seconds, bool lastrun)
* Begin a timestamped transaction.
*/
static void
-begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
+begin_transaction_ts(TINFO *tinfo)
{
TINFO **tlp;
WT_DECL_RET;
WT_SESSION *session;
uint64_t ts;
- const char *config;
char buf[64];
session = tinfo->session;
- config = "isolation=snapshot";
- *iso_configp = ISOLATION_SNAPSHOT;
-
/*
* Transaction reads are normally repeatable, but WiredTiger timestamps allow rewriting commits,
* that is, applications can specify at commit time the timestamp at which the commit happens.
@@ -444,7 +430,7 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
for (ts = UINT64_MAX, tlp = tinfo_list; *tlp != NULL; ++tlp)
ts = WT_MIN(ts, (*tlp)->commit_ts);
if (ts != 0) {
- wiredtiger_begin_transaction(session, config);
+ wiredtiger_begin_transaction(session, NULL);
/*
* If the timestamp has aged out of the system, we'll get EINVAL when we try and set it.
@@ -463,7 +449,7 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
testutil_check(session->rollback_transaction(session, NULL));
}
- wiredtiger_begin_transaction(session, config);
+ wiredtiger_begin_transaction(session, NULL);
/*
* Otherwise, pick a current timestamp.
@@ -487,40 +473,19 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
/*
* begin_transaction --
- * Choose an isolation configuration and begin a transaction.
+ * Begin a non-timestamp transaction.
*/
static void
-begin_transaction(TINFO *tinfo, u_int *iso_configp)
+begin_transaction(TINFO *tinfo, const char *iso_config)
{
WT_SESSION *session;
- u_int v;
- const char *config;
session = tinfo->session;
- if ((v = g.c_isolation_flag) == ISOLATION_RANDOM)
- v = mmrand(&tinfo->rnd, 1, 3);
- switch (v) {
- case 1:
- v = ISOLATION_READ_UNCOMMITTED;
- config = "isolation=read-uncommitted";
- break;
- case 2:
- v = ISOLATION_READ_COMMITTED;
- config = "isolation=read-committed";
- break;
- case 3:
- default:
- v = ISOLATION_SNAPSHOT;
- config = "isolation=snapshot";
- break;
- }
- *iso_configp = v;
-
- wiredtiger_begin_transaction(session, config);
+ wiredtiger_begin_transaction(session, iso_config);
snap_op_init(tinfo, WT_TS_NONE, false);
- trace_op(tinfo, "begin %s", config);
+ trace_op(tinfo, "begin %s", iso_config);
}
/*
@@ -641,7 +606,7 @@ prepare_transaction(TINFO *tinfo)
#define OP_FAILED(notfound_ok) \
do { \
positioned = false; \
- if (intxn && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK || ret == WT_CACHE_FULL)) \
+ if (intxn && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK)) \
goto rollback; \
testutil_assert( \
(notfound_ok && ret == WT_NOTFOUND) || ret == WT_CACHE_FULL || ret == WT_ROLLBACK); \
@@ -660,16 +625,6 @@ prepare_transaction(TINFO *tinfo)
} while (0)
/*
- * When in a transaction on the live table with snapshot isolation, track operations for later
- * repetition.
- */
-#define SNAP_TRACK(tinfo, op) \
- do { \
- if (intxn && iso_config == ISOLATION_SNAPSHOT) \
- snap_track(tinfo, op); \
- } while (0)
-
-/*
* ops_open_session --
* Create a new session/cursor pair for the thread.
*/
@@ -702,6 +657,21 @@ ops_open_session(TINFO *tinfo)
tinfo->cursor = cursor;
}
+/* Isolation configuration. */
+typedef enum {
+ ISOLATION_READ_COMMITTED,
+ ISOLATION_READ_UNCOMMITTED,
+ ISOLATION_SNAPSHOT
+} iso_level_t;
+
+/*
+ * When in an explicit snapshot isolation transaction, track operations for later repetition.
+ */
+#define SNAP_TRACK(tinfo, op) \
+ do { \
+ if (intxn && iso_level == ISOLATION_SNAPSHOT) \
+ snap_track(tinfo, op); \
+ } while (0)
+
/*
* ops --
* Per-thread operations.
@@ -713,10 +683,12 @@ ops(void *arg)
WT_CURSOR *cursor;
WT_DECL_RET;
WT_SESSION *session;
+ iso_level_t iso_level;
thread_op op;
uint64_t reset_op, session_op, truncate_op;
uint32_t range, rnd;
- u_int i, j, iso_config;
+ u_int i, j;
+ const char *iso_config;
bool greater_than, intxn, next, positioned, prepared;
tinfo = arg;
@@ -733,7 +705,7 @@ ops(void *arg)
else
__wt_random_init(&tinfo->rnd);
- iso_config = ISOLATION_RANDOM; /* -Wconditional-uninitialized */
+ iso_level = ISOLATION_SNAPSHOT; /* -Wconditional-uninitialized */
/* Set the first operation where we'll create sessions and cursors. */
cursor = NULL;
@@ -769,9 +741,9 @@ ops(void *arg)
}
/*
- * If not in a transaction, reset the session now and then, just to make sure that operation
- * gets tested. The test is not for equality, we have to do the reset outside of a
- * transaction so we aren't likely to get an exact match.
+ * If not in a transaction, reset the session periodically to make sure that operation is
+         * tested. The test is not for equality; resets must be done outside of transactions, so we
+ * aren't likely to get an exact match.
*/
if (!intxn && tinfo->ops > reset_op) {
testutil_check(session->reset(session));
@@ -781,42 +753,66 @@ ops(void *arg)
}
/*
- * If not in a transaction, have a live handle and running in a timestamp world,
- * occasionally repeat a timestamped operation.
+ * If not in a transaction and in a timestamp world, occasionally repeat a timestamped
+ * operation.
*/
if (!intxn && g.c_txn_timestamps && mmrand(&tinfo->rnd, 1, 15) == 1) {
++tinfo->search;
snap_repeat_single(cursor, tinfo);
}
+ /* If not in a transaction and in a timestamp world, start a transaction. */
+ if (!intxn && g.c_txn_timestamps) {
+ iso_level = ISOLATION_SNAPSHOT;
+ begin_transaction_ts(tinfo);
+ intxn = true;
+ }
+
/*
- * If not in a transaction and have a live handle, choose an isolation level and start a
- * transaction some percentage of the time.
+ * If not in a transaction and not in a timestamp world, start a transaction some percentage
+ * of the time.
*/
- if (!intxn && (g.c_txn_timestamps || mmrand(&tinfo->rnd, 1, 100) <= g.c_txn_freq)) {
- if (g.c_txn_timestamps)
- begin_transaction_ts(tinfo, &iso_config);
- else
- begin_transaction(tinfo, &iso_config);
+ if (!intxn && mmrand(&tinfo->rnd, 1, 100) < g.c_txn_implicit) {
+ iso_level = ISOLATION_SNAPSHOT;
+ iso_config = "isolation=snapshot";
+
+ /* Occasionally do reads at an isolation level lower than snapshot. */
+ switch (mmrand(NULL, 1, 20)) {
+ case 1:
+ iso_level = ISOLATION_READ_COMMITTED; /* 5% */
+ iso_config = "isolation=read-committed";
+ break;
+ case 2:
+ iso_level = ISOLATION_READ_UNCOMMITTED; /* 5% */
+ iso_config = "isolation=read-uncommitted";
+ break;
+ }
+
+ begin_transaction(tinfo, iso_config);
intxn = true;
}
- /* Select an operation. */
+ /*
+         * Select an operation: all updates must be in snapshot isolation, and modify must be in an
+         * explicit transaction.
+ */
op = READ;
- i = mmrand(&tinfo->rnd, 1, 100);
- if (i < g.c_delete_pct && tinfo->ops > truncate_op) {
- op = TRUNCATE;
-
- /* Pick the next truncate operation. */
- truncate_op += mmrand(&tinfo->rnd, 20000, 100000);
- } else if (i < g.c_delete_pct)
- op = REMOVE;
- else if (i < g.c_delete_pct + g.c_insert_pct)
- op = INSERT;
- else if (i < g.c_delete_pct + g.c_insert_pct + g.c_modify_pct)
- op = MODIFY;
- else if (i < g.c_delete_pct + g.c_insert_pct + g.c_modify_pct + g.c_write_pct)
- op = UPDATE;
+ if (iso_level == ISOLATION_SNAPSHOT) {
+ i = mmrand(&tinfo->rnd, 1, 100);
+ if (i < g.c_delete_pct && tinfo->ops > truncate_op) {
+ op = TRUNCATE;
+
+ /* Pick the next truncate operation. */
+ truncate_op += mmrand(&tinfo->rnd, 20000, 100000);
+ } else if (i < g.c_delete_pct)
+ op = REMOVE;
+ else if (i < g.c_delete_pct + g.c_insert_pct)
+ op = INSERT;
+ else if (intxn && i < g.c_delete_pct + g.c_insert_pct + g.c_modify_pct)
+ op = MODIFY;
+ else if (i < g.c_delete_pct + g.c_insert_pct + g.c_modify_pct + g.c_write_pct)
+ op = UPDATE;
+ }
/* Select a row. */
tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)g.rows);
@@ -838,10 +834,10 @@ ops(void *arg)
}
/*
- * Optionally reserve a row. Reserving a row before a read isn't all that sensible, but not
- * unexpected, either.
+         * Optionally reserve a row; it's an update, so it requires snapshot isolation. Reserving a
+ * row before a read isn't all that sensible, but not unexpected, either.
*/
- if (intxn && mmrand(&tinfo->rnd, 0, 20) == 1) {
+ if (intxn && iso_level == ISOLATION_SNAPSHOT && mmrand(&tinfo->rnd, 0, 20) == 1) {
switch (g.type) {
case ROW:
ret = row_reserve(tinfo, cursor, positioned);
@@ -853,8 +849,7 @@ ops(void *arg)
}
if (ret == 0) {
positioned = true;
-
- __wt_yield(); /* Let other threads proceed. */
+ __wt_yield(); /* Encourage races */
} else
WRITE_OP_FAILED(true);
}
@@ -888,13 +883,6 @@ ops(void *arg)
WRITE_OP_FAILED(false);
break;
case MODIFY:
- /*
- * Change modify into update if not part of a snapshot isolation transaction, modify
- * isn't supported in those cases.
- */
- if (!intxn || iso_config != ISOLATION_SNAPSHOT)
- goto update_instead_of_chosen_op;
-
++tinfo->update;
switch (g.type) {
case ROW:
@@ -1050,17 +1038,17 @@ update_instead_of_chosen_op:
testutil_check(cursor->reset(cursor));
/*
- * Continue if not in a transaction, else add more operations to the transaction half the
- * time.
+ * No post-operation work is needed outside of a transaction. If in a transaction, add more
+ * operations to the transaction half the time.
*/
if (!intxn || (rnd = mmrand(&tinfo->rnd, 1, 10)) > 5)
continue;
/*
- * Ending a transaction. If on a live handle and the transaction was configured for snapshot
- * isolation, repeat the operations and confirm the results are unchanged.
+ * Ending a transaction. If the transaction was configured for snapshot isolation, repeat
+ * the operations and confirm the results are unchanged.
*/
- if (intxn && iso_config == ISOLATION_SNAPSHOT) {
+ if (intxn && iso_level == ISOLATION_SNAPSHOT) {
__wt_yield(); /* Encourage races */
ret = snap_repeat_txn(cursor, tinfo);
@@ -1069,13 +1057,10 @@ update_instead_of_chosen_op:
goto rollback;
}
- /*
- * If prepare configured, prepare the transaction 10% of the time.
- */
+ /* If prepare configured, prepare the transaction 10% of the time. */
prepared = false;
if (g.c_prepare && mmrand(&tinfo->rnd, 1, 10) == 1) {
- ret = prepare_transaction(tinfo);
- if (ret != 0)
+ if ((ret = prepare_transaction(tinfo)) != 0)
WRITE_OP_FAILED(false);
__wt_yield(); /* Encourage races */
@@ -1083,7 +1068,8 @@ update_instead_of_chosen_op:
}
/*
- * If we're in a transaction, commit 40% of the time and rollback 10% of the time.
+         * If we're in a transaction, commit 40% of the time and roll back 10% of the time (we
+         * continue adding operations to the transaction the remaining 50% of the time).
*/
switch (rnd) {
case 1:
@@ -1351,8 +1337,8 @@ order_error_col:
* to the row's key.) Keys are strings with terminating '/' values, so absent key
* corruption, we can simply do the underlying string conversion on the key string.
*/
- keyno_prev = strtoul(tinfo->key->data, NULL, 10);
- keyno = strtoul(key.data, NULL, 10);
+ keyno_prev = strtoul((char *)tinfo->key->data + g.prefix_len, NULL, 10);
+ keyno = strtoul((char *)key.data + g.prefix_len, NULL, 10);
if (incrementing) {
if (keyno_prev != keyno && keyno_prev + 1 != keyno)
goto order_error_row;
diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c
index bc2b58b2f3c..1c934a5d187 100644
--- a/src/third_party/wiredtiger/test/format/snap.c
+++ b/src/third_party/wiredtiger/test/format/snap.c
@@ -29,11 +29,6 @@
#include "format.h"
/*
- * Issue a warning when there enough consecutive unsuccessful checks for rollback to stable.
- */
-#define WARN_RTS_NO_CHECK 5
-
-/*
* snap_init --
* Initialize the repeatable operation tracking.
*/
@@ -41,14 +36,16 @@ void
snap_init(TINFO *tinfo)
{
/*
- * We maintain two snap lists. The current one is indicated by tinfo->s, and keeps the most
- * recent operations. The other one is used when we are running with rollback_to_stable. When
- * each thread notices that the stable timestamp has changed, it stashes the current snap list
- * and starts fresh with the other snap list. After we've completed a rollback_to_stable, we can
- * the secondary snap list to see the state of keys/values seen and updated at the time of the
- * rollback.
+     * We maintain two snap lists: the current one is indicated by tinfo->s and keeps the most
+     * recent operations.
+     *
+     * The other one is used when we are running timestamp transactions with rollback_to_stable.
+     * When each thread notices that the stable timestamp has changed, it stashes the current snap
+     * list and starts fresh with the other snap list. After we've completed a rollback_to_stable,
+     * we can use the secondary snap list to see the state of keys/values seen and updated at the
+     * time of the rollback.
*/
- if (g.c_txn_rollback_to_stable) {
+ if (g.c_txn_timestamps) {
tinfo->s = &tinfo->snap_states[1];
tinfo->snap_list = dcalloc(SNAP_LIST_SIZE, sizeof(SNAP_OPS));
tinfo->snap_end = &tinfo->snap_list[SNAP_LIST_SIZE];
@@ -113,7 +110,7 @@ snap_op_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads)
++tinfo->opid;
- if (g.c_txn_rollback_to_stable) {
+ if (g.c_txn_timestamps) {
/*
* If the stable timestamp has changed and we've advanced beyond it, preserve the current
         * snapshot history up to this point; we'll use it to verify rollback_to_stable. Switch our
@@ -528,40 +525,45 @@ snap_repeat_update(TINFO *tinfo, bool committed)
* Repeat one operation.
*/
static void
-snap_repeat(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap, bool rollback_allowed)
+snap_repeat(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
{
WT_DECL_RET;
WT_SESSION *session;
+#define MAX_RETRY_ON_ROLLBACK 1000
+ u_int max_retry;
char buf[64];
session = cursor->session;
- /*
- * Start a new transaction. Set the read timestamp. Verify the record. Discard the transaction.
- */
- wiredtiger_begin_transaction(session, "isolation=snapshot");
+ trace_op(tinfo, "repeat %" PRIu64 " ts=%" PRIu64 " {%s}", snap->keyno, snap->ts,
+ trace_bytes(tinfo, snap->vdata, snap->vsize));
- /*
- * If the timestamp has aged out of the system, we'll get EINVAL when we try and set it.
- */
+ /* Start a transaction with a read-timestamp and verify the record. */
testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, snap->ts));
- ret = session->timestamp_transaction(session, buf);
- if (ret == 0) {
- trace_op(tinfo, "repeat %" PRIu64 " ts=%" PRIu64 " {%s}", snap->keyno, snap->ts,
- trace_bytes(tinfo, snap->vdata, snap->vsize));
-
- /* The only expected error is rollback. */
- ret = snap_verify(cursor, tinfo, snap);
+ for (max_retry = 0; max_retry < MAX_RETRY_ON_ROLLBACK; ++max_retry, __wt_yield()) {
+ wiredtiger_begin_transaction(session, "isolation=snapshot");
- if (ret != 0 && (!rollback_allowed || (ret != WT_ROLLBACK && ret != WT_CACHE_FULL)))
- testutil_check(ret);
- } else if (ret == EINVAL)
- snap_ts_clear(tinfo, snap->ts);
- else
+ /* EINVAL means the timestamp has aged out of the system. */
+ if ((ret = session->timestamp_transaction(session, buf)) == EINVAL) {
+ snap_ts_clear(tinfo, snap->ts);
+ break;
+ }
testutil_check(ret);
- /* Discard the transaction. */
+ /*
+     * The only expected error is rollback (as a read-only transaction, cache-full shouldn't
+     * matter to us). Persist after rollback: as a repeatable read we should eventually succeed,
+     * so yield to let eviction catch up and retry.
+ */
+ if ((ret = snap_verify(cursor, tinfo, snap)) == 0)
+ break;
+ testutil_assert(ret == WT_ROLLBACK);
+
+ testutil_check(session->rollback_transaction(session, NULL));
+ }
+ testutil_assert(max_retry < MAX_RETRY_ON_ROLLBACK);
+
testutil_check(session->rollback_transaction(session, NULL));
}
@@ -593,7 +595,7 @@ snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo)
if (count == 0)
return;
- snap_repeat(cursor, tinfo, snap, true);
+ snap_repeat(cursor, tinfo, snap);
}
/*
@@ -626,9 +628,8 @@ snap_repeat_rollback(WT_CURSOR *cursor, TINFO **tinfo_array, size_t tinfo_count)
for (statenum = 0; statenum < WT_ELEMENTS(tinfo->snap_states); statenum++) {
state = &tinfo->snap_states[statenum];
for (snap = state->snap_state_list; snap < state->snap_state_end; ++snap) {
- if (snap->repeatable && snap->ts <= g.stable_timestamp &&
- snap->ts >= g.oldest_timestamp) {
- snap_repeat(cursor, tinfo, snap, false);
+ if (snap->repeatable && snap->ts <= g.stable_timestamp) {
+ snap_repeat(cursor, tinfo, snap);
++count;
if (count % 100 == 0) {
testutil_check(__wt_snprintf(
@@ -646,6 +647,7 @@ snap_repeat_rollback(WT_CURSOR *cursor, TINFO **tinfo_array, size_t tinfo_count)
__wt_snprintf(buf, sizeof(buf), "rollback_to_stable: %" PRIu32 " ops repeated", count));
track(buf, 0ULL, NULL);
if (count == 0) {
+#define WARN_RTS_NO_CHECK 5
if (++g.rts_no_check >= WARN_RTS_NO_CHECK)
fprintf(stderr,
"Warning: %" PRIu32 " consecutive runs with no rollback_to_stable checking\n", count);
diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c
index f35b2a8416c..8c5efd007ee 100644
--- a/src/third_party/wiredtiger/test/format/util.c
+++ b/src/third_party/wiredtiger/test/format/util.c
@@ -282,7 +282,7 @@ timestamp_once(bool allow_lag, bool final)
/*
* If a lag is permitted, move the oldest timestamp half the way to the current
- * "all_durable" timestamp. Move the stable timestamp to "all_durable".
+ * "all_durable" timestamp. Move the stable timestamp to "all_durable".
*/
if (allow_lag)
g.oldest_timestamp = (all_durable + g.oldest_timestamp) / 2;
diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c
index 3b37b3a43d1..f95d7903c94 100644
--- a/src/third_party/wiredtiger/test/format/wts.c
+++ b/src/third_party/wiredtiger/test/format/wts.c
@@ -319,10 +319,8 @@ create_object(WT_CONNECTION *conn)
CONFIG_APPEND(p, ",value_format=%" PRIu32 "t", g.c_bitcnt);
break;
case ROW:
- if (g.c_prefix_compression)
- CONFIG_APPEND(p, ",prefix_compression_min=%" PRIu32, g.c_prefix_compression_min);
- else
- CONFIG_APPEND(p, ",prefix_compression=false");
+ CONFIG_APPEND(p, ",prefix_compression=%s,prefix_compression_min=%" PRIu32,
+ g.c_prefix_compression == 0 ? "false" : "true", g.c_prefix_compression_min);
if (g.c_reverse)
CONFIG_APPEND(p, ",collator=reverse");
/* FALLTHROUGH */
@@ -351,22 +349,16 @@ create_object(WT_CONNECTION *conn)
if (g.c_compression_flag != COMPRESS_NONE)
CONFIG_APPEND(p, ",block_compressor=\"%s\"", compressor(g.c_compression_flag));
- /* Configure Btree internal key truncation. */
+ /* Configure Btree. */
CONFIG_APPEND(p, ",internal_key_truncate=%s", g.c_internal_key_truncation ? "true" : "false");
-
- /* Configure Btree page key gap. */
- CONFIG_APPEND(p, ",key_gap=%" PRIu32, g.c_key_gap);
-
- /* Configure Btree split page percentage. */
CONFIG_APPEND(p, ",split_pct=%" PRIu32, g.c_split_pct);
- /*
- * Assertions. Assertions slow down the code for additional diagnostic checking.
- */
- if (g.c_txn_timestamps && g.c_assert_commit_timestamp)
- CONFIG_APPEND(p, ",write_timestamp_usage=key_consistent,assert=(write_timestamp=on)");
- if (g.c_txn_timestamps && g.c_assert_read_timestamp)
- CONFIG_APPEND(p, ",assert=(read_timestamp=always)");
+ /* Assertions: assertions slow down the code for additional diagnostic checking. */
+ if (g.c_assert_read_timestamp)
+ CONFIG_APPEND(p, ",assert=(read_timestamp=%s)", g.c_txn_timestamps ? "always" : "never");
+ if (g.c_assert_write_timestamp)
+ CONFIG_APPEND(p, ",assert=(write_timestamp=on),write_timestamp_usage=%s",
+ g.c_txn_timestamps ? "always" : "never");
/* Configure LSM. */
if (DATASOURCE("lsm")) {
diff --git a/src/third_party/wiredtiger/test/suite/test_backup22.py b/src/third_party/wiredtiger/test/suite/test_backup22.py
new file mode 100644
index 00000000000..06d1a81ef7c
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_backup22.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest, os
+from wtscenario import make_scenarios
+from wtbackup import backup_base
+
+# test_backup22.py
+# Test interaction between import and incremental backup.
+# Test the functionality of importing dropped tables in incremental backup.
+#
+class test_backup22(backup_base):
+ create_config = 'allocation_size=512,key_format=i,value_format=i'
+ # Backup directory name
+ dir='backup.dir'
+ incr_dir = 'incr_backup.dir'
+ uri = 'test_backup22'
+ scenarios = make_scenarios([
+ ('import_with_metadata', dict(repair=False,checkpoint=False)),
+ ('import_repair', dict(repair=True,checkpoint=False)),
+ ('import_with_metadata_ckpt', dict(repair=False,checkpoint=True)),
+ ('import_repair_ckpt', dict(repair=True,checkpoint=True)),
+ ])
+
+ def test_import_with_open_backup_cursor(self):
+ os.mkdir(self.dir)
+ os.mkdir(self.incr_dir)
+
+ # Create and populate the table.
+ table_uri = 'table:' + self.uri
+ self.session.create(table_uri, self.create_config)
+ cursor = self.session.open_cursor(table_uri)
+ for i in range(1, 1000):
+ cursor[i] = i
+ cursor.close()
+ self.session.checkpoint()
+
+ # Export the metadata for the file.
+ file_uri = 'file:' + self.uri + '.wt'
+ c = self.session.open_cursor('metadata:', None, None)
+ original_db_table_config = c[table_uri]
+ original_db_file_config = c[file_uri]
+ c.close()
+
+ config = 'incremental=(enabled,granularity=4k,this_id="ID1")'
+ bkup_c = self.session.open_cursor('backup:', None, config)
+ self.take_full_backup(self.dir, bkup_c)
+ bkup_c.close()
+ self.session.drop(table_uri, 'remove_files=false')
+
+ # First construct the config string for the default or repair import scenario,
+ # then call create to import the table.
+ if self.repair:
+ import_config = 'import=(enabled,repair=true)'
+ else:
+ import_config = '{},import=(enabled,repair=false,file_metadata=({}))'.format(
+ original_db_table_config, original_db_file_config)
+ self.session.create(table_uri, import_config)
+
+ if self.checkpoint:
+ self.session.checkpoint()
+        # Perform incremental backup with id 2 on an empty directory. We want an empty directory
+        # because we expect all files to be copied over in their entirety.
+ self.take_incr_backup(self.incr_dir, 2)
+ self.compare_backups(self.uri, self.dir, self.incr_dir)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare15.py b/src/third_party/wiredtiger/test/suite/test_prepare15.py
new file mode 100644
index 00000000000..4c4ba49a182
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_prepare15.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wttest
+from wiredtiger import WT_NOTFOUND
+from wtscenario import make_scenarios
+
+def timestamp_str(t):
+ return '%x' % t
+
+# test_prepare15.py
+# Test that rolling back a prepared transaction removes the on-disk key or
+# replaces it with the history store update, and that committing retains the
+# changes, when both insert and remove operations are from the same transaction.
+class test_prepare15(wttest.WiredTigerTestCase):
+ in_memory_values = [
+ ('no_inmem', dict(in_memory=False)),
+ ('inmem', dict(in_memory=True))
+ ]
+
+ key_format_values = [
+ ('column', dict(key_format='r')),
+ ('integer_row', dict(key_format='i')),
+ ]
+
+ txn_end_values = [
+ ('commit', dict(commit=True)),
+ ('rollback', dict(commit=False)),
+ ]
+
+ scenarios = make_scenarios(in_memory_values, key_format_values, txn_end_values)
+
+ def conn_config(self):
+ config = 'cache_size=50MB'
+ if self.in_memory:
+ config += ',in_memory=true'
+ else:
+ config += ',in_memory=false'
+ return config
+
+ def test_prepare_restore_hs_update(self):
+        # Prepared transactions for column-store tables are not yet supported.
+ if self.key_format == 'r':
+ self.skipTest('Prepare transactions for column store table is not yet supported')
+
+ # Create a table without logging.
+ uri = "table:prepare15"
+ create_config = 'allocation_size=512,key_format=S,value_format=S'
+ self.session.create(uri, create_config)
+
+ # Pin oldest and stable timestamps to 10.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) +
+ ',stable_timestamp=' + timestamp_str(10))
+
+ valuea = 'a'
+        valueb = 'b'
+
+ # Perform an update and remove.
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ cursor[str(0)] = valuea
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(20))
+
+ self.session.begin_transaction()
+ cursor.set_key(str(0))
+ cursor.remove()
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(30))
+ cursor.close()
+
+ # Perform an update and remove.
+ s = self.conn.open_session()
+ cursor = s.open_cursor(uri)
+ s.begin_transaction()
+ cursor[str(0)] = valueb
+ cursor.set_key(str(0))
+ cursor.remove()
+ cursor.close()
+ s.prepare_transaction('prepare_timestamp=' + timestamp_str(40))
+
+        # Configure debug behavior on a cursor to evict the page it is positioned on when the reset API is used.
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+
+ # Search for the key so we position our cursor on the page that we want to evict.
+ self.session.begin_transaction('ignore_prepare = true')
+ evict_cursor.set_key(str(0))
+ self.assertEquals(evict_cursor.search(), WT_NOTFOUND)
+ evict_cursor.reset()
+ evict_cursor.close()
+ self.session.commit_transaction()
+
+ if self.commit:
+ # Commit the prepared transaction
+ s.timestamp_transaction('commit_timestamp=' + timestamp_str(50))
+ s.timestamp_transaction('durable_timestamp=' + timestamp_str(60))
+ s.commit_transaction()
+ else:
+ # Rollback the prepared transaction
+ s.rollback_transaction()
+
+        # Configure debug behavior on a cursor to evict the page it is positioned on when the reset API is used.
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+
+ # Search for the key so we position our cursor on the page that we want to evict.
+ self.session.begin_transaction()
+ evict_cursor.set_key(str(0))
+ self.assertEquals(evict_cursor.search(), WT_NOTFOUND)
+ evict_cursor.reset()
+ evict_cursor.close()
+ self.session.commit_transaction()
+
+ self.session.begin_transaction('read_timestamp=' + timestamp_str(20))
+ cursor2 = self.session.open_cursor(uri)
+ cursor2.set_key(str(0))
+ self.assertEquals(cursor2.search(), 0)
+ self.assertEqual(cursor2.get_value(), valuea)
+ self.session.commit_transaction()
+
+ def test_prepare_not_found(self):
+        # Prepared transactions for column-store tables are not yet supported.
+ if self.key_format == 'r':
+ self.skipTest('Prepare transactions for column store table is not yet supported')
+
+ # Create a table without logging.
+ uri = "table:prepare15"
+ create_config = 'allocation_size=512,key_format=S,value_format=S'
+ self.session.create(uri, create_config)
+
+ # Pin oldest and stable timestamps to 10.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) +
+ ',stable_timestamp=' + timestamp_str(10))
+
+ value = 'a'
+
+ # Perform an update and remove.
+ s = self.conn.open_session()
+ cursor = s.open_cursor(uri)
+ s.begin_transaction()
+ cursor[str(0)] = value
+ cursor.set_key(str(0))
+ cursor.remove()
+ cursor.close()
+ s.prepare_transaction('prepare_timestamp=' + timestamp_str(20))
+
+        # Configure debug behavior on a cursor to evict the page it is positioned on when the reset API is used.
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+
+ # Search for the key so we position our cursor on the page that we want to evict.
+ self.session.begin_transaction("ignore_prepare = true")
+ evict_cursor.set_key(str(0))
+ self.assertEquals(evict_cursor.search(), WT_NOTFOUND)
+ evict_cursor.reset()
+ evict_cursor.close()
+ self.session.commit_transaction()
+
+ if self.commit:
+ # Commit the prepared transaction
+ s.timestamp_transaction('commit_timestamp=' + timestamp_str(30))
+ s.timestamp_transaction('durable_timestamp=' + timestamp_str(40))
+ s.commit_transaction()
+ else:
+ # Rollback the prepared transaction
+ s.rollback_transaction()
+
+        # Configure debug behavior on a cursor to evict the page it is positioned on when the reset API is used.
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+
+ # Search for the key so we position our cursor on the page that we want to evict.
+ self.session.begin_transaction()
+ evict_cursor.set_key(str(0))
+ self.assertEquals(evict_cursor.search(), WT_NOTFOUND)
+ evict_cursor.reset()
+ evict_cursor.close()
+ self.session.commit_transaction()
+
+ self.session.begin_transaction()
+ cursor2 = self.session.open_cursor(uri)
+ cursor2.set_key(str(0))
+ self.assertEquals(cursor2.search(), WT_NOTFOUND)
+ self.session.commit_transaction()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered04.py b/src/third_party/wiredtiger/test/suite/test_tiered04.py
index 0347647031f..75d4fac1b19 100755
--- a/src/third_party/wiredtiger/test/suite/test_tiered04.py
+++ b/src/third_party/wiredtiger/test/suite/test_tiered04.py
@@ -35,7 +35,7 @@ StorageSource = wiredtiger.StorageSource # easy access to constants
class test_tiered04(wttest.WiredTigerTestCase):
# If the 'uri' changes all the other names must change with it.
- fileuri = 'file:test_tiered04-0000000001.wt'
+ fileuri = 'file:test_tiered04-0000000001.wtobj'
objuri = 'object:test_tiered04-0000000001.wtobj'
tiereduri = "tiered:test_tiered04"
uri = "table:test_tiered04"
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered06.py b/src/third_party/wiredtiger/test/suite/test_tiered06.py
index e0614cd8c1b..c797936a82b 100755
--- a/src/third_party/wiredtiger/test/suite/test_tiered06.py
+++ b/src/third_party/wiredtiger/test/suite/test_tiered06.py
@@ -64,7 +64,7 @@ class test_tiered06(wttest.WiredTigerTestCase):
local = self.get_local_storage_source()
os.mkdir("objects")
- fs = local.ss_customize_file_system(session, "./objects", "cluster1-", "Secret", None)
+ fs = local.ss_customize_file_system(session, "./objects", "Secret", None)
# The object doesn't exist yet.
self.assertFalse(fs.fs_exist(session, 'foobar'))
@@ -95,33 +95,29 @@ class test_tiered06(wttest.WiredTigerTestCase):
fh.fh_lock(session, False)
fh.close(session)
- self.assertEquals(fs.fs_directory_list(session, '', ''), ['foobar'])
+ # Nothing is in the directory list until a flush.
+ self.assertEquals(fs.fs_directory_list(session, '', ''), [])
- # Newly created objects are in the list.
fh = fs.fs_open_file(session, 'zzz', FileSystem.open_file_type_data, FileSystem.open_create)
- # TODO: tiered: the newly created file should be visible, but it is not yet.
- # self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), ['foobar', 'zzz' ])
-
# Sync merely syncs to the local disk.
fh.fh_sync(session)
fh.close(session) # zero length
- self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), ['foobar', 'zzz' ])
+ self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), [])
# See that we can rename objects.
fs.fs_rename(session, 'zzz', 'yyy', 0)
- self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), ['foobar', 'yyy' ])
+ self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), [])
# See that we can remove objects.
fs.fs_remove(session, 'yyy', 0)
- self.assertEquals(fs.fs_directory_list(session, '', ''), ['foobar'])
- # TODO: tiered: flush tests disabled, as the interface
- # for flushing will be changed.
- return
+ # Nothing is in the directory list until a flush.
+ self.assertEquals(fs.fs_directory_list(session, '', ''), [])
- # Flushing doesn't do anything that's visible.
- local.ss_flush(session, fs, None, '')
+ # Flushing the file (flush, then flush_finish) makes it visible in the directory list.
+ local.ss_flush(session, fs, 'foobar', 'foobar', None)
+ local.ss_flush_finish(session, fs, 'foobar', 'foobar', None)
self.assertEquals(fs.fs_directory_list(session, '', ''), ['foobar'])
# Files that have been flushed cannot be manipulated.
@@ -145,7 +141,7 @@ class test_tiered06(wttest.WiredTigerTestCase):
local = self.get_local_storage_source()
os.mkdir("objects")
- fs = local.ss_customize_file_system(session, "./objects", "cluster1-", "Secret", None)
+ fs = local.ss_customize_file_system(session, "./objects", "Secret", None)
# We call these 4K chunks of data "blocks" for this test, but that doesn't
# necessarily relate to WT block sizing.
@@ -208,11 +204,23 @@ class test_tiered06(wttest.WiredTigerTestCase):
cachedir1 = "./cache1"
cachedir2 = "./cache2"
- def check(self, fs, prefix, expect):
+ # Add a suffix to each entry in a list.
+ def suffix(self, lst, sfx):
+ return [x + '.' + sfx for x in lst]
+
+ def check_dirlist(self, fs, prefix, expect):
# We don't require any sorted output for directory lists,
# so we'll sort before comparing.
got = sorted(fs.fs_directory_list(self.session, '', prefix))
- expect = sorted(expect)
+ expect = sorted(self.suffix(expect, 'wtobj'))
+ self.assertEquals(got, expect)
+
+ # Check for data files in the WiredTiger home directory.
+ def check_home(self, expect):
+ # Get the list of all .wt files in home, pruning out the WiredTiger-produced ones.
+ got = sorted(list(os.listdir(self.home)))
+ got = [x for x in got if not x.startswith('WiredTiger') and x.endswith('.wt')]
+ expect = sorted(self.suffix(expect, 'wt'))
self.assertEquals(got, expect)
# Check that objects are "in the cloud" after a flush.
@@ -220,12 +228,25 @@ class test_tiered06(wttest.WiredTigerTestCase):
# objectdir1 or objectdir2
def check_objects(self, expect1, expect2):
got = sorted(list(os.listdir(self.objectdir1)))
- expect = sorted(expect1)
+ expect = sorted(self.suffix(expect1, 'wtobj'))
self.assertEquals(got, expect)
got = sorted(list(os.listdir(self.objectdir2)))
- expect = sorted(expect2)
+ expect = sorted(self.suffix(expect2, 'wtobj'))
self.assertEquals(got, expect)
+ # Check that objects are in the cache directory after flush_finish.
+ def check_caches(self, expect1, expect2):
+ got = sorted(list(os.listdir(self.cachedir1)))
+ expect = sorted(self.suffix(expect1, 'wtobj'))
+ self.assertEquals(got, expect)
+ got = sorted(list(os.listdir(self.cachedir2)))
+ expect = sorted(self.suffix(expect2, 'wtobj'))
+ self.assertEquals(got, expect)
+
+ def create_wt_file(self, name):
+ with open(name + '.wt', 'w') as f:
+ f.write('hello')
+
def test_local_file_systems(self):
# Test using various buckets, hosts
@@ -244,11 +265,11 @@ class test_tiered06(wttest.WiredTigerTestCase):
errmsg = '/No such file or directory/'
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
lambda: local.ss_customize_file_system(
- session, "./objects1", "pre1-", "k1", bad_config), errmsg)
+ session, "./objects1", "k1", bad_config), errmsg)
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
lambda: local.ss_customize_file_system(
- session, "./objects_BAD", "pre1-", "k1", config1), errmsg)
+ session, "./objects_BAD", "k1", config1), errmsg)
# Create an empty file, try to use it as a directory.
with open("some_file", "w"):
@@ -256,143 +277,75 @@ class test_tiered06(wttest.WiredTigerTestCase):
errmsg = '/Invalid argument/'
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
lambda: local.ss_customize_file_system(
- session, "some_file", "pre1-", "k1", config1), errmsg)
+ session, "some_file", "k1", config1), errmsg)
# Now create some file systems that should succeed.
# Use either different bucket directories or different prefixes,
# so activity that happens in the various file systems should be independent.
- fs1 = local.ss_customize_file_system(session, "./objects1", "pre1-", "k1", config1)
- fs2 = local.ss_customize_file_system(session, "./objects2", "pre1-", "k2", config2)
- fs3 = local.ss_customize_file_system(session, "./objects1", "pre2-", "k3", config1)
- fs4 = local.ss_customize_file_system(session, "./objects2", "pre2-", "k4", config2)
-
- # Create files in the file systems with some name overlap
- self.create_with_fs(fs1, 'alpaca')
- self.create_with_fs(fs2, 'bear')
- self.create_with_fs(fs3, 'crab')
- self.create_with_fs(fs4, 'deer')
+ fs1 = local.ss_customize_file_system(session, "./objects1", "k1", config1)
+ fs2 = local.ss_customize_file_system(session, "./objects2", "k2", config2)
+
+ # Create files in the WT home directory.
for a in ['beagle', 'bird', 'bison', 'bat']:
- self.create_with_fs(fs1, a)
- for a in ['bird', 'bison', 'bat', 'badger']:
- self.create_with_fs(fs2, a)
- for a in ['bison', 'bat', 'badger', 'baboon']:
- self.create_with_fs(fs3, a)
- for a in ['bat', 'badger', 'baboon', 'beagle']:
- self.create_with_fs(fs4, a)
-
- # Make sure we see the expected file names
- self.check(fs1, '', ['alpaca', 'beagle', 'bird', 'bison', 'bat'])
- self.check(fs1, 'a', ['alpaca'])
- self.check(fs1, 'b', ['beagle', 'bird', 'bison', 'bat'])
- self.check(fs1, 'c', [])
- self.check(fs1, 'd', [])
-
- self.check(fs2, '', ['bear', 'bird', 'bison', 'bat', 'badger'])
- self.check(fs2, 'a', [])
- self.check(fs2, 'b', ['bear', 'bird', 'bison', 'bat', 'badger'])
- self.check(fs2, 'c', [])
- self.check(fs2, 'd', [])
-
- self.check(fs3, '', ['crab', 'bison', 'bat', 'badger', 'baboon'])
- self.check(fs3, 'a', [])
- self.check(fs3, 'b', ['bison', 'bat', 'badger', 'baboon'])
- self.check(fs3, 'c', ['crab'])
- self.check(fs3, 'd', [])
-
- self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle'])
- self.check(fs4, 'a', [])
- self.check(fs4, 'b', ['bat', 'badger', 'baboon', 'beagle'])
- self.check(fs4, 'c', [])
- self.check(fs4, 'd', ['deer'])
-
- # Flushing copies files to one of the subdirectories:
- # "./objects1" (for fs1 and fs3)
- # "./objects2" (for fs2 and fs4)
- #
- # After every flush, we'll check that the right objects appear in the right directory.
- # check_objects takes two lists: objects expected to be in ./objects1,
- # and objects expected to be in ./objects2 .
+ self.create_wt_file(a)
+ for a in ['cat', 'cougar', 'coyote', 'cub']:
+ self.create_wt_file(a)
+
+ # Everything is in the WT home directory; nothing is in the file systems yet.
+ self.check_home(['beagle', 'bird', 'bison', 'bat', 'cat', 'cougar', 'coyote', 'cub'])
+ self.check_dirlist(fs1, '', [])
+ self.check_dirlist(fs2, '', [])
+ self.check_caches([], [])
self.check_objects([], [])
- # TODO: tiered: flush tests disabled, as the interface
- # for flushing will be changed.
- enable_fs_flush_tests = False
- if enable_fs_flush_tests:
- local.ss_flush(session, fs4, None, '')
- self.check_objects([], ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- local.ss_flush(session, fs3, 'badger', '')
- self.check_objects(['pre2-badger'],
- ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- #local.ss_flush(session, fs3, 'c', '') # make sure we don't flush prefixes
- self.check_objects(['pre2-badger'],
- ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- local.ss_flush(session, fs3, 'b', '') # or suffixes
- self.check_objects(['pre2-badger'],
- ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- local.ss_flush(session, fs3, 'crab', '')
- self.check_objects(['pre2-crab', 'pre2-badger'],
- ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- local.ss_flush(session, fs3, 'crab', '') # should do nothing
- self.check_objects(['pre2-crab', 'pre2-badger'],
- ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- local.ss_flush(session, None, None, '') # flush everything else
- self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat',
- 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'],
- ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger',
- 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- local.ss_flush(session, None, None, '') # should do nothing
- self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat',
- 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'],
- ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger',
- 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- self.create_with_fs(fs4, 'zebra') # should do nothing in the objects directories
- self.create_with_fs(fs4, 'yeti') # should do nothing in the objects directories
- self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat',
- 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'],
- ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger',
- 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle'])
-
- # Try remove and rename, should be possible until we flush
- self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'yeti', 'zebra'])
- fs4.fs_remove(session, 'yeti', 0)
- self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'zebra'])
- fs4.fs_rename(session, 'zebra', 'okapi', 0)
- self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'okapi'])
- local.ss_flush(session, None, None, '')
- self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'okapi'])
- self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat',
- 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'],
- ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger',
- 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle',
- 'pre2-okapi'])
-
- errmsg = '/rename of flushed file not allowed/'
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: fs4.fs_rename(session, 'okapi', 'zebra', 0), errmsg)
-
- # XXX
- # At the moment, removal of flushed files is not allowed - as flushed files are immutable.
- # We may need to explicitly evict flushed files from cache directory via the API, if so,
- # the API to do that might be on the local store object, not the file system.
- errmsg = '/remove of flushed file not allowed/'
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: fs4.fs_remove(session, 'okapi', 0), errmsg)
-
- # No change since last time.
- self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'okapi'])
- self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat',
- 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'],
- ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger',
- 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle',
- 'pre2-okapi'])
+ # A flush copies the file to the cloud; nothing is removed locally.
+ local.ss_flush(session, fs1, 'beagle.wt', 'beagle.wtobj')
+ self.check_home(['beagle', 'bird', 'bison', 'bat', 'cat', 'cougar', 'coyote', 'cub'])
+ self.check_dirlist(fs1, '', [])
+ self.check_dirlist(fs2, '', [])
+ self.check_caches([], [])
+ self.check_objects(['beagle'], [])
+
+ # Flushing a nonexistent file fails.
+ errmsg = '/No such file/'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: local.ss_flush(session, fs1, 'bad.wt', 'bad.wtobj'), errmsg)
+
+ # It's okay to flush again; nothing changes.
+ local.ss_flush(session, fs1, 'beagle.wt', 'beagle.wtobj')
+ self.check_home(['beagle', 'bird', 'bison', 'bat', 'cat', 'cougar', 'coyote', 'cub'])
+ self.check_dirlist(fs1, '', [])
+ self.check_dirlist(fs2, '', [])
+ self.check_caches([], [])
+ self.check_objects(['beagle'], [])
+
+ # When we flush_finish, the local file moves to the cache directory.
+ local.ss_flush_finish(session, fs1, 'beagle.wt', 'beagle.wtobj')
+ self.check_home(['bird', 'bison', 'bat', 'cat', 'cougar', 'coyote', 'cub'])
+ self.check_dirlist(fs1, '', ['beagle'])
+ self.check_dirlist(fs2, '', [])
+ self.check_caches(['beagle'], [])
+ self.check_objects(['beagle'], [])
+
+ # Do some more flushes in each file system.
+ local.ss_flush(session, fs1, 'bison.wt', 'bison.wtobj')
+ local.ss_flush(session, fs2, 'cat.wt', 'cat.wtobj')
+ local.ss_flush(session, fs1, 'bat.wt', 'bat.wtobj')
+ local.ss_flush_finish(session, fs2, 'cat.wt', 'cat.wtobj')
+ local.ss_flush(session, fs2, 'cub.wt', 'cub.wtobj')
+ local.ss_flush_finish(session, fs1, 'bat.wt', 'bat.wtobj')
+
+ self.check_home(['bird', 'bison', 'cougar', 'coyote', 'cub'])
+ self.check_dirlist(fs1, '', ['beagle', 'bat'])
+ self.check_dirlist(fs2, '', ['cat'])
+ self.check_caches(['beagle', 'bat'], ['cat'])
+ self.check_objects(['beagle', 'bat', 'bison'], ['cat', 'cub'])
+
+ # Test directory listing prefixes
+ self.check_dirlist(fs1, '', ['beagle', 'bat'])
+ self.check_dirlist(fs1, 'ba', ['bat'])
+ self.check_dirlist(fs1, 'be', ['beagle'])
+ self.check_dirlist(fs1, 'x', [])
if __name__ == '__main__':
wttest.run()
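As exercised above, flushing in the local store is a two-step protocol; a minimal sketch using names from this test (beagle.wt exists in the WT home directory and fs1 is the file system backed by ./objects1):

    # Step 1: copy the local file to the bucket as an object; the local file stays in place.
    local.ss_flush(session, fs1, 'beagle.wt', 'beagle.wtobj')
    # Step 2: move the local file into the cache directory; the object now shows up
    # in the file system's directory list as well as in the bucket.
    local.ss_flush_finish(session, fs1, 'beagle.wt', 'beagle.wtobj')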