author | Luke Chen <luke.chen@mongodb.com> | 2021-05-19 15:49:07 +1000 |
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-05-19 06:02:30 +0000 |
commit | 1d20af63ae5e95e0ed1219809c5af571de8e3ae3 (patch) | |
tree | 8b046a8403a959b712715fdfd296a38f0c4b1fb4 /src/third_party/wiredtiger | |
parent | cc1640581318df61a5fedc5c7ddd1a91c13e0712 (diff) | |
download | mongo-1d20af63ae5e95e0ed1219809c5af571de8e3ae3.tar.gz |
Import wiredtiger: bae0c1c914bc0fa92f3775c08650b65663094034 from branch mongodb-4.4
ref: aadac22242..bae0c1c914
for: 4.4.7
WT-6403 Restore format non-timestamp transactional testing
WT-6576 Fix the aborted on-disk prepared key
WT-7106 Increase how often delta encoding is used for history store records
WT-7204 Update cursor-backward walks key instantiation support
WT-7234 Prefix-compressed keys and memory amplification
WT-7296 Merge default configuration with supplied test configuration in test framework
WT-7325 Created a script to generate a new test in the WT test framework
WT-7381 Cache btree's ckptlist between checkpoints
WT-7382 Refactor of database validation in the test framework
WT-7407 test/format failure classifier
WT-7411 Stats and counter to track prepared updates
WT-7416 Imported table requires full copy between incremental backups
WT-7446 Fix incorrect duration_seconds value in test framework
WT-7486 Coverity explicit NULL dereferenced
WT-7487 Coverity explicit NULL dereferenced
WT-7497 Add flush component to object metadata
WT-7499 Change WT_STORAGE_SOURCE.flush API and add flush_finish (see the sketch after this list)
WT-7503 Change default compressor for WT HS to Zstandard
WT-7506 Allow single and double quotes inside auto-formatted comments
WT-7511 Add assert to ensure the history store page is pinned before search
WT-7519 Fix flags field overflow in WT_DATA_HANDLE
WT-7525 Add key order check right after history store insert
WT-7537 Change local tier object suffix to .wtobj
WT-7546 Coverity: Minor issues in CppSuite test harness
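
WT-7499 splits the storage-source flush path into two callbacks: ss_flush copies an object to the bucket (shared storage) and the new ss_flush_finish makes the flushed copy available locally, as the local_store.c and wiredtiger.i hunks below show. The following is a minimal sketch, assuming the public extension API of this snapshot, of how an extension might register both callbacks; the example_* names and stub bodies are illustrative only and are not the shipped local_store implementation.

/*
 * Hypothetical sketch of a storage source registering the split flush API
 * from WT-7499. The example_* names are illustrative; a real extension (see
 * local_store.c below) also implements ss_customize_file_system, terminate,
 * and the actual copy/rename work.
 */
#include <errno.h>
#include <stdlib.h>

#include <wiredtiger.h>
#include <wiredtiger_ext.h>

/* ss_flush: copy the named local object to the bucket (shared storage). */
static int
example_flush(WT_STORAGE_SOURCE *ss, WT_SESSION *session, WT_FILE_SYSTEM *fs,
  const char *source, const char *object, const char *config)
{
    (void)ss;
    (void)session;
    (void)fs;
    (void)source;
    (void)object;
    (void)config;
    return (0); /* Copy "source" to shared storage under the name "object" here. */
}

/* ss_flush_finish: make the flushed object available locally (e.g., move it to the cache). */
static int
example_flush_finish(WT_STORAGE_SOURCE *ss, WT_SESSION *session, WT_FILE_SYSTEM *fs,
  const char *source, const char *object, const char *config)
{
    (void)ss;
    (void)session;
    (void)fs;
    (void)source;
    (void)object;
    (void)config;
    return (0); /* Move "source" into the cache and mark it readonly here. */
}

/* Extension entry point: register the storage source with both callbacks set. */
int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
    WT_STORAGE_SOURCE *ss;

    (void)config;
    if ((ss = calloc(1, sizeof(*ss))) == NULL)
        return (errno);
    ss->ss_flush = example_flush;
    ss->ss_flush_finish = example_flush_finish;
    /* Other callbacks (ss_customize_file_system, terminate) omitted for brevity. */
    return (connection->add_storage_source(connection, "example", ss, NULL));
}

In the local_store changes below, the same split appears as local_flush copying the source file into the bucket directory and local_flush_finish renaming it into the cache directory and marking it readonly, replacing the old flush-queue bookkeeping.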
Diffstat (limited to 'src/third_party/wiredtiger')
116 files changed, 3316 insertions, 2549 deletions
diff --git a/src/third_party/wiredtiger/.clang-format b/src/third_party/wiredtiger/.clang-format index db3bac132a4..7f291933076 100644 --- a/src/third_party/wiredtiger/.clang-format +++ b/src/third_party/wiredtiger/.clang-format @@ -71,6 +71,7 @@ ForEachMacros: - WT_CELL_FOREACH_KV - WT_CELL_FOREACH_VRFY - WT_CKPT_FOREACH + - WT_CKPT_FOREACH_NAME_OR_ORDER - WT_COL_FOREACH - WT_EXT_FOREACH - WT_EXT_FOREACH_OFF diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c index a6c625e5ef0..70e01507719 100644 --- a/src/third_party/wiredtiger/bench/wtperf/config.c +++ b/src/third_party/wiredtiger/bench/wtperf/config.c @@ -462,8 +462,7 @@ config_opt(WTPERF *wtperf, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v) break; case STRING_TYPE: /* - * Thread configuration is the one case where the type isn't a - * "string", it's a "struct". + * Thread configuration is the one case where the type isn't a "string", it's a "struct". */ if (v->type == WT_CONFIG_ITEM_STRUCT && STRING_MATCH("threads", k->str, k->len)) return (config_threads(wtperf, v->str, v->len)); @@ -821,12 +820,10 @@ config_consolidate(CONFIG_OPTS *opts) char *string_key; /* - * This loop iterates over the config queue and for each entry checks if - * a later queue entry has the same key. If there's a match, and key is - * "conn_config" or "table_config", the later queue entry is replaced - * with a concatenated entry of the two queue entries, the current queue - * entry is removed. For any other key, if there is a match, the current - * queue entry is removed. + * This loop iterates over the config queue and for each entry checks if a later queue entry has + * the same key. If there's a match, and key is "conn_config" or "table_config", the later queue + * entry is replaced with a concatenated entry of the two queue entries, the current queue entry + * is removed. For any other key, if there is a match, the current queue entry is removed. */ conf_line = TAILQ_FIRST(&opts->config_head); while (conf_line != NULL) { diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 953474d404f..6d65d9d217a 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -49,12 +49,10 @@ common_runtime_config = [ enable enhanced checking. ''', type='category', subconfig= [ Config('commit_timestamp', 'none', r''' - This option is no longer supported. Retained for backward - compatibility. Use \c write_timestamp option instead.''', + This option is no longer supported, retained for backward compatibility.''', choices=['always', 'key_consistent', 'never', 'none']), Config('durable_timestamp', 'none', r''' - This option is no longer supported. Retained for backward - compatibility. Use \c write_timestamp option instead.''', + This option is no longer supported, retained for backward compatibility.''', choices=['always', 'key_consistent', 'never', 'none']), Config('write_timestamp', 'off', r''' verify that commit timestamps are used per the configured @@ -330,8 +328,7 @@ file_config = format_meta + file_runtime_config + tiered_config + [ the file format''', choices=['btree']), Config('huffman_key', 'none', r''' - This option is no longer supported. Retained for backward - compatibility. See @ref huffman for more information'''), + This option is no longer supported, retained for backward compatibility.'''), Config('huffman_value', 'none', r''' configure Huffman encoding for values. 
Permitted values are \c "none", \c "english", \c "utf8<file>" or \c "utf16<file>". @@ -355,8 +352,8 @@ file_config = format_meta + file_runtime_config + tiered_config + [ block compression is done''', min='512B', max='512MB'), Config('internal_item_max', '0', r''' - historic term for internal_key_max''', - min=0, undoc=True), + This option is no longer supported, retained for backward compatibility.''', + min=0), Config('internal_key_max', '0', r''' the largest key stored in an internal node, in bytes. If set, keys larger than the specified size are stored as overflow items (which @@ -365,10 +362,8 @@ file_config = format_meta + file_runtime_config + tiered_config + [ page''', min='0'), Config('key_gap', '10', r''' - the maximum gap between instantiated keys in a Btree leaf page, - constraining the number of keys processed to instantiate a - random Btree leaf page key''', - min='0', undoc=True), + This option is no longer supported, retained for backward compatibility.''', + min='0'), Config('leaf_key_max', '0', r''' the largest key stored in a leaf node, in bytes. If set, keys larger than the specified size are stored as overflow items (which @@ -392,8 +387,8 @@ file_config = format_meta + file_runtime_config + tiered_config + [ a newly split leaf page''', min='0'), Config('leaf_item_max', '0', r''' - historic term for leaf_key_max and leaf_value_max''', - min=0, undoc=True), + This option is no longer supported, retained for backward compatibility.''', + min=0), Config('memory_page_image_max', '0', r''' the maximum in-memory page image represented by a single storage block. Depending on compression efficiency, compression can create storage @@ -467,7 +462,10 @@ tiered_meta = common_meta + tiered_config + [ tier_meta = file_meta + tiered_tree_config # Objects need to have the readonly setting set and bucket_prefix. # The file_meta already contains those pieces. -object_meta = file_meta +object_meta = file_meta + [ + Config('flush', '0', r''' + indicates the time this object was flushed to shared storage or 0 if unflushed'''), +] table_only_config = [ Config('colgroups', '', r''' diff --git a/src/third_party/wiredtiger/dist/s_comment.py b/src/third_party/wiredtiger/dist/s_comment.py index 482fded4fff..92e4ef348da 100644 --- a/src/third_party/wiredtiger/dist/s_comment.py +++ b/src/third_party/wiredtiger/dist/s_comment.py @@ -104,11 +104,11 @@ for line in sys.stdin: (sline[2].islower() or sline[2] == '_') and sline.endswith('--')): function_desc = True # We're only reformatting block comments where each line begins with a space and an - # alphabetic character after the asterisk, or a parenthetical. The only exceptions + # normal comment character after the asterisk, or a parenthetical. The only exceptions # are function descriptions. block = block and \ len(sline) >= 3 and sline.startswith('*') and sline[1] == ' ' and \ - (sline[2].isalpha() or (len(sline) >= 5 and \ + (sline[2].isalpha() or sline[2] == '"' or sline[2] == "'" or (len(sline) >= 5 and \ (sline[2] == '(' and sline[3].isalpha() and sline[4] != ')'))) or function_desc # Trim asterisks at the beginning of each line in a multiline comment. 
if sline.startswith('*'): diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 3e0f6af6581..34e0a9a8aa2 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -32,6 +32,8 @@ WT_CLOCKDIFF_NS WT_CONN_CHECK_PANIC WT_DEADLOCK WT_DEBUG_BYTE +WT_DHANDLE_MAX_FLAG +WT_DHANDLE_ZZZ_ENDFLAG WT_ERR_ASSERT WT_ERR_ERROR_OK WT_EXT_FOREACH_OFF diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 9c6976641b0..db198204b10 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -1035,6 +1035,7 @@ mbss mem memalign membar +memcmp memcpy memdup memget @@ -1493,6 +1494,7 @@ wtstats xF xdeadbeef xff +xfff xxxx xxxxx xxxxxx diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void index 95cf8ec5215..ab46f05c593 100755 --- a/src/third_party/wiredtiger/dist/s_void +++ b/src/third_party/wiredtiger/dist/s_void @@ -194,6 +194,8 @@ for f in `find bench ext src test -name '*.c' -o -name '*_inline.h'`; do -e '/WT_ERR/d' \ -e '/WT_SYSCALL.*ret/d' \ -e '/WT_TRET/d' \ + -e '/__wt_buf_catfmt/d' \ + -e '/__wt_buf_fmt/d' \ -e 's/^\([^(]*\).*/\1/' \ -e 's/^ *//' > $t test -s $t && { diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 34e5b020a4a..89a5578b362 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -551,7 +551,10 @@ connection_stats = [ TxnStat('txn_prepare_active', 'prepared transactions currently active'), TxnStat('txn_prepare_commit', 'prepared transactions committed'), TxnStat('txn_prepare_rollback', 'prepared transactions rolled back'), - TxnStat('txn_prepared_updates_count', 'Number of prepared updates'), + TxnStat('txn_prepared_updates_committed', 'Number of prepared updates committed'), + TxnStat('txn_prepared_updates', 'Number of prepared updates'), + TxnStat('txn_prepared_updates_key_repeated', 'Number of prepared updates repeated on the same key'), + TxnStat('txn_prepared_updates_rolledback', 'Number of prepared updates rolled back'), TxnStat('txn_query_ts', 'query timestamp calls'), TxnStat('txn_rollback', 'transactions rolled back'), TxnStat('txn_rollback_to_stable_running', 'transaction rollback to stable currently running', 'no_clear,no_scale'), diff --git a/src/third_party/wiredtiger/dist/test_data.py b/src/third_party/wiredtiger/dist/test_data.py index d0c8d36e955..23667a35751 100644 --- a/src/third_party/wiredtiger/dist/test_data.py +++ b/src/third_party/wiredtiger/dist/test_data.py @@ -59,9 +59,9 @@ throttle_config = [ # Record config specifies the format of the keys and values used in the database # record_config = throttle_config + [ - Config('key_size', 0, r''' + Config('key_size', 5, r''' The size of the keys created''', min=0, max=10000), - Config('value_size', 0, r''' + Config('value_size', 5, r''' The size of the values created''', min=0, max=1000000000), ] @@ -79,27 +79,33 @@ populate_config = [ # A generic configuration used by various other configurations to define whether that component or # similar is enabled or not. 
# -enable_config = [ +enabled_config_true = [ + Config('enabled', 'true', r''' + Whether or not this is relevant to the workload''', + type='boolean'), +] + +enabled_config_false = [ Config('enabled', 'false', r''' Whether or not this is relevant to the workload''', type='boolean'), ] -stat_config = enable_config +stat_config = enabled_config_false limit_stat = stat_config + [ Config('limit', 0, r''' - The limit value a statistic is allowed to reach''') + The limit value a statistic is allowed to reach''', min=0) ] range_config = [ Config('min', 0, r''' - The minimum a value can be in a range'''), + The minimum a value can be in a range''', min=0), Config('max', 1, r''' The maximum a value can be in a range''') ] -component_config = enable_config + throttle_config +component_config = enabled_config_true + throttle_config transaction_config = [ Config('ops_per_transaction', '', r''' @@ -122,16 +128,16 @@ runtime_monitor = component_config + [ # Configuration that applies to the timestamp_manager component. # timestamp_manager = component_config + [ - Config('oldest_lag', 0, r''' + Config('oldest_lag', 1, r''' The duration between the stable and oldest timestamps''', min=0, max=1000000), - Config('stable_lag', 0, r''' + Config('stable_lag', 1, r''' The duration between the latest and stable timestamps''', min=0, max=1000000), ] # # Configuration that applies to the workload tracking component. # -workload_tracking = enable_config +workload_tracking = component_config # # Configuration that applies to the workload_generator component. @@ -173,7 +179,7 @@ test_config = [ The cache size that wiredtiger will be configured to run with''', min=0, max=100000000000), Config('duration_seconds', 0, r''' The duration that the test run will last''', min=0, max=1000000), - Config('enable_logging', 'true', r''' + Config('enable_logging', 'false', r''' Enables write ahead logs''', type='boolean'), ] diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index fd4b9ca43b9..36de04005c0 100644 --- a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -1012,9 +1012,9 @@ connection_ops(WT_CONNECTION *conn) * * Functions are specified by name (for example, "wiredtiger_open"). * - * Methods are specified using a concatenation of the handle name, a - * period and the method name (for example, session create would be - * "WT_SESSION.create" and cursor close would be WT_CURSOR.close"). + * Methods are specified using a concatenation of the handle name, a period and the method name + * (for example, session create would be "WT_SESSION.create" and cursor close would be + * "WT_CURSOR.close"). */ error_check( wiredtiger_config_validate(NULL, NULL, "WT_SESSION.create", "allocation_size=32KB")); diff --git a/src/third_party/wiredtiger/examples/c/ex_backup.c b/src/third_party/wiredtiger/examples/c/ex_backup.c index 9267bd1ccf6..23636f3494e 100644 --- a/src/third_party/wiredtiger/examples/c/ex_backup.c +++ b/src/third_party/wiredtiger/examples/c/ex_backup.c @@ -50,13 +50,12 @@ compare_backups(int i) char buf[1024], msg[32]; /* - * We run 'wt dump' on both the full backup directory and the - * incremental backup directory for this iteration. Since running - * 'wt' runs recovery and makes both directories "live", we need - * a new directory for each iteration. + * We run 'wt dump' on both the full backup directory and the incremental backup directory for + * this iteration. 
Since running 'wt' runs recovery and makes both directories "live", we need a + * new directory for each iteration. * - * If i == 0, we're comparing against the main, original directory - * with the final incremental directory. + * If i == 0, we're comparing against the main, original directory with the final incremental + * directory. */ if (i == 0) (void)snprintf( diff --git a/src/third_party/wiredtiger/examples/c/ex_backup_block.c b/src/third_party/wiredtiger/examples/c/ex_backup_block.c index c935baf9c75..1cbbe1fbf09 100644 --- a/src/third_party/wiredtiger/examples/c/ex_backup_block.c +++ b/src/third_party/wiredtiger/examples/c/ex_backup_block.c @@ -65,13 +65,12 @@ compare_backups(int i) char buf[1024], msg[32]; /* - * We run 'wt dump' on both the full backup directory and the - * incremental backup directory for this iteration. Since running - * 'wt' runs recovery and makes both directories "live", we need - * a new directory for each iteration. + * We run 'wt dump' on both the full backup directory and the incremental backup directory for + * this iteration. Since running 'wt' runs recovery and makes both directories "live", we need a + * new directory for each iteration. * - * If i == 0, we're comparing against the main, original directory - * with the final incremental directory. + * If i == 0, we're comparing against the main, original directory with the final incremental + * directory. */ if (i == 0) (void)snprintf(buf, sizeof(buf), "../../wt -R -h %s dump main > %s.%d", home, full_out, i); diff --git a/src/third_party/wiredtiger/examples/c/ex_data_source.c b/src/third_party/wiredtiger/examples/c/ex_data_source.c index ae16b837d29..9ee94e6cfae 100644 --- a/src/third_party/wiredtiger/examples/c/ex_data_source.c +++ b/src/third_party/wiredtiger/examples/c/ex_data_source.c @@ -351,8 +351,7 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, const char *uri, WT_CO int my_data_source_overwrite; /* - * Retrieve the value of the boolean type configuration string - * "overwrite". + * Retrieve the value of the boolean type configuration string "overwrite". */ error_check(wt_api->config_get(wt_api, session, config, "overwrite", &v)); my_data_source_overwrite = v.val != 0; @@ -367,8 +366,7 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, const char *uri, WT_CO int64_t my_data_source_page_size; /* - * Retrieve the value of the integer type configuration string - * "page_size". + * Retrieve the value of the integer type configuration string "page_size". */ error_check(wt_api->config_get(wt_api, session, config, "page_size", &v)); my_data_source_page_size = v.val; @@ -383,8 +381,7 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, const char *uri, WT_CO const char *my_data_source_key; /* - * Retrieve the value of the string type configuration string - * "key_format". + * Retrieve the value of the string type configuration string "key_format". */ error_check(wt_api->config_get(wt_api, session, config, "key_format", &v)); diff --git a/src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c b/src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c index 00f65988843..0f6a7cfe473 100644 --- a/src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c +++ b/src/third_party/wiredtiger/ext/storage_sources/local_store/local_store.c @@ -66,7 +66,6 @@ typedef struct { * Locks are used to protect the file handle queue and flush queue. 
*/ pthread_rwlock_t file_handle_lock; - pthread_rwlock_t flush_lock; /* * Configuration values are set at startup. @@ -87,7 +86,6 @@ typedef struct { /* Queue of file handles */ TAILQ_HEAD(local_file_handle_qh, local_file_handle) fileq; - TAILQ_HEAD(local_flush_qh, local_flush_item) flushq; } LOCAL_STORAGE; @@ -102,34 +100,13 @@ typedef struct { char *auth_token; /* Identifier for key management system */ char *bucket_dir; /* Directory that stands in for cloud storage bucket */ char *cache_dir; /* Directory for pre-flushed objects and cached objects */ - char *fs_prefix; /* File system prefix, allowing for a "directory" within a bucket */ } LOCAL_FILE_SYSTEM; -/* - * Indicates a object that has not yet been flushed. - */ -typedef struct local_flush_item { - char *src_path; /* File name to copy from, object name and cache name derived from this */ - - /* - * These fields would be used in performing a flush. - */ - char *auth_token; /* Identifier for key management system */ - char *bucket; /* Bucket name */ - char *cache_dir; /* Cache directory */ - char *fs_prefix; /* Prefix for file system */ - WT_FS_OPEN_FILE_TYPE file_type; /* File type */ - - TAILQ_ENTRY(local_flush_item) q; /* Queue of items */ -} LOCAL_FLUSH_ITEM; - typedef struct local_file_handle { WT_FILE_HANDLE iface; /* Must come first */ - LOCAL_STORAGE *local; /* Enclosing storage source */ - WT_FILE_HANDLE *fh; /* File handle */ - char *path; /* Path name of file */ - LOCAL_FLUSH_ITEM *flush; /* Flush information, set if newly created */ + LOCAL_STORAGE *local; /* Enclosing storage source */ + WT_FILE_HANDLE *fh; /* File handle */ TAILQ_ENTRY(local_file_handle) q; /* Queue of handles */ } LOCAL_FILE_HANDLE; @@ -137,24 +114,28 @@ typedef struct local_file_handle { /* * Forward function declarations for internal functions */ +static int local_bucket_path(WT_FILE_SYSTEM *, const char *, char **); +static int local_cache_path(WT_FILE_SYSTEM *, const char *, char **); static int local_configure(LOCAL_STORAGE *, WT_CONFIG_ARG *); static int local_configure_int(LOCAL_STORAGE *, WT_CONFIG_ARG *, const char *, uint32_t *); static int local_delay(LOCAL_STORAGE *); static int local_err(LOCAL_STORAGE *, WT_SESSION *, int, const char *, ...); -static void local_flush_free(LOCAL_FLUSH_ITEM *); +static int local_file_copy( + LOCAL_STORAGE *, WT_SESSION *, const char *, const char *, WT_FS_OPEN_FILE_TYPE); static int local_get_directory(const char *, ssize_t len, char **); -static int local_location_path(WT_FILE_SYSTEM *, const char *, char **); +static int local_path(WT_FILE_SYSTEM *, const char *, const char *, char **); static int local_writeable(LOCAL_STORAGE *, const char *name, bool *writeable); /* * Forward function declarations for storage source API implementation */ static int local_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *); -static int local_customize_file_system(WT_STORAGE_SOURCE *, WT_SESSION *, const char *, - const char *, const char *, const char *, WT_FILE_SYSTEM **); +static int local_customize_file_system( + WT_STORAGE_SOURCE *, WT_SESSION *, const char *, const char *, const char *, WT_FILE_SYSTEM **); static int local_flush( - WT_STORAGE_SOURCE *, WT_SESSION *, WT_FILE_SYSTEM *, const char *, const char *); -static int local_flush_one(LOCAL_STORAGE *, WT_SESSION *, LOCAL_FLUSH_ITEM *); + WT_STORAGE_SOURCE *, WT_SESSION *, WT_FILE_SYSTEM *, const char *, const char *, const char *); +static int local_flush_finish( + WT_STORAGE_SOURCE *, WT_SESSION *, WT_FILE_SYSTEM *, const char *, const char *, const 
char *); static int local_terminate(WT_STORAGE_SOURCE *, WT_SESSION *); /* @@ -296,23 +277,6 @@ local_err(LOCAL_STORAGE *local, WT_SESSION *session, int ret, const char *format } /* - * local_flush_free -- - * Free storage for a flush item. - */ -static void -local_flush_free(LOCAL_FLUSH_ITEM *flush) -{ - if (flush != NULL) { - free(flush->auth_token); - free(flush->bucket); - free(flush->cache_dir); - free(flush->fs_prefix); - free(flush->src_path); - free(flush); - } -} - -/* * local_get_directory -- * Return a copy of a directory name after verifying that it is a directory. */ @@ -363,19 +327,37 @@ local_writeable(LOCAL_STORAGE *local, const char *name, bool *writeablep) } /* - * local_location_path -- + * local_bucket_path -- + * Construct the bucket pathname from the file system and local name. + */ +static int +local_bucket_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp) +{ + return (local_path(file_system, ((LOCAL_FILE_SYSTEM *)file_system)->bucket_dir, name, pathp)); +} + +/* + * local_cache_path -- + * Construct the cache pathname from the file system and local name. + */ +static int +local_cache_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp) +{ + return (local_path(file_system, ((LOCAL_FILE_SYSTEM *)file_system)->cache_dir, name, pathp)); +} + +/* + * local_path -- * Construct a pathname from the file system and local name. */ -int -local_location_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp) +static int +local_path(WT_FILE_SYSTEM *file_system, const char *dir, const char *name, char **pathp) { - LOCAL_FILE_SYSTEM *local_fs; size_t len; int ret; char *p; ret = 0; - local_fs = (LOCAL_FILE_SYSTEM *)file_system; /* Skip over "./" and variations (".//", ".///./././//") at the beginning of the name. 
*/ while (*name == '.') { @@ -385,10 +367,10 @@ local_location_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp) while (*name == '/') name++; } - len = strlen(local_fs->cache_dir) + strlen(local_fs->fs_prefix) + strlen(name) + 2; + len = strlen(dir) + strlen(name) + 2; if ((p = malloc(len)) == NULL) - return (local_err(FS2LOCAL(file_system), NULL, ENOMEM, "local_location_path")); - snprintf(p, len, "%s/%s%s", local_fs->cache_dir, local_fs->fs_prefix, name); + return (local_err(FS2LOCAL(file_system), NULL, ENOMEM, "local_path")); + snprintf(p, len, "%s/%s", dir, name); *pathp = p; return (ret); } @@ -399,7 +381,7 @@ local_location_path(WT_FILE_SYSTEM *file_system, const char *name, char **pathp) */ static int local_customize_file_system(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, - const char *bucket_name, const char *prefix, const char *auth_token, const char *config, + const char *bucket_name, const char *auth_token, const char *config, WT_FILE_SYSTEM **file_systemp) { LOCAL_STORAGE *local; @@ -470,10 +452,6 @@ local_customize_file_system(WT_STORAGE_SOURCE *storage_source, WT_SESSION *sessi local_err(local, session, ret, "%*s: cache directory", (int)cachedir.len, cachedir.str); goto err; } - if ((fs->fs_prefix = strdup(prefix)) == NULL) { - ret = local_err(local, session, ENOMEM, "local_file_system.prefix"); - goto err; - } fs->file_system.fs_directory_list = local_directory_list; fs->file_system.fs_directory_list_single = local_directory_list_single; fs->file_system.fs_directory_list_free = local_directory_list_free; @@ -491,7 +469,6 @@ err: free(fs->auth_token); free(fs->bucket_dir); free(fs->cache_dir); - free(fs->fs_prefix); free(fs); } return (ret); @@ -521,7 +498,7 @@ local_exist(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, ret = local_err(local, session, errno, "%s: ss_exist stat", path); local->op_count++; - if ((ret = local_location_path(file_system, name, &path)) != 0) + if ((ret = local_cache_path(file_system, name, &path)) != 0) goto err; ret = stat(path, &sb); @@ -539,149 +516,46 @@ err: } /* - * local_flush -- - * Return when the files have been flushed. + * local_file_copy -- + * Copy a file. */ static int -local_flush(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, WT_FILE_SYSTEM *file_system, - const char *name, const char *config) -{ - LOCAL_STORAGE *local; - LOCAL_FLUSH_ITEM *flush, *safe_flush; - int ret, t_ret; - char *match; - - (void)config; /* Unused */ - - /* - * This implementation does not do anything meaningful on flush. However, we do track which - * objects have not yet been flushed and note which ones need to be flushed now. - */ - ret = 0; - local = (LOCAL_STORAGE *)storage_source; - match = NULL; - - if (file_system == NULL && name != NULL) - return local_err(local, session, EINVAL, "flush: cannot specify name without file system"); - - local->op_count++; - if (file_system != NULL) { - if ((ret = local_location_path(file_system, name == NULL ? "" : name, &match)) != 0) - goto err; - } - VERBOSE(local, "Flush: match=%s\n", SHOW_STRING(match)); - - /* - * Note: we retain the lock on the data structure while flushing all entries. This is fine for - * our local file implementation, when we don't have to do anything to flush, but for a cloud - * implementation, we'll want some way to not hold the lock while transferring data. 
- */ - if ((ret = pthread_rwlock_wrlock(&local->flush_lock)) != 0) { - (void)local_err(local, session, ret, "flush: pthread_rwlock_wrlock"); - goto err; - } - - TAILQ_FOREACH_SAFE(flush, &local->flushq, q, safe_flush) - { - if (match != NULL) { - /* - * We must match against the bucket and the name if given. - * Our match string is of the form: - * <bucket_name>/<fs_prefix><name> - * - * If name is given, we must match the entire path. - * If name is not given, we must match up to the beginning - * of the name. - */ - if (name != NULL) { - /* Exact name match required. */ - if (strcmp(flush->src_path, match) != 0) - continue; - } - /* No name specified, everything up to the name must match. */ - else if (strncmp(flush->src_path, match, strlen(match)) != 0) - continue; - } - if ((t_ret = local_flush_one(local, session, flush)) != 0 && ret == 0) - ret = t_ret; - TAILQ_REMOVE(&local->flushq, flush, q); - local_flush_free(flush); - } - - if ((t_ret = pthread_rwlock_unlock(&local->flush_lock)) != 0) { - (void)local_err(local, session, t_ret, "flush: pthread_rwlock_unlock"); - if (ret == 0) - ret = t_ret; - } - -err: - free(match); - - return (ret); -} - -/* - * local_flush_one -- - * Flush one item on the flush queue. - */ -static int -local_flush_one(LOCAL_STORAGE *local, WT_SESSION *session, LOCAL_FLUSH_ITEM *flush) +local_file_copy(LOCAL_STORAGE *local, WT_SESSION *session, const char *src_path, + const char *dest_path, WT_FS_OPEN_FILE_TYPE type) { WT_FILE_HANDLE *dest, *src; WT_FILE_SYSTEM *wt_fs; wt_off_t copy_size, file_size, left; + ssize_t pos; int ret, t_ret; - char *object_name; char buffer[1024 * 64]; - char dest_path[1024]; - ssize_t pos; - ret = 0; - src = dest = NULL; - - object_name = strrchr(flush->src_path, '/'); - if (object_name == NULL) { - ret = local_err(local, session, errno, "%s: unexpected src path", flush->src_path); - goto err; - } - object_name++; - - /* - * Here's where we flush the file to the cloud. This "local" implementation copies the file to - * the bucket directory. 
- */ - VERBOSE(local, "Flush object: from=%s, bucket=%s, object=%s, auth_token=%s, \n", - flush->src_path, flush->bucket, object_name, flush->auth_token); - - if ((ret = local_delay(local)) != 0) - goto err; + dest = src = NULL; if ((ret = local->wt_api->file_system_get(local->wt_api, session, &wt_fs)) != 0) { ret = local_err(local, session, ret, "local_file_system: cannot get WiredTiger file system"); goto err; } - snprintf(dest_path, sizeof(dest_path), "%s/%s", flush->bucket, object_name); - - if ((ret = wt_fs->fs_open_file( - wt_fs, session, flush->src_path, flush->file_type, WT_FS_OPEN_READONLY, &src)) != 0) { - ret = local_err(local, session, ret, "%s: cannot open for read", flush->src_path); + if ((ret = wt_fs->fs_open_file(wt_fs, session, src_path, type, WT_FS_OPEN_READONLY, &src)) != + 0) { + ret = local_err(local, session, ret, "%s: cannot open for read", src_path); goto err; } - if ((ret = wt_fs->fs_open_file( - wt_fs, session, dest_path, flush->file_type, WT_FS_OPEN_CREATE, &dest)) != 0) { + if ((ret = wt_fs->fs_open_file(wt_fs, session, dest_path, type, WT_FS_OPEN_CREATE, &dest)) != + 0) { ret = local_err(local, session, ret, "%s: cannot create", dest_path); goto err; } - if ((ret = wt_fs->fs_size(wt_fs, session, flush->src_path, &file_size)) != 0) { - ret = local_err(local, session, ret, "%s: cannot get size", flush->src_path); + if ((ret = wt_fs->fs_size(wt_fs, session, src_path, &file_size)) != 0) { + ret = local_err(local, session, ret, "%s: cannot get size", src_path); goto err; } for (pos = 0, left = file_size; left > 0; pos += copy_size, left -= copy_size) { copy_size = left < (wt_off_t)sizeof(buffer) ? left : (wt_off_t)sizeof(buffer); if ((ret = src->fh_read(src, session, pos, (size_t)copy_size, buffer)) != 0) { - ret = local_err(local, session, ret, "%s: cannot read", flush->src_path); + ret = local_err(local, session, ret, "%s: cannot read", src_path); goto err; } if ((ret = dest->fh_write(dest, session, pos, (size_t)copy_size, buffer)) != 0) { @@ -689,16 +563,7 @@ local_flush_one(LOCAL_STORAGE *local, WT_SESSION *session, LOCAL_FLUSH_ITEM *flu goto err; } } - if ((ret = dest->fh_sync(dest, session)) != 0) { - ret = local_err(local, session, ret, "%s: cannot sync", dest_path); - goto err; - } - local->object_flushes++; - err: - /* When we're done with flushing this file, set the file to readonly. */ - if (ret == 0 && (ret = chmod(flush->src_path, 0444)) < 0) - ret = local_err(local, session, errno, "%s: chmod flushed file failed", flush->src_path); if (src != NULL && (t_ret = src->close(src, session)) != 0) if (ret == 0) ret = t_ret; @@ -710,6 +575,79 @@ err: } /* + * local_flush -- + * Return when the file has been flushed. 
+ */ +static int +local_flush(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, WT_FILE_SYSTEM *file_system, + const char *source, const char *object, const char *config) +{ + LOCAL_STORAGE *local; + int ret; + char *dest_path; + + (void)config; /* unused */ + dest_path = NULL; + local = (LOCAL_STORAGE *)storage_source; + ret = 0; + + if (file_system == NULL || source == NULL || object == NULL) + return local_err(local, session, EINVAL, "ss_flush_finish: required arguments missing"); + + if ((ret = local_bucket_path(file_system, object, &dest_path)) != 0) + goto err; + + if ((ret = local_delay(local)) != 0) + goto err; + + if ((ret = local_file_copy(local, session, source, dest_path, WT_FS_OPEN_FILE_TYPE_DATA)) != 0) + goto err; + + local->object_flushes++; + +err: + free(dest_path); + return (ret); +} + +/* + * local_flush_finish -- + * Move a file from the default file system to the cache in the new file system. + */ +static int +local_flush_finish(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, + WT_FILE_SYSTEM *file_system, const char *source, const char *object, const char *config) +{ + LOCAL_STORAGE *local; + int ret; + char *dest_path; + + (void)config; /* unused */ + dest_path = NULL; + local = (LOCAL_STORAGE *)storage_source; + ret = 0; + + if (file_system == NULL || source == NULL || object == NULL) + return local_err(local, session, EINVAL, "ss_flush_finish: required arguments missing"); + + if ((ret = local_cache_path(file_system, object, &dest_path)) != 0) + goto err; + + local->op_count++; + if ((ret = rename(source, dest_path)) != 0) { + ret = local_err( + local, session, errno, "ss_flush_finish rename %s to %s failed", source, dest_path); + goto err; + } + /* Set the file to readonly in the cache. */ + if (ret == 0 && (ret = chmod(dest_path, 0444)) < 0) + ret = local_err(local, session, errno, "%s: ss_flush_finish chmod failed", dest_path); +err: + free(dest_path); + return (ret); +} + +/* * local_directory_list -- * Return a list of object names for the given location. */ @@ -791,9 +729,8 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session, struct dirent *dp; DIR *dirp; LOCAL_FILE_SYSTEM *local_fs; - LOCAL_FLUSH_ITEM *flush; LOCAL_STORAGE *local; - size_t dir_len, fs_prefix_len, prefix_len; + size_t dir_len, prefix_len; uint32_t allocated, count; int ret, t_ret; char **entries; @@ -803,7 +740,6 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session, local = local_fs->local_storage; entries = NULL; allocated = count = 0; - fs_prefix_len = strlen(local_fs->fs_prefix); dir_len = (directory == NULL ? 0 : strlen(directory)); prefix_len = (prefix == NULL ? 0 : strlen(prefix)); ret = 0; @@ -811,6 +747,9 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session, *dirlistp = NULL; *countp = 0; + /* + * We list items in the cache directory (these have 'finished' flushing). + */ if ((dirp = opendir(local_fs->cache_dir)) == NULL) { ret = errno; if (ret == 0) @@ -819,9 +758,6 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session, local_err(local, session, ret, "%s: ss_directory_list: opendir", local_fs->cache_dir)); } - /* - * We list items in the cache directory as well as items in the "to be flushed" list. - */ for (count = 0; (dp = readdir(dirp)) != NULL && (limit == 0 || count < limit);) { /* Skip . and .. 
*/ basename = dp->d_name; @@ -833,36 +769,6 @@ local_directory_list_internal(WT_FILE_SYSTEM *file_system, WT_SESSION *session, continue; basename += dir_len; - /* Skip files not associated with our file system prefix. */ - if (strncmp(basename, local_fs->fs_prefix, fs_prefix_len) != 0) - continue; - - basename += fs_prefix_len; - /* The list of files is optionally filtered by a prefix. */ - if (prefix != NULL && strncmp(basename, prefix, prefix_len) != 0) - continue; - - if ((ret = local_directory_list_add(local, &entries, basename, count, &allocated)) != 0) - goto err; - count++; - } - - TAILQ_FOREACH (flush, &local->flushq, q) { - if (limit != 0 && count >= limit) - break; - - /* Skip files not associated with this file system. */ - if (strcmp(local_fs->bucket_dir, flush->bucket) != 0 || - strcmp(local_fs->cache_dir, flush->cache_dir) != 0 || - strcmp(local_fs->fs_prefix, flush->fs_prefix) != 0) - continue; - - basename = strrchr(flush->src_path, '/'); - if (basename == NULL) - basename = flush->src_path; - else - basename++; - /* The list of files is optionally filtered by a prefix. */ if (prefix != NULL && strncmp(basename, prefix, prefix_len) != 0) continue; @@ -909,7 +815,6 @@ local_fs_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *session) free(local_fs->auth_token); free(local_fs->bucket_dir); free(local_fs->cache_dir); - free(local_fs->fs_prefix); free(file_system); return (0); @@ -925,12 +830,13 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, { LOCAL_FILE_HANDLE *local_fh; LOCAL_FILE_SYSTEM *local_fs; - LOCAL_FLUSH_ITEM *flush; LOCAL_STORAGE *local; WT_FILE_HANDLE *file_handle, *wt_fh; WT_FILE_SYSTEM *wt_fs; struct stat sb; int ret; + char *alloced_path; + const char *path; bool create, exists; (void)flags; /* Unused */ @@ -941,6 +847,7 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, local_fs = (LOCAL_FILE_SYSTEM *)file_system; local = local_fs->local_storage; wt_fs = local_fs->wt_fs; + alloced_path = NULL; /* * We expect that the local file system will be used narrowly, like when creating or opening a @@ -972,18 +879,16 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, exists = (ret == 0); } else exists = false; - if (create || exists) { + if (create || exists) /* The file has not been flushed, use the file directly in the file system. 
*/ - if ((local_fh->path = strdup(name)) == NULL) { - ret = local_err(local, session, ENOMEM, "local_open"); - goto err; - } - } else { - if ((ret = local_location_path(file_system, name, &local_fh->path)) != 0) + path = name; + else { + if ((ret = local_cache_path(file_system, name, &alloced_path)) != 0) goto err; - ret = stat(local_fh->path, &sb); + path = alloced_path; + ret = stat(path, &sb); if (ret != 0 && errno != ENOENT) { - ret = local_err(local, session, errno, "%s: local_open stat", local_fh->path); + ret = local_err(local, session, errno, "%s: local_open stat", path); goto err; } exists = (ret == 0); @@ -997,35 +902,8 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, } #endif - if (create && !exists) { - if ((flush = calloc(1, sizeof(LOCAL_FLUSH_ITEM))) == NULL) { - ret = ENOMEM; - goto err; - } - local_fh->flush = flush; - - if ((flush->auth_token = strdup(local_fs->auth_token)) == NULL) { - ret = local_err(local, session, ENOMEM, "open.auth_token"); - goto err; - } - if ((flush->bucket = strdup(local_fs->bucket_dir)) == NULL) { - ret = local_err(local, session, ENOMEM, "open.bucket"); - goto err; - } - if ((flush->cache_dir = strdup(local_fs->cache_dir)) == NULL) { - ret = local_err(local, session, ENOMEM, "open.cache_dir"); - goto err; - } - if ((flush->fs_prefix = strdup(local_fs->fs_prefix)) == NULL) { - ret = local_err(local, session, ENOMEM, "open.fs_prefix"); - goto err; - } - flush->file_type = file_type; - } - - if ((ret = wt_fs->fs_open_file(wt_fs, session, local_fh->path, file_type, flags, &wt_fh)) != - 0) { - ret = local_err(local, session, ret, "ss_open_object: open: %s", local_fh->path); + if ((ret = wt_fs->fs_open_file(wt_fs, session, path, file_type, flags, &wt_fh)) != 0) { + ret = local_err(local, session, ret, "ss_open_object: open: %s", path); goto err; } local_fh->fh = wt_fh; @@ -1071,9 +949,10 @@ local_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, *file_handlep = file_handle; VERBOSE( - local, "File opened: %s final path=%s\n", SHOW_STRING(name), SHOW_STRING(local_fh->path)); + local, "File opened: %s final path=%s\n", SHOW_STRING(name), SHOW_STRING(local_fh->fh->name)); err: + free(alloced_path); if (ret != 0) { if (local_fh != NULL) local_file_close_internal(local, session, local_fh); @@ -1093,11 +972,9 @@ local_rename(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *from, uint32_t flags) { LOCAL_FILE_SYSTEM *local_fs; - LOCAL_FLUSH_ITEM *flush; LOCAL_STORAGE *local; WT_FILE_SYSTEM *wt_fs; - int ret, t_ret; - char *copy; + int ret; bool writeable; local = FS2LOCAL(file_system); @@ -1117,33 +994,6 @@ local_rename(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *from, goto err; } - /* - * Find any flush entry that matches, and rename that too. 
- */ - if ((ret = pthread_rwlock_wrlock(&local->flush_lock)) != 0) { - ret = local_err(local, session, ret, "ss_remove: pthread_rwlock_wrlock"); - goto err; - } - - TAILQ_FOREACH (flush, &local->flushq, q) { - if (strcmp(flush->src_path, from) == 0) { - if ((copy = strdup(to)) == NULL) - ret = ENOMEM; - else { - free(flush->src_path); - flush->src_path = copy; - } - break; - } - } - - if ((t_ret = pthread_rwlock_unlock(&local->flush_lock)) != 0) { - (void)local_err(local, session, t_ret, "ss_remove: pthread_rwlock_unlock"); - if (ret == 0) - ret = t_ret; - goto err; - } - err: return (ret); } @@ -1157,7 +1007,6 @@ err: static int local_remove(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, uint32_t flags) { - LOCAL_FLUSH_ITEM *flush; LOCAL_STORAGE *local; int ret; bool writeable; @@ -1180,27 +1029,6 @@ local_remove(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, goto err; } - /* - * Find any flush entry that matches, and remove that too. - */ - if ((ret = pthread_rwlock_wrlock(&local->flush_lock)) != 0) { - ret = local_err(local, session, ret, "ss_remove: pthread_rwlock_wrlock"); - goto err; - } - - TAILQ_FOREACH (flush, &local->flushq, q) { - if (strcmp(flush->src_path, name) == 0) { - TAILQ_REMOVE(&local->flushq, flush, q); - local_flush_free(flush); - break; - } - } - - if ((ret = pthread_rwlock_unlock(&local->flush_lock)) != 0) { - ret = local_err(local, session, ret, "ss_remove: pthread_rwlock_unlock"); - goto err; - } - err: return (ret); } @@ -1226,7 +1054,7 @@ local_size(WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name, w ret = stat(name, &sb); if (ret == ENOENT) { /* Otherwise, we'll see if it's in the cache directory. */ - if ((ret = local_location_path(file_system, name, &path)) != 0) + if ((ret = local_cache_path(file_system, name, &path)) != 0) goto err; ret = stat(path, &sb); @@ -1282,7 +1110,6 @@ local_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) { LOCAL_STORAGE *local; LOCAL_FILE_HANDLE *local_fh; - LOCAL_FLUSH_ITEM *flush; int ret, t_ret; ret = 0; @@ -1299,28 +1126,6 @@ local_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) if ((ret = pthread_rwlock_unlock(&local->file_handle_lock)) != 0) (void)local_err(local, session, ret, "file handle close: pthread_rwlock_unlock"); - /* - * If we need to track flushes for this file, save the flush item on our queue. - */ - if (ret == 0 && ((flush = local_fh->flush)) != NULL) { - if ((ret = pthread_rwlock_wrlock(&local->flush_lock)) != 0) - (void)local_err(local, session, ret, "file handle close: pthread_rwlock_wrlock2"); - - if (ret == 0) { - /* - * Move the flush object from the file handle and to the flush queue. It is now owned by - * the flush queue and will be freed when that item is flushed. 
- */ - TAILQ_INSERT_HEAD(&local->flushq, flush, q); - local_fh->flush = NULL; - - if ((ret = pthread_rwlock_unlock(&local->flush_lock)) != 0) - (void)local_err(local, session, ret, "file handle close: pthread_rwlock_unlock2"); - if (ret == 0 && ((flush->src_path = strdup(local_fh->path)) == NULL)) - ret = ENOMEM; - } - } - if ((t_ret = local_file_close_internal(local, session, local_fh)) != 0) { if (ret == 0) ret = t_ret; @@ -1344,8 +1149,6 @@ local_file_close_internal(LOCAL_STORAGE *local, WT_SESSION *session, LOCAL_FILE_ if (wt_fh != NULL && (ret = wt_fh->close(wt_fh, session)) != 0) ret = local_err(local, session, ret, "WT_FILE_HANDLE->close: close"); - local_flush_free(local_fh->flush); - free(local_fh->path); free(local_fh->iface.name); free(local_fh); @@ -1451,8 +1254,7 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) if ((local = calloc(1, sizeof(LOCAL_STORAGE))) == NULL) return (errno); local->wt_api = connection->get_extension_api(connection); - if ((ret = pthread_rwlock_init(&local->file_handle_lock, NULL)) != 0 || - (ret = pthread_rwlock_init(&local->flush_lock, NULL)) != 0) { + if ((ret = pthread_rwlock_init(&local->file_handle_lock, NULL)) != 0) { (void)local_err(local, NULL, ret, "pthread_rwlock_init"); free(local); return (ret); @@ -1464,6 +1266,7 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) */ local->storage_source.ss_customize_file_system = local_customize_file_system; local->storage_source.ss_flush = local_flush; + local->storage_source.ss_flush_finish = local_flush_finish; local->storage_source.terminate = local_terminate; if ((ret = local_configure(local, config)) != 0) { diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 392d1bb1861..2c1e2219e30 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "aadac222429faa9b20d9344e3648a19be97811b9" + "commit": "bae0c1c914bc0fa92f3775c08650b65663094034" } diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger.i b/src/third_party/wiredtiger/lang/python/wiredtiger.i index 3dbf69d35c2..0105a5a70d6 100644 --- a/src/third_party/wiredtiger/lang/python/wiredtiger.i +++ b/src/third_party/wiredtiger/lang/python/wiredtiger.i @@ -986,14 +986,19 @@ typedef int int_void; %enddef SIDESTEP_METHOD(__wt_storage_source, ss_customize_file_system, - (WT_SESSION *session, const char *bucket_name, const char *prefix, + (WT_SESSION *session, const char *bucket_name, const char *auth_token, const char *config, WT_FILE_SYSTEM **file_systemp), - (self, session, bucket_name, prefix, auth_token, config, file_systemp)) + (self, session, bucket_name, auth_token, config, file_systemp)) SIDESTEP_METHOD(__wt_storage_source, ss_flush, (WT_SESSION *session, WT_FILE_SYSTEM *file_system, - const char *name, const char *config), - (self, session, file_system, name, config)) + const char *source, const char *object, const char *config), + (self, session, file_system, source, object, config)) + +SIDESTEP_METHOD(__wt_storage_source, ss_flush_finish, + (WT_SESSION *session, WT_FILE_SYSTEM *file_system, + const char *source, const char *object, const char *config), + (self, session, file_system, source, object, config)) SIDESTEP_METHOD(__wt_storage_source, terminate, (WT_SESSION *session), diff --git a/src/third_party/wiredtiger/src/block/block_addr.c 
b/src/third_party/wiredtiger/src/block/block_addr.c index cc8a46a196a..15295bc02d3 100644 --- a/src/third_party/wiredtiger/src/block/block_addr.c +++ b/src/third_party/wiredtiger/src/block/block_addr.c @@ -28,16 +28,13 @@ __block_buffer_to_addr(WT_BLOCK *block, const uint8_t **pp, uint32_t *logidp, wt WT_RET(__wt_vunpack_uint(pp, 0, &c)); /* - * To avoid storing large offsets, we minimize the value by subtracting - * a block for description information, then storing a count of block - * allocation units. That implies there is no such thing as an - * "invalid" offset though, they could all be valid (other than very - * large numbers), which is what we didn't want to store in the first - * place. Use the size: writing a block of size 0 makes no sense, so - * that's the out-of-band value. Once we're out of this function and - * are working with a real file offset, size and checksum triplet, there - * can be invalid offsets, that's simpler than testing sizes of 0 all - * over the place. + * To avoid storing large offsets, we minimize the value by subtracting a block for description + * information, then storing a count of block allocation units. That implies there is no such + * thing as an "invalid" offset though, they could all be valid (other than very large numbers), + * which is what we didn't want to store in the first place. Use the size: writing a block of + * size 0 makes no sense, so that's the out-of-band value. Once we're out of this function and + * are working with a real file offset, size and checksum triplet, there can be invalid offsets, + * that's simpler than testing sizes of 0 all over the place. */ if (s == 0) { *offsetp = 0; diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c index 0deaef04654..310330da831 100644 --- a/src/third_party/wiredtiger/src/block/block_ext.c +++ b/src/third_party/wiredtiger/src/block/block_ext.c @@ -280,18 +280,16 @@ __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *list return (0); /* - * Verify a block the btree engine thinks it "owns" doesn't appear on - * the available or discard lists (it might reasonably be on the alloc - * list, if it was allocated since the last checkpoint). The engine - * "owns" a block if it's trying to read or free the block, and those + * Verify a block the btree engine thinks it "owns" doesn't appear on the available or discard + * lists (it might reasonably be on the alloc list, if it was allocated since the last + * checkpoint). The engine "owns" a block if it's trying to read or free the block, and those * functions make this check. * * Any block being read or freed should not be "available". * - * Any block being read or freed in the live system should not be on the - * discard list. (A checkpoint handle might be reading a block which is - * on the live system's discard list; any attempt to free a block from a - * checkpoint handle has already failed.) + * Any block being read or freed in the live system should not be on the discard list. (A + * checkpoint handle might be reading a block which is on the live system's discard list; any + * attempt to free a block from a checkpoint handle has already failed.) 
*/ __wt_spin_lock(session, &block->live_lock); if (__block_off_match(&block->live.avail, offset, size)) diff --git a/src/third_party/wiredtiger/src/block/block_tiered.c b/src/third_party/wiredtiger/src/block/block_tiered.c index 776b2a127ad..d922a663e03 100644 --- a/src/third_party/wiredtiger/src/block/block_tiered.c +++ b/src/third_party/wiredtiger/src/block/block_tiered.c @@ -52,7 +52,6 @@ __wt_block_tiered_newfile(WT_SESSION_IMPL *session, WT_BLOCK *block) { WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_STORAGE_SOURCE *storage_source; const char *filename; /* Get the old file name again. */ @@ -62,21 +61,8 @@ __wt_block_tiered_newfile(WT_SESSION_IMPL *session, WT_BLOCK *block) * TODO: tiered: We will get rid of the log id, and this name generation will be replaced by the * name generated by __tiered_switch. */ - WT_ERR(__wt_buf_fmt(session, tmp, "%s.%08" PRIu32, block->name, block->logid)); - filename = tmp->data; WT_ERR(__wt_close(session, &block->fh)); - /* - * TODO: tiered: Assert that session->bucket_storage is not NULL. We can't do that while we have - * tests that use block_allocation=log without setting up bucket storage. This whole function is - * going to look very different when flush_tier is fully integrated. - */ - if (session->bucket_storage != NULL && block->logid != 0) { - storage_source = session->bucket_storage->storage_source; - WT_ASSERT(session, storage_source != NULL); - WT_ERR(storage_source->ss_flush( - storage_source, &session->iface, session->bucket_storage->file_system, filename, NULL)); - } /* Bump to a new file ID. */ ++block->logid; WT_ERR(__wt_buf_fmt(session, tmp, "%s.%08" PRIu32, block->name, block->logid)); diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index a2b7f161d3e..5f3e03bde3e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -314,7 +314,6 @@ __cursor_row_next( WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; - bool kpack_used; session = CUR2S(cbt); page = cbt->ref->page; @@ -402,7 +401,7 @@ restart_read_insert: cbt->slot = cbt->row_iteration_slot / 2 - 1; restart_read_page: rip = &page->pg_row[cbt->slot]; - WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used)); + WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack)); /* * If the cursor has prefix search configured we can early exit here if the key that we are * visiting is after our prefix. @@ -679,6 +678,8 @@ __wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating) page = cbt->ref == NULL ? NULL : cbt->ref->page; if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { + /* The page cannot be NULL if the above flag is set. */ + WT_ASSERT(session, page != NULL); switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_next(cbt, newpage, restart); diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 867a46201a4..abf31424525 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -454,7 +454,6 @@ __cursor_row_prev( WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; - bool kpack_used; session = CUR2S(cbt); page = cbt->ref->page; @@ -480,12 +479,8 @@ __cursor_row_prev( * Initialize for each new page. */ if (newpage) { - /* - * If we haven't instantiated keys on this page, do so, else it is a very, very slow - * traversal. 
- */ - if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) - WT_RET(__wt_row_leaf_keys(session, page)); + /* Check if keys need to be instantiated before we walk the page. */ + WT_RET(__wt_row_leaf_key_instantiate(session, page)); /* * Be paranoid and set the slot out of bounds when moving to a new page. @@ -554,7 +549,7 @@ restart_read_insert: cbt->slot = cbt->row_iteration_slot / 2 - 1; restart_read_page: rip = &page->pg_row[cbt->slot]; - WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used)); + WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack)); /* * If the cursor has prefix search configured we can early exit here if the key we are * visiting is before our prefix. @@ -638,6 +633,8 @@ __wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating) F_SET(cbt, WT_CBT_ITERATE_APPEND); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { + /* The page cannot be NULL if the above flag is set. */ + WT_ASSERT(session, page != NULL); switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage, restart); diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index c9acfff3628..c68855988a2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -1345,7 +1345,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page, WT_CURSOR *hs_cursor) WT_RET(__wt_row_leaf_key(session, page, rip, ds->key, false)); WT_RET(__debug_item_key(ds, "K", ds->key->data, ds->key->size)); - __wt_row_leaf_value_cell(session, page, rip, NULL, unpack); + __wt_row_leaf_value_cell(session, page, rip, unpack); WT_RET(__debug_cell_kv(ds, page, WT_PAGE_ROW_LEAF, "V", unpack)); if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 03f789868db..02bd970e0c6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -360,22 +360,12 @@ __free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) static void __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_IKEY *ikey; WT_ROW *rip; uint32_t i; - void *copy; - /* - * Free the in-memory index array. - * - * For each entry, see if the key was an allocation (that is, if it points somewhere other than - * the original page), and if so, free the memory. - */ - WT_ROW_FOREACH (page, rip, i) { - copy = WT_ROW_KEY_COPY(rip); - WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, &ikey, NULL, NULL, NULL)); - __wt_free(session, ikey); - } + /* Free any allocated memory used by instantiated keys. */ + WT_ROW_FOREACH (page, rip, i) + __wt_row_leaf_key_free(session, page, rip); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 0d36f155f7a..5e6444dc202 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -222,6 +222,9 @@ __wt_btree_close(WT_SESSION_IMPL *session) !F_ISSET(S2C(session), WT_CONN_HS_OPEN) || !btree->hs_entries || (!WT_IS_METADATA(btree->dhandle) && !WT_IS_HS(btree->dhandle))); + /* Clear the saved checkpoint information. */ + __wt_meta_saved_ckptlist_free(session); + /* * If we turned eviction off and never turned it back on, do that now, otherwise the counter * will be off. 
@@ -344,7 +347,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_RET(__wt_struct_confchk(session, &cval)); WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format)); - /* Row-store key comparison and key gap for prefix compression. */ + /* Row-store key comparison. */ if (btree->type == BTREE_ROW) { WT_RET(__wt_config_gets_none(session, cfg, "collator", &cval)); if (cval.len != 0) { @@ -352,9 +355,6 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_RET(__wt_collator_config(session, btree->dhandle->name, &cval, &metadata, &btree->collator, &btree->collator_owned)); } - - WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval)); - btree->key_gap = (uint32_t)cval.val; } /* Column-store: check for fixed-size data. */ @@ -389,9 +389,8 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) F_CLR(btree, WT_BTREE_IGNORE_CACHE); /* - * The metadata isn't blocked by in-memory cache limits because metadata - * "unroll" is performed by updates that are potentially blocked by the - * cache-full checks. + * The metadata isn't blocked by in-memory cache limits because metadata "unroll" is performed + * by updates that are potentially blocked by the cache-full checks. */ if (WT_IS_METADATA(btree->dhandle)) F_SET(btree, WT_BTREE_IGNORE_CACHE); diff --git a/src/third_party/wiredtiger/src/btree/bt_import.c b/src/third_party/wiredtiger/src/btree/bt_import.c index 4e207d7aa8f..6a650cf0647 100644 --- a/src/third_party/wiredtiger/src/btree/bt_import.c +++ b/src/third_party/wiredtiger/src/btree/bt_import.c @@ -20,6 +20,7 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp) WT_CONFIG_ITEM v; WT_DECL_ITEM(a); WT_DECL_ITEM(b); + WT_DECL_ITEM(buf); WT_DECL_ITEM(checkpoint); WT_DECL_RET; WT_KEYED_ENCRYPTOR *kencryptor; @@ -33,6 +34,7 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp) WT_ERR(__wt_scr_alloc(session, 0, &a)); WT_ERR(__wt_scr_alloc(session, 0, &b)); + WT_ERR(__wt_scr_alloc(session, 1024, &buf)); WT_ERR(__wt_scr_alloc(session, 0, &checkpoint)); WT_ASSERT(session, WT_PREFIX_MATCH(uri, "file:")); @@ -92,13 +94,14 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp) * Build and flatten the metadata and the checkpoint list, then insert it into the metadata for * this file. * - * Strip out any incremental backup information, an imported file has not been part of a backup. - * Strip out the checkpoint LSN, an imported file isn't associated with any log files. Assign a - * unique file ID. + * Reconstruct the incremental backup information, to indicate copying the whole file as an + * imported file has not been part of backup. Strip out the checkpoint LSN, an imported file + * isn't associated with any log files. Assign a unique file ID. */ cfg[1] = a->data; cfg[2] = checkpoint_list; - cfg[3] = "checkpoint_backup_info="; + WT_ERR(__wt_reset_blkmod(session, a->data, buf)); + cfg[3] = buf->mem; cfg[4] = "checkpoint_lsn="; WT_WITH_SCHEMA_LOCK(session, ret = __wt_snprintf(fileid, sizeof(fileid), "id=%" PRIu32, ++S2C(session)->next_file_id)); @@ -129,7 +132,7 @@ __wt_import_repair(WT_SESSION_IMPL *session, const char *uri, char **configp) * Update the last checkpoint with the corrected information. Update the file's metadata with * the new checkpoint information. 
*/ - WT_ERR(__wt_meta_ckptlist_get_from_config(session, false, &ckptbase, config_tmp)); + WT_ERR(__wt_meta_ckptlist_get_from_config(session, false, &ckptbase, NULL, config_tmp)); WT_CKPT_FOREACH (ckptbase, ckpt) if (ckpt->name == NULL || (ckpt + 1)->name == NULL) break; @@ -154,6 +157,7 @@ err: __wt_scr_free(session, &a); __wt_scr_free(session, &b); + __wt_scr_free(session, &buf); __wt_scr_free(session, &checkpoint); return (ret); diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 5a6f36b160c..22766b682bf 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -539,11 +539,21 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) WT_ROW *rip; WT_UPDATE *tombstone, *upd; size_t size, total_size; + uint32_t best_prefix_count, best_prefix_start, best_prefix_stop; + uint32_t last_slot, prefix_count, prefix_start, prefix_stop, slot; + uint8_t smallest_prefix; btree = S2BT(session); tombstone = upd = NULL; + last_slot = 0; size = total_size = 0; + /* The code depends on the prefix count variables, other initialization shouldn't matter. */ + best_prefix_count = prefix_count = 0; + smallest_prefix = 0; /* [-Wconditional-uninitialized] */ + prefix_start = prefix_stop = 0; /* [-Wconditional-uninitialized] */ + best_prefix_start = best_prefix_stop = 0; /* [-Wconditional-uninitialized] */ + /* * Optionally instantiate prepared updates. In-memory databases restore non-obsolete updates on * the page as part of the __split_multi_inmem function. @@ -557,19 +567,74 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) rip = page->pg_row; WT_CELL_FOREACH_KV (session, page->dsk, unpack) { switch (unpack.type) { - case WT_CELL_KEY_OVFL: - __wt_row_leaf_key_set_cell(page, rip, unpack.cell); - ++rip; - continue; case WT_CELL_KEY: /* - * Simple keys without prefix compression can be directly referenced on the page to + * Simple keys and prefix-compressed keys can be directly referenced on the page to * avoid repeatedly unpacking their cells. + * + * Review groups of prefix-compressed keys, and track the biggest group as the page's + * prefix. What we're finding is the biggest group of prefix-compressed keys we can + * immediately build using a previous key plus their suffix bytes, without rolling + * forward through intermediate keys. We save that information on the page and then + * never physically instantiate those keys, avoiding memory amplification for pages with + * a page-wide prefix. On the first of a group of prefix-compressed keys, track the slot + * of the fully-instantiated key from which it's derived and the current key's prefix + * length. On subsequent keys, if the key can be built from the original key plus the + * current key's suffix bytes, update the maximum slot to which the prefix applies and + * the smallest prefix length. + * + * Groups of prefix-compressed keys end when a key is not prefix-compressed (ignoring + * overflow keys), or the key's prefix length increases. A prefix length decreasing is + * OK, it only means fewer bytes taken from the original key. A prefix length increasing + * doesn't necessarily end a group of prefix-compressed keys as we might be able to + * build a subsequent key using the original key and the key's suffix bytes, that is the + * prefix length could increase and then decrease to the same prefix length as before + * and those latter keys could be built without rolling forward through intermediate + * keys. 
+ * + * However, that gets tricky: once a key prefix grows, we can never include a prefix + * smaller than the smallest prefix found so far, in the group, as a subsequent key + * prefix larger than the smallest prefix found so far might include bytes not present + * in the original instantiated key. Growing and shrinking is complicated to track, so + * rather than code up that complexity, we close out a group whenever the prefix grows. + * Plus, growing has additional issues. Any key with a larger prefix cannot be + * instantiated without rolling forward through intermediate keys, and so while such a + * key isn't required to close out the prefix group in all cases, it's not a useful + * entry for finding the best group of prefix-compressed keys, either, it's only + * possible keys after the prefix shrinks again that are potentially worth including in + * a group. + */ + slot = WT_ROW_SLOT(page, rip); + if (unpack.prefix == 0) { + /* If the last prefix group was the best, track it. */ + if (prefix_count > best_prefix_count) { + best_prefix_start = prefix_start; + best_prefix_stop = prefix_stop; + best_prefix_count = prefix_count; + } + prefix_count = 0; + prefix_start = slot; + } else { + /* Check for starting or continuing a prefix group. */ + if (prefix_count == 0 || + (last_slot == slot - 1 && unpack.prefix <= smallest_prefix)) { + smallest_prefix = unpack.prefix; + last_slot = prefix_stop = slot; + ++prefix_count; + } + } + __wt_row_leaf_key_set(page, rip, &unpack); + ++rip; + continue; + case WT_CELL_KEY_OVFL: + /* + * Prefix compression skips overflow items, ignore this slot. The last slot value is + * only used inside a group of prefix-compressed keys, so blindly increment it, it's not + * used unless the count of prefix-compressed keys is non-zero. */ - if (unpack.prefix == 0) - __wt_row_leaf_key_set(page, rip, &unpack); - else - __wt_row_leaf_key_set_cell(page, rip, unpack.cell); + ++last_slot; + + __wt_row_leaf_key_set(page, rip, &unpack); ++rip; continue; case WT_CELL_VALUE: @@ -584,7 +649,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) (WT_TIME_WINDOW_IS_EMPTY(&unpack.tw) || (!WT_TIME_WINDOW_HAS_STOP(&unpack.tw) && __wt_txn_tw_start_visible_all(session, &unpack.tw)))) - __wt_row_leaf_value_set(page, rip - 1, &unpack); + __wt_row_leaf_value_set(rip - 1, &unpack); break; case WT_CELL_VALUE_OVFL: break; @@ -610,6 +675,9 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) prepare = PREPARE_INITIALIZED; } + /* Make sure that there is no in-memory update for this key. */ + WT_ASSERT(session, page->modify->mod_row_update[WT_ROW_SLOT(page, rip - 1)] == NULL); + /* Take the value from the page cell. */ WT_ERR(__wt_page_cell_data_ref(session, page, &unpack, value)); @@ -618,6 +686,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) upd->durable_ts = unpack.tw.durable_start_ts; upd->start_ts = unpack.tw.start_ts; upd->txnid = unpack.tw.start_txn; + F_SET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS); /* * Instantiate both update and tombstone if the prepared update is a tombstone. 
This is @@ -632,7 +701,6 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) tombstone->txnid = unpack.tw.stop_txn; tombstone->prepare_state = WT_PREPARE_INPROGRESS; F_SET(tombstone, WT_UPDATE_PREPARE_RESTORED_FROM_DS); - F_SET(upd, WT_UPDATE_RESTORED_FROM_DS); /* * Mark the update also as in-progress if the update and tombstone are from same @@ -644,14 +712,12 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) unpack.tw.start_txn == unpack.tw.stop_txn) { upd->durable_ts = WT_TS_NONE; upd->prepare_state = WT_PREPARE_INPROGRESS; - F_SET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS); } tombstone->next = upd; } else { upd->durable_ts = WT_TS_NONE; upd->prepare_state = WT_PREPARE_INPROGRESS; - F_SET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS); tombstone = upd; } @@ -660,6 +726,23 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) } WT_CELL_FOREACH_END; + /* If the last prefix group was the best, track it. Save the best prefix group for the page. */ + if (prefix_count > best_prefix_count) { + best_prefix_start = prefix_start; + best_prefix_stop = prefix_stop; + } + page->prefix_start = best_prefix_start; + page->prefix_stop = best_prefix_stop; + + /* + * Backward cursor traversal can be too slow if we're forced to process long stretches of + * prefix-compressed keys to create every key as we walk backwards through the page, and we + * handle that by instantiating periodic keys when backward cursor traversal enters a new page. + * Mark the page as not needing that work if there aren't stretches of prefix-compressed keys. + */ + if (best_prefix_count <= 10) + F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS); + __wt_cache_page_inmem_incr(session, page, total_size); err: diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 4ea7884d4b7..b9d14f0a889 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -91,15 +91,16 @@ __wt_read_row_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, { WT_CELL_UNPACK_KV unpack; - WT_TIME_WINDOW_INIT(tw); /* - * If a value is simple and is globally visible at the time of reading a page into cache, we set - * the start time point as globally visible. + * Simple values are encoded at the time of reading a page into cache, in which case we set the + * start time point as globally visible. */ - if (__wt_row_leaf_value_exists(rip)) + if (__wt_row_leaf_value_is_encoded(rip)) { + WT_TIME_WINDOW_INIT(tw); return; + } - __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); + __wt_row_leaf_value_cell(session, page, rip, &unpack); WT_TIME_WINDOW_COPY(tw, &unpack.tw); } @@ -165,7 +166,7 @@ __wt_value_return_buf(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, WT_TIME_W } /* Take the value from the original page cell. */ - __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); + __wt_row_leaf_value_cell(session, page, rip, &unpack); if (tw != NULL) WT_TIME_WINDOW_COPY(tw, &unpack.tw); return (__wt_page_cell_data_ref(session, page, &unpack, buf)); diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 0fccaa8c801..5608242a5dd 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -194,6 +194,9 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root) * We may not have found any pages during salvage and there's no tree to flush. 
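A minimal sketch, not part of the patch, of how the prefix_start/prefix_stop range recorded above is meant to be consumed: a slot's key can be built straight from the group's root key only when the slot falls inside the saved range. The helper name is invented; the range check mirrors the one the patch adds in row_key.c further down, and the snippet assumes the usual wt_internal.h environment.

    static bool
    __slot_in_page_prefix_group(const WT_PAGE *page, uint32_t slot)
    {
        /* Slots after the root slot, up to and including prefix_stop, share the page prefix. */
        return (slot > page->prefix_start && slot <= page->prefix_stop);
    }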
*/ if (root->page != NULL) { + /* Make sure that the saved checkpoint information has been cleared. */ + WT_ASSERT(session, btree->ckpt == NULL); + btree->ckpt = ckptbase; ret = __wt_evict(session, root, WT_REF_MEM, WT_EVICT_CALL_CLOSING); root->page = NULL; @@ -1993,12 +1996,10 @@ __slvg_row_ovfl( */ for (rip = page->pg_row + start; start < stop; ++start, ++rip) { copy = WT_ROW_KEY_COPY(rip); - WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, NULL, &cell, NULL, NULL)); - if (cell != NULL) { - __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); - WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); - } - __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); + __wt_row_leaf_key_info(page, copy, NULL, &cell, NULL, NULL, NULL); + __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); + WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); + __wt_row_leaf_value_cell(session, page, rip, &unpack); WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); } return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 765d9240657..6fc62f0a52b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1782,14 +1782,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) if (type == WT_PAGE_ROW_LEAF) { /* - * Copy the first key from the original page into first ref in - * the new parent. Pages created in memory always have a - * "smallest" insert list, so look there first. If we don't - * find one, get the first key from the disk image. + * Copy the first key from the original page into first ref in the new parent. Pages created + * in memory always have a "smallest" insert list, so look there first. If we don't find + * one, get the first key from the disk image. * - * We can't just use the key from the original ref: it may have - * been suffix-compressed, and after the split the truncated key - * may not be valid. + * We can't just use the key from the original ref: it may have been suffix-compressed, and + * after the split the truncated key may not be valid. */ WT_ERR(__wt_scr_alloc(session, 0, &key)); if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index cc1d83e3335..9e452f39d2d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -274,7 +274,7 @@ __stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **st if (upd == NULL || (upd->type != WT_UPDATE_RESERVE && upd->type != WT_UPDATE_TOMBSTONE)) ++entry_cnt; if (upd == NULL) { - __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); + __wt_row_leaf_value_cell(session, page, rip, &unpack); if (unpack.type == WT_CELL_VALUE_OVFL) ++ovfl_cnt; } diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index cf2aca0fc87..039a9e7e823 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -213,7 +213,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) * Get a list of the checkpoints for this file. Empty objects have no checkpoints, in which case * there's no work to do. 
*/ - WT_ERR_NOTFOUND_OK(__wt_meta_ckptlist_get(session, name, false, &ckptbase), true); + WT_ERR_NOTFOUND_OK(__wt_meta_ckptlist_get(session, name, false, &ckptbase, NULL), true); if (ret == WT_NOTFOUND) { ret = 0; goto done; diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c index 12558339f97..1049b0d2186 100644 --- a/src/third_party/wiredtiger/src/btree/row_key.c +++ b/src/third_party/wiredtiger/src/btree/row_key.c @@ -8,100 +8,6 @@ #include "wt_internal.h" -static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t); - -/* - * __wt_row_leaf_keys -- - * Instantiate the interesting keys for random search of a page. - */ -int -__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - WT_BTREE *btree; - WT_DECL_ITEM(key); - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_ROW *rip; - uint32_t gap, i; - - btree = S2BT(session); - - if (page->entries == 0) { /* Just checking... */ - F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS); - return (0); - } - - /* - * Row-store leaf pages are written as one big prefix-compressed chunk, - * that is, only the first key on the page is not prefix-compressed, and - * to instantiate the last key on the page, you have to take the first - * key on the page and roll it forward to the end of the page. We don't - * want to do that on every page access, of course, so we instantiate a - * set of keys, essentially creating prefix chunks on the page, where we - * can roll forward from the closest, previous, instantiated key. The - * complication is that not all keys on a page are equal: we're doing a - * binary search on the page, which means there are keys we look at a - * lot (every time we search the page), and keys we never look at unless - * they are actually being searched for. This function figures out the - * "interesting" keys on a page, and then we sequentially walk that list - * instantiating those keys. - * - * Allocate a bit array and figure out the set of "interesting" keys, - * marking up the array. - */ - WT_RET(__wt_scr_alloc(session, 0, &key)); - WT_RET(__wt_scr_alloc(session, (uint32_t)__bitstr_size(page->entries), &tmp)); - memset(tmp->mem, 0, tmp->memsize); - - if ((gap = btree->key_gap) == 0) - gap = 1; - __inmem_row_leaf_slots(tmp->mem, 0, page->entries, gap); - - /* Instantiate the keys. */ - for (rip = page->pg_row, i = 0; i < page->entries; ++rip, ++i) - if (__bit_test(tmp->mem, i)) - WT_ERR(__wt_row_leaf_key_work(session, page, rip, key, true)); - - F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS); - -err: - __wt_scr_free(session, &key); - __wt_scr_free(session, &tmp); - return (ret); -} - -/* - * __inmem_row_leaf_slots -- - * Figure out the interesting slots of a page for random search, up to the specified depth. - */ -static void -__inmem_row_leaf_slots(uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap) -{ - uint32_t indx, limit; - - if (entries < gap) - return; - - /* - * !!! - * Don't clean this code up -- it deliberately looks like the binary - * search code. - * - * !!! - * There's got to be a function that would give me this information, but - * I don't see any performance reason we can't just do this recursively. - */ - limit = entries; - indx = base + (limit >> 1); - __bit_set(list, indx); - - __inmem_row_leaf_slots(list, base, limit >> 1, gap); - - base = indx + 1; - --limit; - __inmem_row_leaf_slots(list, base, limit >> 1, gap); -} - /* * __wt_row_leaf_key_copy -- * Get a copy of a row-store leaf-page key. 
@@ -131,21 +37,19 @@ __wt_row_leaf_key_work( WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK_KV *unpack, _unpack; - WT_DECL_ITEM(tmp); WT_DECL_RET; WT_IKEY *ikey; WT_ROW *rip, *jump_rip; - size_t size; - u_int last_prefix; - int jump_slot_offset, slot_offset; + size_t group_size, key_size; + uint32_t slot; + u_int jump_slot_offset, slot_offset; + uint8_t group_prefix, key_prefix, last_prefix; void *copy; - const void *p; + const void *group_key, *key_data; /* - * !!! - * It is unusual to call this function: most code should be calling the - * front-end, __wt_row_leaf_key, be careful if you're calling this code - * directly. + * It is unusual to call this function: most code should be calling the front-end, + * __wt_row_leaf_key, be careful if you're calling this code directly. */ btree = S2BT(session); @@ -154,10 +58,10 @@ __wt_row_leaf_key_work( jump_rip = NULL; jump_slot_offset = 0; - last_prefix = 0; + last_prefix = key_prefix = 0; - p = NULL; /* -Werror=maybe-uninitialized */ - size = 0; /* -Werror=maybe-uninitialized */ + key_data = NULL; /* -Werror=maybe-uninitialized */ + key_size = 0; /* -Werror=maybe-uninitialized */ direction = BACKWARD; for (slot_offset = 0;;) { @@ -171,17 +75,26 @@ switch_and_jump: rip = jump_rip; slot_offset = jump_slot_offset; } - copy = WT_ROW_KEY_COPY(rip); +overflow_retry: /* - * Figure out what the key looks like. + * Figure out what the key looks like. The row-store key can change underfoot; explicitly + * take a copy. */ - WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, &ikey, &cell, &p, &size)); + copy = WT_ROW_KEY_COPY(rip); + __wt_row_leaf_key_info(page, copy, &ikey, &cell, &key_data, &key_size, &key_prefix); /* 1: the test for a directly referenced on-page key. */ - if (cell == NULL) { - keyb->data = p; - keyb->size = size; + if (ikey == NULL && key_data != NULL) { + /* + * If there's a key without prefix compression, we're good to go, otherwise we have to + * deal with the prefix. + */ + if (key_prefix == 0) { + keyb->data = key_data; + keyb->size = key_size; + } else + goto prefix_continue; /* * If this is the key we originally wanted, we don't care if we're rolling forward or @@ -189,18 +102,19 @@ switch_and_jump: * normally happen, the fast-path code that front-ends this function will have figured * it out before we were called. * - * The key doesn't need to be instantiated, skip past that test. + * The key doesn't need to be instantiated, just return. */ if (slot_offset == 0) - goto done; + return (0); /* - * This key is not an overflow key by definition and - * isn't compressed in any way, we can use it to roll - * forward. - * If rolling backward, switch directions. - * If rolling forward: there's a bug somewhere, - * we should have hit this key when rolling backward. + * This key is not an overflow key by definition and isn't compressed in any way, we can + * use it to roll forward. + * + * If rolling backward, switch directions. + * + * If rolling forward: there's a bug somewhere, we should have hit this key when rolling + * backward. */ goto switch_and_jump; } @@ -212,119 +126,154 @@ switch_and_jump: * backward, or if it's an overflow key or not, it's what we wanted. Take a copy and * wrap up. * - * The key doesn't need to be instantiated, skip past that test. + * The key doesn't need to be instantiated, just return. 
*/ if (slot_offset == 0) { - keyb->data = p; - keyb->size = size; - goto done; + keyb->data = key_data; + keyb->size = key_size; + return (0); } /* - * If we wanted a different key and this key is an - * overflow key: - * If we're rolling backward, this key is useless - * to us because it doesn't have a valid prefix: keep - * rolling backward. - * If we're rolling forward, there's no work to be - * done because prefixes skip overflow keys: keep - * rolling forward. + * If we wanted a different key and this key is an overflow key: + * + * If we're rolling backward, this key is useless to us because it doesn't have a valid + * prefix: keep rolling backward. + * + * If we're rolling forward, there's no work to be done because prefixes skip overflow + * keys: keep rolling forward. */ if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) goto next; /* - * If we wanted a different key and this key is not an - * overflow key, it has a valid prefix, we can use it. - * If rolling backward, take a copy of the key and - * switch directions, we can roll forward from this key. - * If rolling forward, replace the key we've been - * building with this key, it's what we would have built - * anyway. - * In short: if it's not an overflow key, take a copy - * and roll forward. + * If we wanted a different key and this key is not an overflow key, it has a valid + * prefix, we can use it. + * + * If rolling backward, take a copy of the key and switch directions, we can roll + * forward from this key. + * + * If rolling forward, replace the key we've been building with this key, it's what we + * would have built anyway. + * + * In short: if it's not an overflow key, take a copy and roll forward. */ - keyb->data = p; - keyb->size = size; + keyb->data = key_data; + keyb->size = key_size; direction = FORWARD; goto next; } - /* - * It must be an on-page cell, unpack it. - */ + /* Unpack the on-page cell. */ __wt_cell_unpack_kv(session, page->dsk, cell, unpack); /* 3: the test for an on-page reference to an overflow key. */ - if (unpack->type == WT_CELL_KEY_OVFL) { + if (unpack->type == WT_CELL_KEY_OVFL || unpack->type == WT_CELL_KEY_OVFL_RM) { /* * If this is the key we wanted from the start, we don't care if it's an overflow key, * get a copy and wrap up. * - * Avoid racing with reconciliation deleting overflow keys. Deleted overflow keys must - * be instantiated first, acquire the overflow lock and check. Read the key if we still - * need to do so, but holding the overflow lock. Note we are not using the version of - * the cell-data-ref calls that acquire the overflow lock and do a look-aside into the - * tracking cache: this is an overflow key, not a value, meaning it's instantiated - * before being deleted, not copied into the tracking cache. + * We can race with reconciliation deleting overflow keys. Deleted overflow keys must be + * instantiated before deletion, acquire the overflow lock and check. If the key has + * been deleted, restart the slot and get the instantiated key, else read the key before + * releasing the lock. 
*/ if (slot_offset == 0) { __wt_readlock(session, &btree->ovfl_lock); - copy = WT_ROW_KEY_COPY(rip); - if (!__wt_row_leaf_key_info(page, copy, NULL, &cell, &keyb->data, &keyb->size)) { - __wt_cell_unpack_kv(session, page->dsk, cell, unpack); - ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb); + if (__wt_cell_type_raw(unpack->cell) == WT_CELL_KEY_OVFL_RM) { + __wt_readunlock(session, &btree->ovfl_lock); + goto overflow_retry; } + ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb); __wt_readunlock(session, &btree->ovfl_lock); - WT_ERR(ret); + WT_RET(ret); break; } /* * If we wanted a different key: - * If we're rolling backward, this key is useless - * to us because it doesn't have a valid prefix: keep - * rolling backward. - * If we're rolling forward, there's no work to be - * done because prefixes skip overflow keys: keep - * rolling forward. + * + * If we're rolling backward, this key is useless to us because it doesn't have a valid + * prefix: keep rolling backward. + * + * If we're rolling forward, there's no work to be done because prefixes skip overflow + * keys: keep rolling forward. */ goto next; } /* - * 4: the test for an on-page reference to a key that isn't - * prefix compressed. + * 4: the test for an on-page reference to a key that isn't prefix compressed. */ if (unpack->prefix == 0) { /* - * If this is the key we originally wanted, we don't - * care if we're rolling forward or backward, it's - * what we want. Take a copy and wrap up. + * If this is the key we originally wanted, we don't care if we're rolling forward or + * backward, it's what we want. Take a copy and wrap up. * - * If we wanted a different key, this key has a valid - * prefix, we can use it. - * If rolling backward, take a copy of the key and - * switch directions, we can roll forward from this key. - * If rolling forward there's a bug, we should have - * found this key while rolling backwards and switched - * directions then. + * If we wanted a different key, this key has a valid prefix, we can use it. + * + * If rolling backward, take a copy of the key and switch directions, we can roll + * forward from this key. + * + * If rolling forward there's a bug, we should have found this key while rolling + * backwards and switched directions then. * - * The key doesn't need to be instantiated, skip past - * that test. + * The key doesn't need to be instantiated, just return. */ - WT_ERR(__wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb)); + WT_RET(__wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb)); if (slot_offset == 0) - goto done; + return (0); goto switch_and_jump; } + key_data = unpack->data; + key_size = unpack->size; + key_prefix = unpack->prefix; + +prefix_continue: + /* + * Proceed with a prefix-compressed key. + * + * Prefix compression means we don't yet have a key, but there's a special case: if the key + * is part of the group of compressed key prefixes we saved when reading the page into + * memory, we can build a key for this slot. Otherwise we have to keep rolling forward or + * backward. + */ + slot = WT_ROW_SLOT(page, rip); + if (slot > page->prefix_start && slot <= page->prefix_stop) { + /* + * Get the root key's information (the row-store key can change underfoot; explicitly + * take a copy). 
Ignore the root key's size and prefix information because it must be + * large enough (else the current key couldn't have been prefix-compressed based on its + * value), and it can't have a prefix-compression value, it's a root key which is never + * prefix-compressed. + */ + copy = WT_ROW_KEY_COPY(&page->pg_row[page->prefix_start]); + + __wt_row_leaf_key_info(page, copy, NULL, NULL, &group_key, &group_size, &group_prefix); + if (group_key != NULL) { + WT_RET(__wt_buf_init(session, keyb, key_prefix + key_size)); + memcpy(keyb->mem, group_key, key_prefix); + memcpy((uint8_t *)keyb->mem + key_prefix, key_data, key_size); + keyb->size = key_prefix + key_size; + /* + * If this is the key we originally wanted, we don't care if we're rolling forward + * or backward, it's what we want. + * + * The key doesn't need to be instantiated, just return. + */ + if (slot_offset == 0) + return (0); + goto switch_and_jump; + } + } + /* * 5: an on-page reference to a key that's prefix compressed. - * If rolling backward, keep looking for something we can - * use. - * If rolling forward, build the full key and keep rolling - * forward. + * + * If rolling backward, keep looking for something we can use. + * + * If rolling forward, build the full key and keep rolling forward. */ if (direction == BACKWARD) { /* @@ -337,28 +286,26 @@ switch_and_jump: * find a key without a prefix. */ if (slot_offset == 0) - last_prefix = unpack->prefix; - if (slot_offset == 0 || last_prefix > unpack->prefix) { + last_prefix = key_prefix; + if (slot_offset == 0 || last_prefix > key_prefix) { jump_rip = rip; jump_slot_offset = slot_offset; - last_prefix = unpack->prefix; + last_prefix = key_prefix; } } if (direction == FORWARD) { - p = unpack->data; - size = unpack->size; - /* * Grow the buffer as necessary as well as ensure data has been copied into local buffer * space, then append the suffix to the prefix already in the buffer. * * Don't grow the buffer unnecessarily or copy data we don't need, truncate the item's - * data length to the prefix bytes. + * CURRENT data length to the prefix bytes before growing the buffer. */ - keyb->size = unpack->prefix; - WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size)); - memcpy((uint8_t *)keyb->data + keyb->size, p, size); - keyb->size += size; + WT_ASSERT(session, keyb->size >= key_prefix); + keyb->size = key_prefix; + WT_RET(__wt_buf_grow(session, keyb, key_prefix + key_size)); + memcpy((uint8_t *)keyb->data + key_prefix, key_data, key_size); + keyb->size = key_prefix + key_size; if (slot_offset == 0) break; @@ -379,37 +326,35 @@ next: /* * Optionally instantiate the key: there's a cost to figuring out a key value in a leaf page - * with prefix-compressed or Huffman encoded keys, amortize the cost by instantiating a copy of - * the calculated key in allocated memory. We don't instantiate keys when pages are first - * brought into memory because it's wasted effort if the page is only read by a cursor in sorted - * order. If, instead, the page is read by a cursor in reverse order, we immediately instantiate - * periodic keys for the page (otherwise the reverse walk would be insanely slow). If, instead, - * the page is randomly searched, we instantiate keys as they are accessed (meaning, for - * example, as long as the binary search only touches one-half of the page, the only keys we - * instantiate will be in that half of the page). + * with prefix-compressed keys, amortize the cost by instantiating a copy of the calculated key + * in allocated memory. 
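A condensed sketch of the key construction shown just above, pulled out of the diff context for readability: copy the shared prefix bytes from the group's root key, then append this slot's suffix bytes. The helper itself is illustrative only (the variable names come from the surrounding code), and it assumes the usual wt_internal.h environment.

    static int
    __build_key_from_group(WT_SESSION_IMPL *session, WT_ITEM *keyb, const void *group_key,
      const void *key_data, size_t key_size, uint8_t key_prefix)
    {
        /* Size the buffer for the shared prefix plus this key's suffix bytes. */
        WT_RET(__wt_buf_init(session, keyb, (size_t)key_prefix + key_size));
        memcpy(keyb->mem, group_key, key_prefix);                      /* bytes shared with the root key */
        memcpy((uint8_t *)keyb->mem + key_prefix, key_data, key_size); /* this key's suffix */
        keyb->size = (size_t)key_prefix + key_size;
        return (0);
    }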
We don't instantiate keys when pages are first brought into memory + * because it's wasted effort if the page is only read by a cursor in sorted order. If, instead, + * the page is read by a cursor in reverse order, we immediately instantiate periodic keys for + * the page (otherwise the reverse walk would be insanely slow). If, instead, the page is + * randomly searched, we instantiate keys as they are accessed (meaning, for example, as long as + * the binary search only touches one-half of the page, the only keys we instantiate will be in + * that half of the page). */ if (instantiate) { copy = WT_ROW_KEY_COPY(rip_arg); - WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, &ikey, &cell, NULL, NULL)); - if (ikey == NULL) { - WT_ERR(__wt_row_ikey_alloc( - session, WT_PAGE_DISK_OFFSET(page, cell), keyb->data, keyb->size, &ikey)); + __wt_row_leaf_key_info(page, copy, &ikey, &cell, NULL, NULL, NULL); - /* - * Serialize the swap of the key into place: on success, update the page's memory - * footprint, on failure, free the allocated memory. - */ - if (__wt_atomic_cas_ptr((void *)&WT_ROW_KEY_COPY(rip), copy, ikey)) - __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + ikey->size); - else - __wt_free(session, ikey); - } - } + /* Check if we raced with another thread instantiating the key before doing real work. */ + if (ikey != NULL) + return (0); + WT_RET(__wt_row_ikey_alloc( + session, WT_PAGE_DISK_OFFSET(page, cell), keyb->data, keyb->size, &ikey)); -done: -err: - __wt_scr_free(session, &tmp); - return (ret); + /* + * Serialize the swap of the key into place: on success, update the page's memory footprint, + * on failure, free the allocated memory. + */ + if (__wt_atomic_cas_ptr((void *)&WT_ROW_KEY_COPY(rip), copy, ikey)) + __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + ikey->size); + else + __wt_free(session, ikey); + } + return (0); } /* diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c index bd1d172cbaa..527d839f2ec 100644 --- a/src/third_party/wiredtiger/src/config/config_collapse.c +++ b/src/third_party/wiredtiger/src/config/config_collapse.c @@ -222,9 +222,8 @@ __config_merge_format_next(WT_SESSION_IMPL *session, const char *prefix, size_t continue; /* - * The test is complicated by matching empty entries - * "foo=" against nested structures "foo,bar=", where - * the latter is a replacement for the former. + * The test is complicated by matching empty entries "foo=" against nested structures + * "foo,bar=", where the latter is a replacement for the former. 
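A hedged illustration of the replacement case described above; the configuration keys are hypothetical and the snippet is not from the patch.

    /*
     * When collapsing, an earlier empty entry such as "checkpoint=" is superseded by a
     * later nested structure with the same prefix, so
     *
     *     "checkpoint=,checkpoint=(wait=60)"
     *
     * collapses to just "checkpoint=(wait=60)".
     */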
*/ if (len2 > len1 && (ep + 1)->k[len1] == SEPC && memcmp(ep->k, (ep + 1)->k, len1) == 0) continue; diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 25cbb0e8b33..654abaf40d5 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -590,6 +590,7 @@ static const WT_CONFIG_CHECK confchk_object_meta[] = { {"collator", "string", NULL, NULL, NULL, 0}, {"columns", "list", NULL, NULL, NULL, 0}, {"dictionary", "int", NULL, "min=0", NULL, 0}, {"encryption", "category", NULL, NULL, confchk_WT_SESSION_create_encryption_subconfigs, 2}, + {"flush", "string", NULL, NULL, NULL, 0}, {"format", "string", NULL, "choices=[\"btree\"]", NULL, 0}, {"huffman_key", "string", NULL, NULL, NULL, 0}, {"huffman_value", "string", NULL, NULL, NULL, 0}, {"id", "string", NULL, NULL, NULL, 0}, @@ -1264,7 +1265,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", "block_compressor=,cache_resident=false,checkpoint=," "checkpoint_backup_info=,checkpoint_lsn=,checksum=uncompressed," "collator=,columns=,dictionary=0,encryption=(keyid=,name=)," - "format=btree,huffman_key=,huffman_value=,id=," + "flush=0,format=btree,huffman_key=,huffman_value=,id=," "ignore_in_memory_cache_size=false,internal_item_max=0," "internal_key_max=0,internal_key_truncate=true," "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0," @@ -1276,7 +1277,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", "tiered_storage=(auth_token=,bucket=,bucket_prefix=," "local_retention=300,name=,object_target_size=10M),value_format=u" ",verbose=[],version=(major=0,minor=0),write_timestamp_usage=none", - confchk_object_meta, 46}, + confchk_object_meta, 47}, {"table.meta", "app_metadata=,assert=(commit_timestamp=none," "durable_timestamp=none,read_timestamp=none,write_timestamp=off)," diff --git a/src/third_party/wiredtiger/src/config/test_config.c b/src/third_party/wiredtiger/src/config/test_config.c index bb46c2a1f24..c517ba96f5a 100644 --- a/src/third_party/wiredtiger/src/config/test_config.c +++ b/src/third_party/wiredtiger/src/config/test_config.c @@ -3,7 +3,7 @@ #include "wt_internal.h" static const WT_CONFIG_CHECK confchk_stat_cache_size_subconfigs[] = { - {"enabled", "boolean", NULL, NULL, NULL, 0}, {"limit", "string", NULL, NULL, NULL, 0}, + {"enabled", "boolean", NULL, NULL, NULL, 0}, {"limit", "int", NULL, "min=0", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; static const WT_CONFIG_CHECK confchk_runtime_monitor_subconfigs[] = { @@ -27,7 +27,7 @@ static const WT_CONFIG_CHECK confchk_insert_config_subconfigs[] = { {"value_size", "int", NULL, "min=0,max=1000000000", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; static const WT_CONFIG_CHECK confchk_ops_per_transaction_subconfigs[] = { - {"max", "string", NULL, NULL, NULL, 0}, {"min", "string", NULL, NULL, NULL, 0}, + {"max", "string", NULL, NULL, NULL, 0}, {"min", "int", NULL, "min=0", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; static const WT_CONFIG_CHECK confchk_update_config_subconfigs[] = { @@ -54,7 +54,9 @@ static const WT_CONFIG_CHECK confchk_workload_generator_subconfigs[] = { {"value_size", "int", NULL, "min=0,max=1000000000", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; static const WT_CONFIG_CHECK confchk_workload_tracking_subconfigs[] = { - {"enabled", "boolean", NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; + {"enabled", "boolean", NULL, NULL, NULL, 0}, + {"interval", 
"string", NULL, "choices=[\"s\",\"m\",\"h\"]", NULL, 0}, + {"op_count", "int", NULL, "min=1,max=10000", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; static const WT_CONFIG_CHECK confchk_example_test[] = { {"cache_size_mb", "int", NULL, "min=0,max=100000000000", NULL, 0}, @@ -63,7 +65,7 @@ static const WT_CONFIG_CHECK confchk_example_test[] = { {"runtime_monitor", "category", NULL, NULL, confchk_runtime_monitor_subconfigs, 4}, {"timestamp_manager", "category", NULL, NULL, confchk_timestamp_manager_subconfigs, 5}, {"workload_generator", "category", NULL, NULL, confchk_workload_generator_subconfigs, 15}, - {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 1}, + {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 3}, {NULL, NULL, NULL, NULL, NULL, 0}}; static const WT_CONFIG_CHECK confchk_poc_test[] = { @@ -73,35 +75,35 @@ static const WT_CONFIG_CHECK confchk_poc_test[] = { {"runtime_monitor", "category", NULL, NULL, confchk_runtime_monitor_subconfigs, 4}, {"timestamp_manager", "category", NULL, NULL, confchk_timestamp_manager_subconfigs, 5}, {"workload_generator", "category", NULL, NULL, confchk_workload_generator_subconfigs, 15}, - {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 1}, + {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 3}, {NULL, NULL, NULL, NULL, NULL, 0}}; static const WT_CONFIG_ENTRY config_entries[] = { {"example_test", - "cache_size_mb=0,duration_seconds=0,enable_logging=true," - "runtime_monitor=(enabled=false,interval=s,op_count=1," - "stat_cache_size=(enabled=false,limit=))," - "timestamp_manager=(enabled=false,interval=s,oldest_lag=0," - "op_count=1,stable_lag=0),workload_generator=(collection_count=1," - "enabled=false,insert_config=(interval=s,key_size=0,op_count=1," - "value_size=0),insert_threads=0,interval=s,interval=s,key_count=0" - ",key_size=0,op_count=1,op_count=1,ops_per_transaction=(max=1," - "min=),read_threads=0,update_config=(interval=s,key_size=0," - "op_count=1,value_size=0),update_threads=0,value_size=0)," - "workload_tracking=(enabled=false)", + "cache_size_mb=0,duration_seconds=0,enable_logging=false," + "runtime_monitor=(enabled=true,interval=s,op_count=1," + "stat_cache_size=(enabled=false,limit=0))," + "timestamp_manager=(enabled=true,interval=s,oldest_lag=1," + "op_count=1,stable_lag=1),workload_generator=(collection_count=1," + "enabled=true,insert_config=(interval=s,key_size=5,op_count=1," + "value_size=5),insert_threads=0,interval=s,interval=s,key_count=0" + ",key_size=5,op_count=1,op_count=1,ops_per_transaction=(max=1," + "min=0),read_threads=0,update_config=(interval=s,key_size=5," + "op_count=1,value_size=5),update_threads=0,value_size=5)," + "workload_tracking=(enabled=true,interval=s,op_count=1)", confchk_example_test, 7}, {"poc_test", - "cache_size_mb=0,duration_seconds=0,enable_logging=true," - "runtime_monitor=(enabled=false,interval=s,op_count=1," - "stat_cache_size=(enabled=false,limit=))," - "timestamp_manager=(enabled=false,interval=s,oldest_lag=0," - "op_count=1,stable_lag=0),workload_generator=(collection_count=1," - "enabled=false,insert_config=(interval=s,key_size=0,op_count=1," - "value_size=0),insert_threads=0,interval=s,interval=s,key_count=0" - ",key_size=0,op_count=1,op_count=1,ops_per_transaction=(max=1," - "min=),read_threads=0,update_config=(interval=s,key_size=0," - "op_count=1,value_size=0),update_threads=0,value_size=0)," - "workload_tracking=(enabled=false)", + 
"cache_size_mb=0,duration_seconds=0,enable_logging=false," + "runtime_monitor=(enabled=true,interval=s,op_count=1," + "stat_cache_size=(enabled=false,limit=0))," + "timestamp_manager=(enabled=true,interval=s,oldest_lag=1," + "op_count=1,stable_lag=1),workload_generator=(collection_count=1," + "enabled=true,insert_config=(interval=s,key_size=5,op_count=1," + "value_size=5),insert_threads=0,interval=s,interval=s,key_count=0" + ",key_size=5,op_count=1,op_count=1,ops_per_transaction=(max=1," + "min=0),read_threads=0,update_config=(interval=s,key_size=5," + "op_count=1,value_size=5),update_threads=0,value_size=5)," + "workload_tracking=(enabled=true,interval=s,op_count=1)", confchk_poc_test, 7}, {NULL, NULL, NULL, 0}}; diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 47a28e016f2..3d7b5fed416 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1287,7 +1287,7 @@ __conn_query_timestamp(WT_CONNECTION *wt_conn, char *hex_timestamp, const char * conn = (WT_CONNECTION_IMPL *)wt_conn; CONNECTION_API_CALL(conn, session, query_timestamp, config, cfg); - WT_TRET(__wt_txn_query_timestamp(session, hex_timestamp, cfg, true)); + ret = __wt_txn_query_timestamp(session, hex_timestamp, cfg, true); err: API_END_RET(session, ret); } @@ -1306,7 +1306,7 @@ __conn_set_timestamp(WT_CONNECTION *wt_conn, const char *config) conn = (WT_CONNECTION_IMPL *)wt_conn; CONNECTION_API_CALL(conn, session, set_timestamp, config, cfg); - WT_TRET(__wt_txn_global_set_timestamp(session, cfg)); + ret = __wt_txn_global_set_timestamp(session, cfg); err: API_END_RET(session, ret); } @@ -1326,7 +1326,7 @@ __conn_rollback_to_stable(WT_CONNECTION *wt_conn, const char *config) CONNECTION_API_CALL(conn, session, rollback_to_stable, config, cfg); WT_STAT_CONN_INCR(session, txn_rts); - WT_TRET(__wt_rollback_to_stable(session, cfg, false)); + ret = __wt_rollback_to_stable(session, cfg, false); err: API_END_RET(session, ret); } @@ -2810,16 +2810,13 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c WT_ERR(__wt_tiered_conn_config(session, cfg, false)); /* - * The metadata/log encryptor is configured after extensions, since - * extensions may load encryptors. We have to do this before creating - * the metadata file. + * The metadata/log encryptor is configured after extensions, since extensions may load + * encryptors. We have to do this before creating the metadata file. * - * The encryption customize callback needs the fully realized set of - * encryption args, as simply grabbing "encryption" doesn't work. - * As an example, configuration for the current call may just be - * "encryption=(secretkey=xxx)", with encryption.name, - * encryption.keyid being 'inherited' from the stored base - * configuration. + * The encryption customize callback needs the fully realized set of encryption args, as simply + * grabbing "encryption" doesn't work. As an example, configuration for the current call may + * just be "encryption=(secretkey=xxx)", with encryption.name, encryption.keyid being + * 'inherited' from the stored base configuration. 
*/ WT_ERR(__wt_config_gets_none(session, cfg, "encryption.name", &cval)); WT_ERR(__wt_config_gets_none(session, cfg, "encryption.keyid", &keyid)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup_incr.c b/src/third_party/wiredtiger/src/cursor/cur_backup_incr.c index 9b31214a0ee..0f1cd879d72 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup_incr.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup_incr.c @@ -78,23 +78,28 @@ __curbackup_incr_blkmod(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_CURSOR_BAC /* * The rename configuration string component was added later. So don't error if we don't - * find it in the string. If we don't have it, we're not doing a rename. + * find it in the string. If we don't have it, we're not doing a rename. Otherwise rename + * forces full copies, there is no need to traverse the blocks information. */ WT_ERR_NOTFOUND_OK(__wt_config_subgets(session, &v, "rename", &b), true); - if (ret == 0 && b.val) + if (ret == 0 && b.val) { + cb->nbits = 0; + cb->offset = 0; + cb->bit_offset = 0; F_SET(cb, WT_CURBACKUP_RENAME); - else + } else { F_CLR(cb, WT_CURBACKUP_RENAME); - /* - * We found a match. Load the block information into the cursor. - */ - if ((ret = __wt_config_subgets(session, &v, "blocks", &b)) == 0) { - WT_ERR(__wt_backup_load_incr(session, &b, &cb->bitstring, cb->nbits)); - cb->bit_offset = 0; - F_SET(cb, WT_CURBACKUP_INCR_INIT); + /* + * We found a match. Load the block information into the cursor. + */ + if ((ret = __wt_config_subgets(session, &v, "blocks", &b)) == 0) { + WT_ERR(__wt_backup_load_incr(session, &b, &cb->bitstring, cb->nbits)); + cb->bit_offset = 0; + F_SET(cb, WT_CURBACKUP_INCR_INIT); + } + WT_ERR_NOTFOUND_OK(ret, false); } - WT_ERR_NOTFOUND_OK(ret, false); break; } WT_ERR_NOTFOUND_OK(ret, false); diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index bc9057a47d5..046a7ac52db 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -423,7 +423,8 @@ __curfile_remove(WT_CURSOR *cursor) WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0); err: - CURSOR_UPDATE_API_END(session, ret); + /* If we've lost an initial position, we must fail. */ + CURSOR_UPDATE_API_END_RETRY(session, ret, !positioned || F_ISSET(cursor, WT_CURSTD_KEY_INT)); return (ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_hs.c b/src/third_party/wiredtiger/src/cursor/cur_hs.c index a090d0fe0e1..74dd2899fe0 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_hs.c +++ b/src/third_party/wiredtiger/src/cursor/cur_hs.c @@ -867,6 +867,13 @@ retry: goto retry; WT_ERR(ret); +#ifdef HAVE_DIAGNOSTIC + /* Do a search again and call next to check the key order. */ + WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(cbt, &file_cursor->key, true)); + WT_ASSERT(session, ret == 0); + WT_ERR_NOTFOUND_OK(__curhs_file_cursor_next(session, file_cursor), false); +#endif + /* Insert doesn't maintain a position across calls, clear resources. 
*/ if (0) { err: diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index 29398aedb2b..7f00ea9bc3d 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -1422,9 +1422,8 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx WT_RET(__curjoin_open_main(session, cjoin, entry)); /* - * When we are repacking index keys to remove the - * primary key, we never want to transform trailing - * 'u'. Use no-op padding to force this. + * When we are repacking index keys to remove the primary key, we never want to + * transform trailing 'u'. Use no-op padding to force this. */ cindex = (WT_CURSOR_INDEX *)ref_cursor; len = strlen(cindex->iface.key_format) + 3; diff --git a/src/third_party/wiredtiger/src/docs/transactions.dox b/src/third_party/wiredtiger/src/docs/transactions.dox index 3bfae988747..c594b6b15c9 100644 --- a/src/third_party/wiredtiger/src/docs/transactions.dox +++ b/src/third_party/wiredtiger/src/docs/transactions.dox @@ -16,14 +16,11 @@ operate on data concurrently because they have the following properties: WiredTiger supports transactions with the following caveats to the ACID properties: -- the maximum level of isolation supported is snapshot isolation. - See @ref transaction_isolation for more details. +- the maximum level of isolation supported is snapshot isolation and all updates must be done at + snapshot isolation. See @ref transaction_isolation for more details. - transactional updates are made durable by a combination of checkpoints and logging. See @ref checkpoint for information on checkpoint durability and @ref durability for information on commit-level durability. -- each transaction's uncommitted changes must fit in memory: for - efficiency, WiredTiger does not write to the log until a transaction - commits. @section transactions_api Transactional API @@ -101,8 +98,8 @@ transactional readers, an operation may fail and return ::WT_ROLLBACK. @section transaction_isolation Isolation levels WiredTiger supports <code>read-uncommitted</code>, -<code>read-committed</code> and <code>snapshot</code> isolation levels; -the default isolation level is <code>snapshot</code>. +<code>read-committed</code> and <code>snapshot</code> isolation levels; the default isolation +level is <code>snapshot</code>, and all updates must be done at snapshot isolation. - <code>read-uncommitted</code>: Transactions can see changes made by other transactions before those diff --git a/src/third_party/wiredtiger/src/history/hs_cursor.c b/src/third_party/wiredtiger/src/history/hs_cursor.c index 1799b068e7e..8dfad4cd983 100644 --- a/src/third_party/wiredtiger/src/history/hs_cursor.c +++ b/src/third_party/wiredtiger/src/history/hs_cursor.c @@ -15,10 +15,17 @@ int __wt_hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert) { + WT_BTREE *hs_btree; WT_CURSOR *hs_cursor; WT_DECL_RET; + WT_SESSION_IMPL *session; bool leaf_found; +#ifdef HAVE_DIAGNOSTIC + WT_PAGE *page; +#endif + hs_btree = CUR2BT(hs_cbt); + session = CUR2S(hs_cbt); hs_cursor = &hs_cbt->iface; leaf_found = false; @@ -27,7 +34,15 @@ __wt_hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert) * perform a full search. */ if (hs_cbt->ref != NULL) { - WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt), +#ifdef HAVE_DIAGNOSTIC + WT_ORDERED_READ(page, hs_cbt->ref->page); +#endif + /* + * The page must be pinned and we should have a hazard pointer on that. 
Ensure the page is + * not evictable. + */ + WT_ASSERT(session, __wt_hazard_check(session, hs_cbt->ref, NULL) != NULL); + WT_WITH_BTREE(session, hs_btree, ret = __wt_row_search(hs_cbt, srch_key, insert, hs_cbt->ref, false, &leaf_found)); WT_RET(ret); @@ -40,13 +55,16 @@ __wt_hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert) (hs_cbt->compare != 0 && (hs_cbt->slot == 0 || hs_cbt->slot == hs_cbt->ref->page->entries - 1))) leaf_found = false; + + /* Ensure there is no eviction happened on this page. */ + WT_ASSERT(session, page == hs_cbt->ref->page); if (!leaf_found) hs_cursor->reset(hs_cursor); } if (!leaf_found) - WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt), - ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL)); + WT_WITH_BTREE( + session, hs_btree, ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL)); if (ret == 0 && !insert) { WT_ERR(__wt_key_return(hs_cbt)); diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index 0e7e2424c57..6e30d425aec 100644 --- a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -274,6 +274,8 @@ __wt_hs_insert_updates( WT_DECL_ITEM(prev_full_value); WT_DECL_ITEM(tmp); WT_DECL_RET; +/* Limit the number of consecutive reverse modifies. */ +#define WT_MAX_CONSECUTIVE_REVERSE_MODIFY 10 /* If the limit is exceeded, we will insert a full update to the history store */ #define MAX_REVERSE_MODIFY_NUM 16 WT_MODIFY entries[MAX_REVERSE_MODIFY_NUM]; @@ -284,7 +286,7 @@ __wt_hs_insert_updates( WT_UPDATE *non_aborted_upd, *oldest_upd, *prev_upd, *tombstone, *upd; WT_TIME_WINDOW tw; wt_off_t hs_size; - uint64_t insert_cnt, max_hs_size; + uint64_t insert_cnt, max_hs_size, modify_cnt; uint32_t i; uint8_t *p; int nentries; @@ -363,7 +365,12 @@ __wt_hs_insert_updates( } first_globally_visible_upd = min_ts_upd = out_of_order_ts_upd = NULL; - enable_reverse_modify = true; + + /* + * Reverse deltas are only supported on 'S' and 'u' value formats. + */ + enable_reverse_modify = + (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u")); /* * The algorithm assumes the oldest update on the update chain in memory is either a full @@ -374,10 +381,11 @@ __wt_hs_insert_updates( * newer than a TOMBSTONE must be a full update. * * The algorithm walks from the oldest update, or the most recently inserted into history - * store update, to the newest update and build full updates along the way. It sets the stop - * time point of the update to the start time point of the next update, squashes the updates - * that are from the same transaction and of the same start timestamp, calculates reverse - * modification if prev_upd is a MODIFY, and inserts the update to the history store. + * store update, to the newest update and builds full updates along the way. It sets the + * stop time point of the update to the start time point of the next update, squashes the + * updates that are from the same transaction and of the same start timestamp, checks if the + * update can be written as reverse modification, and inserts the update to the history + * store either as a full update or a reverse modification. * * It deals with the following scenarios: * 1) We only have full updates on the chain and we only insert full updates to @@ -486,6 +494,7 @@ __wt_hs_insert_updates( * time point, we can squash updates with the same start time point as the onpage update * away. 
*/ + modify_cnt = 0; for (; updates.size > 0 && !(upd->txnid == list->onpage_upd->txnid && upd->start_ts == list->onpage_upd->start_ts); @@ -605,7 +614,9 @@ __wt_hs_insert_updates( * Calculate reverse modify and clear the history store records with timestamps when * inserting the first update. Always write on-disk data store updates to the history * store as a full update because the on-disk update will be the base update for all the - * updates that are older than the on-disk update. + * updates that are older than the on-disk update. Limit the number of consecutive + * reverse modifies for standard updates. We want to ensure we do not store a large + * chain of reverse modifies as to impact read performance. * * Due to concurrent operation of checkpoint and eviction, it is possible that history * store may have more recent versions of a key than the on-disk version. Without a @@ -613,17 +624,20 @@ __wt_hs_insert_updates( * the RTS. */ nentries = MAX_REVERSE_MODIFY_NUM; - if (!F_ISSET(upd, WT_UPDATE_DS) && upd->type == WT_UPDATE_MODIFY && - enable_reverse_modify && + if (!F_ISSET(upd, WT_UPDATE_DS) && !F_ISSET(prev_upd, WT_UPDATE_DS) && + enable_reverse_modify && modify_cnt < WT_MAX_CONSECUTIVE_REVERSE_MODIFY && __wt_calc_modify(session, prev_full_value, full_value, prev_full_value->size / 10, entries, &nentries) == 0) { WT_ERR(__wt_modify_pack(hs_cursor, entries, nentries, &modify_value)); WT_ERR(__hs_insert_record( session, hs_cursor, btree, key, WT_UPDATE_MODIFY, modify_value, &tw)); __wt_scr_free(session, &modify_value); - } else + ++modify_cnt; + } else { + modify_cnt = 0; WT_ERR(__hs_insert_record( session, hs_cursor, btree, key, WT_UPDATE_STANDARD, full_value, &tw)); + } /* Flag the update as now in the history store. */ F_SET(upd, WT_UPDATE_HS); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 2a2bd5aca2f..20a86779a91 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -123,34 +123,31 @@ if (__update) \ F_SET((s)->txn, WT_TXN_UPDATE); -/* End a transactional API call, optional retry on deadlock. */ -#define TXN_API_END_RETRY(s, ret, retry) \ - API_END(s, ret); \ - if (__update) \ - F_CLR((s)->txn, WT_TXN_UPDATE); \ - if (__autotxn) { \ - if (F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT)) \ - F_CLR((s)->txn, WT_TXN_AUTOCOMMIT); \ - else if ((ret) == 0) \ - (ret) = __wt_txn_commit((s), NULL); \ - else { \ - if (retry) \ - WT_TRET(__wt_session_copy_values(s)); \ - WT_TRET(__wt_txn_rollback((s), NULL)); \ - if (((ret) == 0 || (ret) == WT_ROLLBACK) && (retry)) { \ - (ret) = 0; \ - continue; \ - } \ - WT_TRET(__wt_session_reset_cursors(s, false)); \ - } \ - } \ - break; \ - } \ +/* End a transactional API call, optional retry on rollback. */ +#define TXN_API_END(s, ret, retry) \ + API_END(s, ret); \ + if (__update) \ + F_CLR((s)->txn, WT_TXN_UPDATE); \ + if (__autotxn) { \ + if (F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT)) \ + F_CLR((s)->txn, WT_TXN_AUTOCOMMIT); \ + else if ((ret) == 0) \ + (ret) = __wt_txn_commit((s), NULL); \ + else { \ + if (retry) \ + WT_TRET(__wt_session_copy_values(s)); \ + WT_TRET(__wt_txn_rollback((s), NULL)); \ + if ((retry) && (ret) == WT_ROLLBACK) { \ + (ret) = 0; \ + continue; \ + } \ + WT_TRET(__wt_session_reset_cursors(s, false)); \ + } \ + } \ + break; \ + } \ while (1) -/* End a transactional API call, retry on deadlock. 
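As a rough, application-level illustration of the retry-on-rollback behavior the reworked TXN_API_END macro above provides internally (not part of the patch): with an already-open public WT_SESSION handle, the same pattern looks like the fragment below, where do_work() stands in for arbitrary application operations.

    for (;;) {
        if ((ret = session->begin_transaction(session, NULL)) != 0)
            break;
        if ((ret = do_work(session)) == 0) /* hypothetical application work */
            ret = session->commit_transaction(session, NULL);
        else
            (void)session->rollback_transaction(session, NULL);
        if (ret != WT_ROLLBACK) /* retry only when the transaction was rolled back */
            break;
    }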
*/ -#define TXN_API_END(s, ret) TXN_API_END_RETRY(s, ret, 1) - /* * In almost all cases, API_END is returning immediately, make it simple. If a session or connection * method is about to return WT_NOTFOUND (some underlying object was not found), map it to ENOENT, @@ -265,7 +262,9 @@ CURSOR_UPDATE_API_CALL(cur, s, n); \ JOINABLE_CURSOR_CALL_CHECK(cur) -#define CURSOR_UPDATE_API_END(s, ret) \ - if ((ret) == WT_PREPARE_CONFLICT) \ - (ret) = WT_ROLLBACK; \ - TXN_API_END(s, ret) +#define CURSOR_UPDATE_API_END_RETRY(s, ret, retry) \ + if ((ret) == WT_PREPARE_CONFLICT) \ + (ret) = WT_ROLLBACK; \ + TXN_API_END(s, ret, retry) + +#define CURSOR_UPDATE_API_END(s, ret) CURSOR_UPDATE_API_END_RETRY(s, ret, true) diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 24562280ac1..5283c46df55 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -232,8 +232,8 @@ struct __wt_ovfl_reuse { * We also configure a larger than default internal page size to accommodate for larger history * store keys. We do that to reduce the chances of having to create overflow keys on the page. */ -#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY -#define WT_HS_COMPRESSOR "snappy" +#ifdef HAVE_BUILTIN_EXTENSION_ZSTD +#define WT_HS_COMPRESSOR "zstd" #else #define WT_HS_COMPRESSOR "none" #endif @@ -635,14 +635,17 @@ struct __wt_page { } u; /* - * Page entries, type and flags are positioned at the end of the WT_PAGE union to reduce cache - * misses in the row-store search function. + * Page entry count, page-wide prefix information, type and flags are positioned at the end of + * the WT_PAGE union to reduce cache misses when searching row-store pages. * * The entries field only applies to leaf pages, internal pages use the page-index entries * instead. */ uint32_t entries; /* Leaf page entries */ + uint32_t prefix_start; /* Best page prefix starting slot */ + uint32_t prefix_stop; /* Maximum slot to which the best page prefix applies */ + #define WT_PAGE_IS_INTERNAL(page) \ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) #define WT_PAGE_INVALID 0 /* Invalid page */ @@ -669,6 +672,19 @@ struct __wt_page { uint8_t unused[2]; /* Unused padding */ + size_t memory_footprint; /* Memory attached to the page */ + + /* Page's on-disk representation: NULL for pages created in memory. */ + const WT_PAGE_HEADER *dsk; + + /* If/when the page is modified, we need lots more information. */ + WT_PAGE_MODIFY *modify; + + /* + * !!! + * This is the 64 byte boundary, try to keep hot fields above here. + */ + /* * The page's read generation acts as an LRU value for each page in the * tree; it is used by the eviction server thread to select pages to be @@ -698,16 +714,6 @@ struct __wt_page { #define WT_READGEN_STEP 100 uint64_t read_gen; - size_t memory_footprint; /* Memory attached to the page */ - - /* Page's on-disk representation: NULL for pages created in memory. */ - const WT_PAGE_HEADER *dsk; - - /* If/when the page is modified, we need lots more information. */ - WT_PAGE_MODIFY *modify; - - /* This is the 64 byte boundary, try to keep hot fields above here. */ - uint64_t cache_create_gen; /* Page create timestamp */ uint64_t evict_pass_gen; /* Eviction pass generation */ }; @@ -1301,10 +1307,9 @@ struct __wt_insert_head { NULL : \ (page)->modify->mod_row_update[WT_ROW_SLOT(page, ip)]) /* - * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the - * "one per WT_ROW slot" insert array. 
That's because the insert array requires - * an extra slot to hold keys that sort before any key found on the original - * page. + * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the "one per WT_ROW slot" + * insert array. That's because the insert array requires an extra slot to hold keys that sort + * before any key found on the original page. */ #define WT_ROW_INSERT_SMALLEST(page) \ ((page)->modify == NULL || (page)->modify->mod_row_insert == NULL ? \ diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index a253887faf6..6f3a6e086b3 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -102,7 +102,8 @@ typedef enum { /* Start position for eviction walk */ struct __wt_btree { WT_DATA_HANDLE *dhandle; - WT_CKPT *ckpt; /* Checkpoint information */ + WT_CKPT *ckpt; /* Checkpoint information */ + size_t ckpt_bytes_allocated; /* Checkpoint information array allocation size */ WT_BTREE_TYPE type; /* Type */ @@ -115,8 +116,6 @@ struct __wt_btree { uint32_t id; /* File ID, for logging */ - uint32_t key_gap; /* Row-store prefix key gap */ - uint32_t allocsize; /* Allocation size */ uint32_t maxintlpage; /* Internal page max size */ uint32_t maxintlkey; /* Internal page max key size */ @@ -256,22 +255,25 @@ struct __wt_btree { WT_EVICT_WALK_TYPE evict_start_type; /* - * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't automatically generate these - * flag values for that reason, there's no way to start at an offset. + * Flag values up to 0xfff are reserved for WT_DHANDLE_XXX. See comment with dhandle flags for an + * explanation. + * + * We don't automatically generate these flag values for this reason; there's no way to start at an + * offset. 
*/ -#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */ -#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */ -#define WT_BTREE_CLOSED 0x000400u /* Handle closed */ -#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */ -#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */ -#define WT_BTREE_NO_CHECKPOINT 0x002000u /* Disable checkpoints */ -#define WT_BTREE_NO_LOGGING 0x004000u /* Disable logging */ -#define WT_BTREE_OBSOLETE_PAGES 0x008000u /* Handle has obsolete pages */ -#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */ -#define WT_BTREE_SALVAGE 0x020000u /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x040000u /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x080000u /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x100000u /* Handle is for verify */ +#define WT_BTREE_ALTER 0x0001000u /* Handle is for alter */ +#define WT_BTREE_BULK 0x0002000u /* Bulk-load handle */ +#define WT_BTREE_CLOSED 0x0004000u /* Handle closed */ +#define WT_BTREE_IGNORE_CACHE 0x0008000u /* Cache-resident object */ +#define WT_BTREE_IN_MEMORY 0x0010000u /* Cache-resident object */ +#define WT_BTREE_NO_CHECKPOINT 0x0020000u /* Disable checkpoints */ +#define WT_BTREE_NO_LOGGING 0x0040000u /* Disable logging */ +#define WT_BTREE_OBSOLETE_PAGES 0x0080000u /* Handle has obsolete pages */ +#define WT_BTREE_READONLY 0x0100000u /* Handle is readonly */ +#define WT_BTREE_SALVAGE 0x0200000u /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x0400000u /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x0800000u /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x1000000u /* Handle is for verify */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/btree_inline.h b/src/third_party/wiredtiger/src/include/btree_inline.h index 41510592410..9c0e0ce784e 100644 --- a/src/third_party/wiredtiger/src/include/btree_inline.h +++ b/src/third_party/wiredtiger/src/include/btree_inline.h @@ -874,151 +874,177 @@ __wt_ref_key_clear(WT_REF *ref) * Return a row-store leaf page key referenced by a WT_ROW if it can be had without unpacking a * cell, and information about the cell, if the key isn't cheaply available. */ -static inline bool -__wt_row_leaf_key_info( - WT_PAGE *page, void *copy, WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep) +static inline void +__wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, + size_t *sizep, uint8_t *prefixp) { WT_IKEY *ikey; uintptr_t v; v = (uintptr_t)copy; -/* - * A row-store leaf page key is in one of two places: if instantiated, - * the WT_ROW pointer references a WT_IKEY structure, otherwise, it - * references an on-page offset. Further, on-page keys are in one of - * two states: if the key is a simple key (not an overflow key or prefix - * compressed, all of which are likely), the key's offset/size is encoded - * in the pointer. Otherwise, the offset is to the key's on-page cell. - * - * Now the magic: allocated memory must be aligned to store any standard - * type, and we expect some standard type to require at least quad-byte - * alignment, so allocated memory should have some clear low-order bits. - * On-page objects consist of an offset/length pair: the maximum page - * size currently fits into 29 bits, so we use the low-order bits of the - * pointer to mark the other bits of the pointer as encoding the key's - * location and length. This breaks if allocated memory isn't aligned, - * of course. 
- * - * In this specific case, we use bit 0x01 to mark an on-page cell, bit - * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair, - * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells - * is: - * 29 bits page offset of the key's cell, - * 2 bits flags - * - * The bit pattern for on-page keys is: - * 32 bits key length, - * 29 bits page offset of the key's bytes, - * 2 bits flags - * - * But, while that allows us to skip decoding simple key cells, we also - * want to skip decoding the value cell in the case where the value cell - * is also simple/short. We use bit 0x03 to mark an encoded on-page key - * and value pair. The bit pattern for on-page key/value pairs is: - * 9 bits key length, - * 13 bits value length, - * 20 bits page offset of the key's bytes, - * 20 bits page offset of the value's bytes, - * 2 bits flags - * - * These bit patterns are in-memory only, of course, so can be modified - * (we could even tune for specific workloads). Generally, the fields - * are larger than the anticipated values being stored (512B keys, 8KB - * values, 1MB pages), hopefully that won't be necessary. - * - * This function returns a list of things about the key (instantiation - * reference, cell reference and key/length pair). Our callers know - * the order in which we look things up and the information returned; - * for example, the cell will never be returned if we are working with - * an on-page key. - */ + /* + * A row-store leaf page key is in one of two places: if instantiated, the WT_ROW pointer + * references a WT_IKEY structure, otherwise, it references an on-page item. Further, on-page + * items are in one of two states: if the key is a simple key (not an overflow key, which is + * likely), the key's offset, size and prefix is encoded in the 8B of pointer. Otherwise, the + * offset is to the key's on-page cell. + * + * This function returns information from a set of things about the key (WT_IKEY reference, cell + * reference and/or key/length/prefix triplet). Our callers know the order we resolve items and + * what information will be returned. Specifically, the caller gets a key (in the form of a + * pointer to the bytes, a length and a prefix length in all cases where we can get it without + * unpacking a cell), plus an optional WT_IKEY reference, and in all cases, a pointer to the + * on-page cell. Our caller's test is generally if there is a returned key or not, falling back + * to the returned cell. + * + * Now the magic: allocated memory must be aligned to store any standard type and we expect some + * standard type to require at least quad-byte alignment, so allocated memory should have two + * clear low-order bits. On-page objects consist of an offset/length pair and a prefix in the + * case of a key: the maximum page size is 29 bits (512MB), the remaining bits hold the key or + * value location and bytes. This breaks if allocated memory isn't aligned, of course. + * + * In this specific case, we use bit 0x01 to mark an on-page cell, bit 0x02 to mark an on-page + * key, 0x03 to mark an on-page key/value pair, otherwise it's a WT_IKEY reference. The bit + * pattern for on-page cells is: + * + * 29 bits offset of the key's cell (512MB) + * 2 bits 0x01 flag + * + * The on-page cell is our fallback: if a key or value won't fit into our encoding (unlikely, + * but possible), we fall back to using a cell reference, which obviously has enough room for + * all possible values. 
+ * + * The next encoding is for on-page keys: + * + * 19 bits key's length (512KB) + * 6 bits offset of the key's bytes from the key's cell (32B) + * 8 bits key's prefix length (256B, the maximum possible value) + * 29 bits offset of the key's cell (512MB) + * 2 bits 0x02 flag + * + * But, while that allows us to skip decoding simple key cells, we also want to skip decoding + * value cells in the case where the value cell is also simple/short. We use bit 0x03 to mark + * an encoded on-page key and value pair. The encoding for on-page key/value pairs is: + * + * 13 bits value's length (8KB) + * 6 bits offset of the value's bytes from the end of the key's cell (32B) + * 12 bits key's length (4KB) + * 6 bits offset of the key's bytes from the key's cell (32B) + * 8 bits key's prefix length (256B, the maximum possible value) + * 17 bits offset of the key's cell (128KB) + * 2 bits 0x03 flag + * + * A reason for the complexity here is we need to be able to find the key and value cells from + * the encoded form: for that reason we store an offset to the key cell plus a second offset to + * the start of the key's bytes. Finding the value cell is reasonably straight-forward, we use + * the location of the key to find the cell immediately following the key. + * + * A simple extension of this encoding would be to encode zero-length values similarly to how we + * encode short values. However, zero-length values are noted by adjacent key cells on the page, + * and we detect that without decoding the second cell by checking the cell's type byte. Tests + * indicate it's slightly slower to encode missing value cells than to check the cell type, so + * we don't bother with the encoding. + * + * Generally, the bitfields are expected to be larger than the stored items (4/8KB keys/values, + * 128KB pages), but the underlying limits are larger and we can see items we cannot encode in + * this way. For example, if an application creates pages larger than 128KB, encoded key/value + * offsets after the maximum offset (the offsets of cells at the end of the page), couldn't be + * encoded. If that's not working, these bit patterns can be changed as they are in-memory only + * (we could even tune for specific workloads in specific trees). + */ +#define WT_KEY_FLAG_BITS 0x03 + #define WT_CELL_FLAG 0x01 +/* key cell offset field size can hold maximum value, WT_CELL_MAX_KEY_CELL_OFFSET not needed. */ #define WT_CELL_ENCODE_OFFSET(v) ((uintptr_t)(v) << 2) -#define WT_CELL_DECODE_OFFSET(v) (((v)&0xFFFFFFFF) >> 2) +#define WT_CELL_DECODE_OFFSET(v) ((v) >> 2) #define WT_K_FLAG 0x02 -#define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32) -#define WT_K_DECODE_KEY_LEN(v) ((v) >> 32) -#define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 2) -#define WT_K_DECODE_KEY_OFFSET(v) (((v)&0xFFFFFFFF) >> 2) +#define WT_K_MAX_KEY_LEN (0x80000 - 1) +#define WT_K_DECODE_KEY_LEN(v) (((v)&0xffffe00000000000) >> 45) +#define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 45) +#define WT_K_MAX_KEY_OFFSET (0x40 - 1) +#define WT_K_DECODE_KEY_OFFSET(v) (((v)&0x001f8000000000) >> 39) +#define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 39) +/* Key prefix field size can hold maximum value, WT_K_MAX_KEY_PREFIX not needed. */ +#define WT_K_DECODE_KEY_PREFIX(v) (((v)&0x00007f80000000) >> 31) +#define WT_K_ENCODE_KEY_PREFIX(v) ((uintptr_t)(v) << 31) +/* Key cell offset field size can hold maximum value, WT_K_MAX_KEY_CELL_OFFSET not needed. 
*/ +#define WT_K_DECODE_KEY_CELL_OFFSET(v) (((v)&0x0000007ffffffc) >> 2) +#define WT_K_ENCODE_KEY_CELL_OFFSET(v) ((uintptr_t)(v) << 2) #define WT_KV_FLAG 0x03 -#define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 55) -#define WT_KV_DECODE_KEY_LEN(v) ((v) >> 55) -#define WT_KV_MAX_KEY_LEN (0x200 - 1) -#define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 42) -#define WT_KV_DECODE_VALUE_LEN(v) (((v)&0x007FFC0000000000) >> 42) #define WT_KV_MAX_VALUE_LEN (0x2000 - 1) -#define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 22) -#define WT_KV_DECODE_KEY_OFFSET(v) (((v)&0x000003FFFFC00000) >> 22) -#define WT_KV_MAX_KEY_OFFSET (0x100000 - 1) -#define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 2) -#define WT_KV_DECODE_VALUE_OFFSET(v) (((v)&0x00000000003FFFFC) >> 2) -#define WT_KV_MAX_VALUE_OFFSET (0x100000 - 1) - switch (v & 0x03) { - case WT_CELL_FLAG: - /* On-page cell: no instantiated key. */ +#define WT_KV_DECODE_VALUE_LEN(v) (((v)&0xfff8000000000000) >> 51) +#define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 51) +#define WT_KV_MAX_VALUE_OFFSET (0x40 - 1) +#define WT_KV_DECODE_VALUE_OFFSET(v) (((v)&0x07e00000000000) >> 45) +#define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 45) +#define WT_KV_MAX_KEY_LEN (0x1000 - 1) +#define WT_KV_DECODE_KEY_LEN(v) (((v)&0x001ffe00000000) >> 33) +#define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 33) +/* Key offset encoding is the same for key and key/value forms, WT_KV_MAX_KEY_OFFSET not needed. */ +#define WT_KV_DECODE_KEY_OFFSET(v) (((v)&0x000001f8000000) >> 27) +#define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 27) +/* Key prefix encoding is the same for key and key/value forms, WT_KV_MAX_KEY_PREFIX not needed. */ +#define WT_KV_DECODE_KEY_PREFIX(v) (((v)&0x00000007f80000) >> 19) +#define WT_KV_ENCODE_KEY_PREFIX(v) ((uintptr_t)(v) << 19) +#define WT_KV_MAX_KEY_CELL_OFFSET (0x20000 - 1) +#define WT_KV_DECODE_KEY_CELL_OFFSET(v) (((v)&0x0000000007fffc) >> 2) +#define WT_KV_ENCODE_KEY_CELL_OFFSET(v) ((uintptr_t)(v) << 2) + + switch (v & WT_KEY_FLAG_BITS) { + case WT_CELL_FLAG: /* On-page cell. */ if (ikeyp != NULL) *ikeyp = NULL; if (cellp != NULL) *cellp = (WT_CELL *)WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v)); - return (false); - case WT_K_FLAG: - /* Encoded key: no instantiated key, no cell. */ - if (cellp != NULL) - *cellp = NULL; + if (datap != NULL) { + *(void **)datap = NULL; + *sizep = 0; + *prefixp = 0; + } + break; + case WT_K_FLAG: /* Encoded key. */ if (ikeyp != NULL) *ikeyp = NULL; + if (cellp != NULL) + *cellp = (WT_CELL *)WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_CELL_OFFSET(v)); if (datap != NULL) { - *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v)); + *(void **)datap = + WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_CELL_OFFSET(v) + WT_K_DECODE_KEY_OFFSET(v)); *sizep = WT_K_DECODE_KEY_LEN(v); - return (true); + *prefixp = (uint8_t)WT_K_DECODE_KEY_PREFIX(v); } - return (false); - case WT_KV_FLAG: - /* Encoded key/value pair: no instantiated key, no cell. */ - if (cellp != NULL) - *cellp = NULL; + break; + case WT_KV_FLAG: /* Encoded key/value pair. 
*/ if (ikeyp != NULL) *ikeyp = NULL; + if (cellp != NULL) + *cellp = (WT_CELL *)WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_CELL_OFFSET(v)); if (datap != NULL) { - *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_OFFSET(v)); + *(void **)datap = WT_PAGE_REF_OFFSET( + page, WT_KV_DECODE_KEY_CELL_OFFSET(v) + WT_KV_DECODE_KEY_OFFSET(v)); *sizep = WT_KV_DECODE_KEY_LEN(v); - return (true); + *prefixp = (uint8_t)WT_KV_DECODE_KEY_PREFIX(v); } - return (false); - } - - /* Instantiated key. */ - ikey = (WT_IKEY *)copy; - if (ikeyp != NULL) - *ikeyp = (WT_IKEY *)copy; - if (cellp != NULL) - *cellp = (WT_CELL *)WT_PAGE_REF_OFFSET(page, ikey->cell_offset); - if (datap != NULL) { - *(void **)datap = WT_IKEY_DATA(ikey); - *sizep = ikey->size; - return (true); + break; + default: /* Instantiated key. */ + ikey = (WT_IKEY *)copy; + if (ikeyp != NULL) + *ikeyp = ikey; + if (cellp != NULL) + *cellp = ikey->cell_offset == 0 ? + NULL : + (WT_CELL *)WT_PAGE_REF_OFFSET(page, ikey->cell_offset); + if (datap != NULL) { + *(void **)datap = WT_IKEY_DATA(ikey); + *sizep = ikey->size; + *prefixp = 0; + } + break; } - return (false); -} - -/* - * __wt_row_leaf_key_set_cell -- - * Set a WT_ROW to reference an on-page row-store leaf cell. - */ -static inline void -__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell) -{ - uintptr_t v; - - /* - * See the comment in __wt_row_leaf_key_info for an explanation of the magic. - */ - v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) | WT_CELL_FLAG; - WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); - WT_ROW_KEY_SET(rip, v); } /* @@ -1028,54 +1054,92 @@ __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell) static inline void __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack) { - uintptr_t v; + uintptr_t key_offset, v; /* * See the comment in __wt_row_leaf_key_info for an explanation of the magic. + * + * Not checking the prefix and cell offset sizes, the fields hold any legitimate value. */ - v = WT_K_ENCODE_KEY_LEN(unpack->size) | - WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) | WT_K_FLAG; - WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); + key_offset = (uintptr_t)WT_PTRDIFF(unpack->data, unpack->cell); + if (unpack->type != WT_CELL_KEY || key_offset > WT_K_MAX_KEY_OFFSET || + unpack->size > WT_K_MAX_KEY_LEN) + v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->cell)) | WT_CELL_FLAG; + else + v = WT_K_ENCODE_KEY_CELL_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->cell)) | + WT_K_ENCODE_KEY_PREFIX(unpack->prefix) | WT_K_ENCODE_KEY_OFFSET(key_offset) | + WT_K_ENCODE_KEY_LEN(unpack->size) | WT_K_FLAG; + WT_ROW_KEY_SET(rip, v); } /* * __wt_row_leaf_value_set -- - * Set a WT_ROW to reference an on-page row-store leaf value. + * Set a WT_ROW to reference an on-page row-store leaf key and value pair, if possible. */ static inline void -__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack) +__wt_row_leaf_value_set(WT_ROW *rip, WT_CELL_UNPACK_KV *unpack) { - uintptr_t key_len, key_offset, value_offset, v; + uintptr_t value_offset, value_size, v; + /* The row-store key can change underfoot; explicitly take a copy. */ v = (uintptr_t)WT_ROW_KEY_COPY(rip); /* * See the comment in __wt_row_leaf_key_info for an explanation of the magic. + * + * Only encoded keys can be upgraded to encoded key/value pairs. 
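The bit layout above can be exercised in isolation. The following self-contained sketch reuses the shift and mask values of the WT_K_* ("encoded on-page key") form added by this change and round-trips one key descriptor; uint64_t stands in for the 64-bit uintptr_t used in the tree, and the example field values are arbitrary:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Shift/mask values copied from the WT_K_* macros added by this change. */
#define K_FLAG 0x02u
#define K_ENCODE_KEY_LEN(v) ((uint64_t)(v) << 45)
#define K_DECODE_KEY_LEN(v) (((v)&0xffffe00000000000) >> 45)
#define K_ENCODE_KEY_OFFSET(v) ((uint64_t)(v) << 39)
#define K_DECODE_KEY_OFFSET(v) (((v)&0x001f8000000000) >> 39)
#define K_ENCODE_KEY_PREFIX(v) ((uint64_t)(v) << 31)
#define K_DECODE_KEY_PREFIX(v) (((v)&0x00007f80000000) >> 31)
#define K_ENCODE_KEY_CELL_OFFSET(v) ((uint64_t)(v) << 2)
#define K_DECODE_KEY_CELL_OFFSET(v) (((v)&0x0000007ffffffc) >> 2)

int
main(void)
{
    /* Arbitrary example: key cell at page offset 4096, key bytes 3B past the cell,
     * 8B of prefix shared with the previous key, 21B of suffix data. */
    uint64_t v = K_ENCODE_KEY_LEN(21) | K_ENCODE_KEY_OFFSET(3) | K_ENCODE_KEY_PREFIX(8) |
      K_ENCODE_KEY_CELL_OFFSET(4096) | K_FLAG;

    assert((v & 0x03) == K_FLAG);
    printf("len=%llu offset=%llu prefix=%llu cell=%llu\n",
      (unsigned long long)K_DECODE_KEY_LEN(v), (unsigned long long)K_DECODE_KEY_OFFSET(v),
      (unsigned long long)K_DECODE_KEY_PREFIX(v), (unsigned long long)K_DECODE_KEY_CELL_OFFSET(v));
    return (0);
}

Running it prints len=21 offset=3 prefix=8 cell=4096, showing the fields decode back to the values that were packed.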
*/ - if (!(v & WT_K_FLAG)) /* Already an encoded key */ + if ((v & WT_KEY_FLAG_BITS) != WT_K_FLAG) return; - key_len = WT_K_DECODE_KEY_LEN(v); /* Key length */ - if (key_len > WT_KV_MAX_KEY_LEN) + if (WT_K_DECODE_KEY_CELL_OFFSET(v) > WT_KV_MAX_KEY_CELL_OFFSET) /* Key cell offset */ return; - if (unpack->size > WT_KV_MAX_VALUE_LEN) /* Value length */ + /* + * Not checking the prefix size, the field sizes are the same in both encodings. + * + * Not checking the key offset, the field sizes are the same in both encodings. + */ + if (WT_K_DECODE_KEY_LEN(v) > WT_KV_MAX_KEY_LEN) /* Key len */ return; - key_offset = WT_K_DECODE_KEY_OFFSET(v); /* Page offsets */ - if (key_offset > WT_KV_MAX_KEY_OFFSET) + value_offset = (uintptr_t)WT_PTRDIFF(unpack->data, unpack->cell); + if (value_offset > WT_KV_MAX_VALUE_OFFSET) /* Value offset */ return; - value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data); - if (value_offset > WT_KV_MAX_VALUE_OFFSET) + value_size = unpack->size; + if (value_size > WT_KV_MAX_VALUE_LEN) /* Value length */ return; - v = WT_KV_ENCODE_KEY_LEN(key_len) | WT_KV_ENCODE_VALUE_LEN(unpack->size) | - WT_KV_ENCODE_KEY_OFFSET(key_offset) | WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG; - WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); + v = WT_KV_ENCODE_KEY_CELL_OFFSET(WT_K_DECODE_KEY_CELL_OFFSET(v)) | + WT_KV_ENCODE_KEY_PREFIX(WT_K_DECODE_KEY_PREFIX(v)) | + WT_KV_ENCODE_KEY_OFFSET(WT_K_DECODE_KEY_OFFSET(v)) | + WT_KV_ENCODE_KEY_LEN(WT_K_DECODE_KEY_LEN(v)) | WT_KV_ENCODE_VALUE_OFFSET(value_offset) | + WT_KV_ENCODE_VALUE_LEN(value_size) | WT_KV_FLAG; WT_ROW_KEY_SET(rip, v); } /* + * __wt_row_leaf_key_free -- + * Discard any memory allocated for an instantiated key. + */ +static inline void +__wt_row_leaf_key_free(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) +{ + WT_IKEY *ikey; + void *copy; + + /* The row-store key can change underfoot; explicitly take a copy. */ + copy = WT_ROW_KEY_COPY(rip); + + /* + * If the key was a WT_IKEY allocation (that is, if it points somewhere other than the original + * page), free the memory. + */ + __wt_row_leaf_key_info(page, copy, &ikey, NULL, NULL, NULL, NULL); + __wt_free(session, ikey); +} + +/* * __wt_row_leaf_key -- * Set a buffer to reference a row-store leaf page key as cheaply as possible. */ @@ -1083,7 +1147,12 @@ static inline int __wt_row_leaf_key( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, bool instantiate) { + WT_CELL *cell; + size_t group_size, key_size; + uint32_t slot; + uint8_t group_prefix, key_prefix; void *copy; + const void *group_key, *key_data; /* * A front-end for __wt_row_leaf_key_work, here to inline fast paths. @@ -1093,12 +1162,34 @@ __wt_row_leaf_key( copy = WT_ROW_KEY_COPY(rip); /* - * All we handle here are on-page keys (which should be a common case), and instantiated keys - * (which start out rare, but become more common as a leaf page is searched, instantiating - * prefix-compressed keys). + * Handle keys taken directly from the disk image (which should be a common case), instantiated + * keys (rare initially, but possibly more common as leaf page search instantiates keys), and + * keys built using the most-used page key prefix. + * + * The most-used page key prefix: the longest group of compressed key prefixes on the page that + * can be built from a single, fully instantiated key on the page, was tracked when the page was + * read. Build keys in that group by appending the key's bytes to the root key from which it was + * compressed. 
*/ - if (__wt_row_leaf_key_info(page, copy, NULL, NULL, &key->data, &key->size)) + __wt_row_leaf_key_info(page, copy, NULL, &cell, &key_data, &key_size, &key_prefix); + if (key_data != NULL && key_prefix == 0) { + key->data = key_data; + key->size = key_size; return (0); + } + slot = WT_ROW_SLOT(page, rip); + if (key_data != NULL && slot > page->prefix_start && slot <= page->prefix_stop) { + /* The row-store key can change underfoot; explicitly take a copy. */ + copy = WT_ROW_KEY_COPY(&page->pg_row[page->prefix_start]); + __wt_row_leaf_key_info(page, copy, NULL, NULL, &group_key, &group_size, &group_prefix); + if (group_key != NULL) { + WT_RET(__wt_buf_init(session, key, key_prefix + key_size)); + memcpy(key->mem, group_key, key_prefix); + memcpy((uint8_t *)key->mem + key_prefix, key_data, key_size); + key->size = key_prefix + key_size; + return (0); + } + } /* * The alternative is an on-page cell with some kind of compressed or overflow key that's never @@ -1108,55 +1199,103 @@ __wt_row_leaf_key( } /* - * __wt_row_leaf_value_cell -- - * Return the unpacked value for a row-store leaf page key. + * __wt_row_leaf_key_instantiate -- + * Instantiate the keys on a leaf page as needed. */ -static inline void -__wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, - WT_CELL_UNPACK_KV *kpack, WT_CELL_UNPACK_KV *vpack) +static inline int +__wt_row_leaf_key_instantiate(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_CELL *kcell, *vcell; - WT_CELL_UNPACK_KV unpack; - size_t size; - void *copy, *key; + WT_CELL *cell; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_ROW *rip; + size_t key_size; + uint32_t i, slot; + uint8_t key_prefix; + u_int skip; + void *copy; + const void *key_data; - size = 0; /* -Werror=maybe-uninitialized */ - key = NULL; /* -Werror=maybe-uninitialized */ + /* + * Cursor previous traversals will be too slow in the case of a set of prefix-compressed keys + * requiring long roll-forward processing. In the worst case, each key would require processing + * every key appearing before it on the page as we walk backwards through the page. If we're + * doing a cursor previous call, and this page has never been checked for excessively long + * stretches of prefix-compressed keys, do it now. + */ + if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) + return (0); + F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS); - /* If we already have an unpacked key cell, use it. */ - if (kpack != NULL) - vcell = (WT_CELL *)((uint8_t *)kpack->cell + __wt_cell_total_len(kpack)); - else { + /* Walk the keys, making sure there's something easy to work with periodically. */ + skip = 0; + WT_ROW_FOREACH (page, rip, i) { /* - * The row-store key can change underfoot; explicitly take a copy. + * Get the key's information. The row-store key can change underfoot; explicitly take a + * copy. */ copy = WT_ROW_KEY_COPY(rip); + __wt_row_leaf_key_info(page, copy, NULL, &cell, &key_data, &key_size, &key_prefix); /* - * Figure out where the key is, step past it to the value cell. The test for a cell not - * being set tells us that we have an on-page key, otherwise we're looking at an - * instantiated key or on-page cell, both of which require an unpack of the key's cell to - * find the value cell that follows. + * If the key isn't prefix compressed, or is a prefix-compressed key we can derive from the + * group record, we're done. 
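The "most-used page key prefix" reconstruction in __wt_row_leaf_key above amounts to two copies: the first key_prefix bytes come from the group's fully built root key, the remainder from the on-page suffix. A toy standalone illustration with made-up key bytes:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char *group_key = "employee-000123"; /* fully built root key of the prefix group */
    const char *suffix = "4";                  /* on-page bytes of a compressed key */
    size_t prefix = 14;                        /* bytes shared with the root key */
    char key[64];

    /* Same shape as the memcpy pair in __wt_row_leaf_key: prefix bytes, then suffix bytes. */
    memcpy(key, group_key, prefix);
    memcpy(key + prefix, suffix, strlen(suffix));
    key[prefix + strlen(suffix)] = '\0';

    printf("%s\n", key); /* employee-000124 */
    return (0);
}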
*/ - if (__wt_row_leaf_key_info(page, copy, NULL, &kcell, &key, &size) && kcell == NULL) - vcell = (WT_CELL *)((uint8_t *)key + size); - else { - __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack); - vcell = (WT_CELL *)((uint8_t *)unpack.cell + __wt_cell_total_len(&unpack)); + slot = WT_ROW_SLOT(page, rip); + if (key_data != NULL && + (key_prefix == 0 || (slot > page->prefix_start && slot <= page->prefix_stop))) { + skip = 0; + continue; + } + + /* + * Skip overflow keys: we'll instantiate them on demand and they don't require any special + * processing (but they don't help with long strings of prefix compressed keys, either, so + * we'll likely want to instantiate the first key we find after a long stretch of overflow + * keys). More importantly, we don't want to instantiate them for a cursor traversal, we + * only want to instantiate them for a tree search, as that's likely to happen repeatedly. + */ + if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) { + ++skip; + continue; + } + + /* + * If we skip 10 keys, instantiate one, limiting how far we're forced to roll backward. (The + * value 10 was chosen for no particular reason.) There are still cases where we might not + * need to instantiate this key (for example, a key too large to be encoded, but still + * on-page and not prefix-compressed). Let the underlying worker function figure that out, + * we should have found the vast majority of cases by now. + */ + if (++skip >= 10) { + if (key == NULL) + WT_ERR(__wt_scr_alloc(session, 0, &key)); + WT_ERR(__wt_row_leaf_key(session, page, rip, key, true)); + skip = 0; } } - __wt_cell_unpack_kv(session, page->dsk, __wt_cell_leaf_value_parse(page, vcell), vpack); +err: + __wt_scr_free(session, &key); + return (ret); } /* - * __wt_row_leaf_value_exists -- - * Check if the value for a row-store leaf page encoded key/value pair exists. + * __wt_row_leaf_value_is_encoded -- + * Return if the value for a row-store leaf page is an encoded key/value pair. */ static inline bool -__wt_row_leaf_value_exists(WT_ROW *rip) +__wt_row_leaf_value_is_encoded(WT_ROW *rip) { - return (((uintptr_t)WT_ROW_KEY_COPY(rip) & 0x03) == WT_KV_FLAG); + uintptr_t v; + + /* The row-store key can change underfoot; explicitly take a copy. */ + v = (uintptr_t)WT_ROW_KEY_COPY(rip); + + /* + * See the comment in __wt_row_leaf_key_info for an explanation of the magic. + */ + return ((v & WT_KEY_FLAG_BITS) == WT_KV_FLAG); } /* @@ -1171,11 +1310,22 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) /* The row-store key can change underfoot; explicitly take a copy. */ v = (uintptr_t)WT_ROW_KEY_COPY(rip); - /* - * See the comment in __wt_row_leaf_key_info for an explanation of the magic. - */ - if ((v & 0x03) == WT_KV_FLAG) { - value->data = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v)); + if ((v & WT_KEY_FLAG_BITS) == WT_KV_FLAG) { + /* + * See the comment in __wt_row_leaf_key_info for an explanation of the magic. + * + * Normally a value is represented by the value's cell in the disk image (or an update), but + * there is a fast path for returning a simple value, where it's worth the additional effort + * of encoding the value in the per-row reference and retrieving it. This function does that + * work, while most value retrieval goes through the "return the unpacked cell" version. 
+ * + * The value's data is the page offset of the key's cell, plus the key's offset, plus the + * key's size, plus the value's offset: in other words, we know where the key's cell starts, + * the key's data ends the key's cell, and the value cell immediately follows, Skip past the + * key cell to the value cell, then skip to the start of the value's data. + */ + value->data = (uint8_t *)WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_CELL_OFFSET(v)) + + WT_KV_DECODE_KEY_OFFSET(v) + WT_KV_DECODE_KEY_LEN(v) + WT_KV_DECODE_VALUE_OFFSET(v); value->size = WT_KV_DECODE_VALUE_LEN(v); return (true); } @@ -1183,6 +1333,55 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) } /* + * __wt_row_leaf_value_cell -- + * Return the unpacked value for a row-store leaf page key. + */ +static inline void +__wt_row_leaf_value_cell( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *vpack) +{ + WT_CELL *kcell, *vcell; + WT_CELL_UNPACK_KV unpack; + WT_IKEY *ikey; + uintptr_t v; + + /* The row-store key can change underfoot; explicitly take a copy. */ + v = (uintptr_t)WT_ROW_KEY_COPY(rip); + + kcell = vcell = NULL; + switch (v & WT_KEY_FLAG_BITS) { + case WT_CELL_FLAG: + /* We have a direct reference the key's cell, step past it to the value's cell. */ + kcell = (WT_CELL *)WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v)); + break; + case WT_K_FLAG: + /* We have an encoded on-page key, the value's cell follows the key's data. */ + vcell = (WT_CELL *)((uint8_t *)WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_CELL_OFFSET(v)) + + WT_K_DECODE_KEY_OFFSET(v) + WT_K_DECODE_KEY_LEN(v)); + break; + case WT_KV_FLAG: + /* We have an encoded on-page key/value pair, the value's cell follows the key's data. */ + vcell = (WT_CELL *)((uint8_t *)WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_CELL_OFFSET(v)) + + WT_KV_DECODE_KEY_OFFSET(v) + WT_KV_DECODE_KEY_LEN(v)); + break; + default: + /* We have an instantiated key, the key cell's offset is included in the structure. */ + ikey = (WT_IKEY *)v; + kcell = + ikey->cell_offset == 0 ? NULL : (WT_CELL *)WT_PAGE_REF_OFFSET(page, ikey->cell_offset); + break; + } + + /* If we only have the key cell, unpack it and skip past it to the value cell. */ + if (vcell == NULL) { + __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack); + vcell = (WT_CELL *)((uint8_t *)unpack.cell + __wt_cell_total_len(&unpack)); + } + + __wt_cell_unpack_kv(session, page->dsk, __wt_cell_leaf_value_parse(page, vcell), vpack); +} + +/* * __wt_ref_addr_copy -- * Return a copy of the WT_REF address information. */ @@ -1726,14 +1925,13 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32 bool acquired; /* - * This function is here to simplify the error handling during hazard - * pointer coupling so we never leave a hazard pointer dangling. The - * assumption is we're holding a hazard pointer on "held", and want to - * acquire a hazard pointer on "want", releasing the hazard pointer on + * This function is here to simplify the error handling during hazard pointer coupling so we + * never leave a hazard pointer dangling. The assumption is we're holding a hazard pointer on + * "held", and want to acquire a hazard pointer on "want", releasing the hazard pointer on * "held" when we're done. * - * When walking the tree, we sometimes swap to the same page. Fast-path - * that to avoid thinking about error handling. + * When walking the tree, we sometimes swap to the same page. Fast-path that to avoid thinking + * about error handling. 
*/ if (held == want) return (0); diff --git a/src/third_party/wiredtiger/src/include/buf_inline.h b/src/third_party/wiredtiger/src/include/buf_inline.h index f38a632b4e4..610ccf8d698 100644 --- a/src/third_party/wiredtiger/src/include/buf_inline.h +++ b/src/third_party/wiredtiger/src/include/buf_inline.h @@ -13,8 +13,14 @@ static inline int __wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) { - return ( - size > buf->memsize || !WT_DATA_IN_ITEM(buf) ? __wt_buf_grow_worker(session, buf, size) : 0); + /* + * Take any offset in the buffer into account when calculating the size to allocate, it saves + * complex calculations in our callers to decide if the buffer is large enough in the case of + * buffers with offset data pointers. + */ + return (!WT_DATA_IN_ITEM(buf) || size + WT_PTRDIFF(buf->data, buf->mem) > buf->memsize ? + __wt_buf_grow_worker(session, buf, size) : + 0); } /* diff --git a/src/third_party/wiredtiger/src/include/cursor_inline.h b/src/third_party/wiredtiger/src/include/cursor_inline.h index 4c5889b6b9e..8325e0ad8e6 100644 --- a/src/third_party/wiredtiger/src/include/cursor_inline.h +++ b/src/third_party/wiredtiger/src/include/cursor_inline.h @@ -446,16 +446,16 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) * Return a row-store leaf page slot's key. */ static inline int -__cursor_row_slot_key_return( - WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_CELL_UNPACK_KV *kpack, bool *kpack_used) +__cursor_row_slot_key_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_CELL_UNPACK_KV *kpack) { WT_CELL *cell; WT_ITEM *kb; WT_PAGE *page; WT_SESSION_IMPL *session; + size_t key_size; + uint8_t key_prefix; void *copy; - - *kpack_used = false; + const void *key_data; session = CUR2S(cbt); page = cbt->ref->page; @@ -468,47 +468,53 @@ __cursor_row_slot_key_return( copy = WT_ROW_KEY_COPY(rip); /* - * Get a key: we could just call __wt_row_leaf_key, but as a cursor is running through the tree, - * we may have additional information here (we may have the fully-built key that's immediately - * before the prefix-compressed key we want, so it's a faster construction). - * - * First, check for an immediately available key. + * Check for an immediately available key from an encoded or instantiated key, and if that's not + * available, from the unpacked cell. */ - if (__wt_row_leaf_key_info(page, copy, NULL, &cell, &kb->data, &kb->size)) + __wt_row_leaf_key_info(page, copy, NULL, &cell, &key_data, &key_size, &key_prefix); + if (key_data == NULL) { + if (__wt_cell_type(cell) != WT_CELL_KEY) + goto slow; + __wt_cell_unpack_kv(session, page->dsk, cell, kpack); + key_data = kpack->data; + key_size = kpack->size; + key_prefix = kpack->prefix; + } + if (key_prefix == 0) { + kb->data = key_data; + kb->size = key_size; return (0); + } /* - * Unpack the cell and deal with overflow and prefix-compressed keys. Inline building simple - * prefix-compressed keys from a previous key, otherwise build from scratch. + * A prefix compressed key. As a cursor is running through the tree, we may have the fully-built + * key immediately before the prefix-compressed key we want, so it's faster to build here. + */ + if (cbt->rip_saved == NULL || cbt->rip_saved != rip - 1) + goto slow; + + /* + * Inline building simple prefix-compressed keys from a previous key. * - * Clear the key cell structure. It shouldn't be necessary (as far as I can tell, and we don't - * do it in lots of other places), but disabling shared builds (--disable-shared) results in the - * compiler complaining about uninitialized field use. 
+ * Grow the buffer as necessary as well as ensure data has been copied into local buffer space, + * then append the suffix to the prefix already in the buffer. Don't grow the buffer + * unnecessarily or copy data we don't need, truncate the item's CURRENT data length to the + * prefix bytes before growing the buffer. */ - memset(kpack, 0, sizeof(*kpack)); - __wt_cell_unpack_kv(session, page->dsk, cell, kpack); - *kpack_used = true; - if (kpack->type == WT_CELL_KEY && cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) { - WT_ASSERT(session, cbt->row_key->size >= kpack->prefix); - - /* - * Grow the buffer as necessary as well as ensure data has been copied into local buffer - * space, then append the suffix to the prefix already in the buffer. - * - * Don't grow the buffer unnecessarily or copy data we don't need, truncate the item's data - * length to the prefix bytes. - */ - cbt->row_key->size = kpack->prefix; - WT_RET(__wt_buf_grow(session, cbt->row_key, cbt->row_key->size + kpack->size)); - memcpy((uint8_t *)cbt->row_key->data + cbt->row_key->size, kpack->data, kpack->size); - cbt->row_key->size += kpack->size; - } else { - /* - * Call __wt_row_leaf_key_work instead of __wt_row_leaf_key: we already did - * __wt_row_leaf_key's fast-path checks inline. - */ + WT_ASSERT(session, cbt->row_key->size >= key_prefix); + cbt->row_key->size = key_prefix; + WT_RET(__wt_buf_grow(session, cbt->row_key, key_prefix + key_size)); + memcpy((uint8_t *)cbt->row_key->data + key_prefix, key_data, key_size); + cbt->row_key->size = key_prefix + key_size; + + if (0) { +slow: /* + * Call __wt_row_leaf_key_work() instead of __wt_row_leaf_key(): we already did the + * __wt_row_leaf_key() fast-path checks inline. + */ WT_RET(__wt_row_leaf_key_work(session, page, rip, cbt->row_key, false)); } + kb->data = cbt->row_key->data; kb->size = cbt->row_key->size; cbt->rip_saved = rip; diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h index 967d0b08be4..1ca46550587 100644 --- a/src/third_party/wiredtiger/src/include/dhandle.h +++ b/src/third_party/wiredtiger/src/include/dhandle.h @@ -104,7 +104,14 @@ struct __wt_data_handle { WT_DSRC_STATS *stats[WT_COUNTER_SLOTS]; WT_DSRC_STATS *stat_array; -/* Flags values over 0xff are reserved for WT_BTREE_* */ +/* + * Flags values over 0xfff are reserved for WT_BTREE_*. This lets us combine the dhandle and btree + * flags when we need, for example, to pass both sets in a function call. + * + * To help avoid accidental overrun of the flag values, we add a special flag value that should + * always be the last and highest. We use this value to assert that the dhandle flags haven't run + * into the space reserved for btree flags. 
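As a compact illustration of the shared flag space described above: dhandle flags stay below 0x1000, btree flags start at 0x1000, so one 32-bit word can carry both sets, and a compile-time check (mirroring the WT_DHANDLE_ZZZ_ENDFLAG / WT_DHANDLE_MAX_FLAG pair below) catches accidental overlap. Names are shortened and the values mirror this patch:

#include <stdint.h>
#include <stdio.h>

/* Low 12 bits belong to the dhandle, 0x1000 and above to the btree. */
#define DHANDLE_OPEN 0x100u
#define DHANDLE_ZZZ_ENDFLAG 0x200u /* One past the highest dhandle flag */
#define DHANDLE_MAX_FLAG 0x1000u   /* First value reserved for btree flags */
#define BTREE_READONLY 0x0100000u

#if DHANDLE_ZZZ_ENDFLAG > DHANDLE_MAX_FLAG
#error "dhandle flags overlap the btree flag space"
#endif

int
main(void)
{
    uint32_t flags = DHANDLE_OPEN | BTREE_READONLY; /* Both sets carried in one word. */

    printf("open=%d readonly=%d\n", (flags & DHANDLE_OPEN) != 0, (flags & BTREE_READONLY) != 0);
    return (0);
}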
+ */ /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_DHANDLE_DEAD 0x001u /* Dead, awaiting discard */ #define WT_DHANDLE_DISCARD 0x002u /* Close on release */ @@ -115,8 +122,13 @@ struct __wt_data_handle { #define WT_DHANDLE_IS_METADATA 0x040u /* Metadata handle */ #define WT_DHANDLE_LOCK_ONLY 0x080u /* Handle only used as a lock */ #define WT_DHANDLE_OPEN 0x100u /* Handle is open */ +#define WT_DHANDLE_ZZZ_ENDFLAG 0x200u /* One past highest flag value */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; +#define WT_DHANDLE_MAX_FLAG 0x1000u /* Used to ensure we don't overflow legal flag values */ +#if WT_DHANDLE_ZZZ_ENDFLAG > WT_DHANDLE_MAX_FLAG +#error "Too many dhandle flags" +#endif /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_DHANDLE_ASSERT_TS_READ_ALWAYS 0x001u /* Assert read always checking. */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index fb5c8e361ba..15fbcdb4a74 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1041,8 +1041,6 @@ extern int __wt_meta_apply_all(WT_SESSION_IMPL *session, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_meta_blk_mods_load(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt, - bool rename) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_block_metadata(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, @@ -1052,15 +1050,18 @@ extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fnam extern int __wt_meta_checkpoint_last_name(WT_SESSION_IMPL *session, const char *fname, const char **namep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_ckptlist_get(WT_SESSION_IMPL *session, const char *fname, bool update, - WT_CKPT **ckptbasep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + WT_CKPT **ckptbasep, size_t *allocated) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_ckptlist_get_from_config(WT_SESSION_IMPL *session, bool update, - WT_CKPT **ckptbasep, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + WT_CKPT **ckptbasep, size_t *allocatedp, const char *config) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_ckptlist_to_meta(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_ckptlist_update_config(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *oldcfg, char **newcfgp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_meta_saved_ckptlist_get(WT_SESSION_IMPL *session, const char *fname, + WT_CKPT **ckptbasep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_sysinfo_set(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session) @@ -1236,6 +1237,8 @@ extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOK uint32_t flags) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name, bool durable) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_reset_blkmod(WT_SESSION_IMPL *session, const char *orig_config, WT_ITEM *buf) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, @@ -1250,8 +1253,6 @@ extern int __wt_row_leaf_key_copy(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RO WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1752,6 +1753,7 @@ extern void __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_t extern void __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt); extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT **ckptbasep); +extern void __wt_meta_saved_ckptlist_free(WT_SESSION_IMPL *session); extern void __wt_meta_track_discard(WT_SESSION_IMPL *session); extern void __wt_meta_track_sub_on(WT_SESSION_IMPL *session); extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase) @@ -1898,11 +1900,9 @@ static inline bool __wt_ref_addr_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_ static inline bool __wt_ref_cas_state_int(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t old_state, uint8_t new_state, const char *func, int line) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_ref_is_root(WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline bool __wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp, - WT_CELL **cellp, void *datap, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline bool __wt_row_leaf_value_exists(WT_ROW *rip) +static inline bool __wt_row_leaf_value_is_encoded(WT_ROW *rip) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_session_can_wait(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2051,6 +2051,8 @@ static inline int __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_row_leaf_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, bool instantiate) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline int __wt_row_leaf_key_instantiate(WT_SESSION_IMPL *session, WT_PAGE *page) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_snprintf(char *buf, size_t size, const char *fmt, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((format(printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2262,11 +2264,13 @@ static inline void __wt_rec_incr( static inline void __wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep); static inline void __wt_ref_key_clear(WT_REF *ref); static inline void __wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK_ADDR *unpack); +static inline void __wt_row_leaf_key_free(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip); +static inline void __wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp, + WT_CELL **cellp, void *datap, size_t *sizep, uint8_t *prefixp); static inline void __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack); -static inline void __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell); -static inline void __wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, - WT_CELL_UNPACK_KV *kpack, WT_CELL_UNPACK_KV *vpack); -static inline void __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack); +static inline void __wt_row_leaf_value_cell( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *vpack); +static inline void __wt_row_leaf_value_set(WT_ROW *rip, WT_CELL_UNPACK_KV *unpack); static inline void __wt_scr_free(WT_SESSION_IMPL *session, WT_ITEM **bufp); static inline void __wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp); static inline void __wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp); diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h index 22b0de65308..924aacc54db 100644 --- a/src/third_party/wiredtiger/src/include/meta.h +++ b/src/third_party/wiredtiger/src/include/meta.h @@ -118,6 +118,8 @@ struct __wt_block_mods { */ #define WT_CHECKPOINT "WiredTigerCheckpoint" #define WT_CKPT_FOREACH(ckptbase, ckpt) for ((ckpt) = (ckptbase); (ckpt)->name != NULL; ++(ckpt)) +#define WT_CKPT_FOREACH_NAME_OR_ORDER(ckptbase, ckpt) \ + for ((ckpt) = (ckptbase); (ckpt)->name != NULL || (ckpt)->order != 0; ++(ckpt)) struct __wt_ckpt { char *name; /* Name or NULL */ diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index 14cf9133502..4451d2aa638 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -336,7 +336,7 @@ union __wt_rand_state { do { \ size_t __len, __space; \ va_list __ap; \ - int __ret_xx; /* __ret already used by WT_RET */ \ + int __ret_xx; /* __ret already used by WT_ERR */ \ char *__p; \ \ /* \ @@ -355,7 +355,7 @@ union __wt_rand_state { va_start(__ap, fmt); \ __ret_xx = __wt_vsnprintf_len_set(__p, __space, &__len, fmt, __ap); \ va_end(__ap); \ - WT_RET(__ret_xx); \ + WT_ERR(__ret_xx); \ \ /* Check if there was enough space. */ \ if (__len < __space) { \ @@ -368,6 +368,6 @@ union __wt_rand_state { * If not, double the size of the buffer: we're dealing \ * with strings, we don't expect the size to get huge. 
\ */ \ - WT_RET(__wt_buf_extend(session, buf, (buf)->size + __len + 1)); \ + WT_ERR(__wt_buf_extend(session, buf, (buf)->size + __len + 1)); \ } \ } while (0) diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index f4bda5f5434..00057870a0c 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -252,6 +252,10 @@ struct __wt_reconcile { WT_ITEM *cur, _cur; /* Key/Value being built */ WT_ITEM *last, _last; /* Last key/value built */ +/* Don't increase key prefix-compression unless there's a significant gain. */ +#define WT_KEY_PREFIX_PREVIOUS_MINIMUM 10 + uint8_t key_pfx_last; /* Last prefix compression */ + bool key_pfx_compress; /* If can prefix-compress next key */ bool key_pfx_compress_conf; /* If prefix compression configured */ bool key_sfx_compress; /* If can suffix-compress next key */ diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 534d4a1cf40..540c79187b4 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -624,7 +624,10 @@ struct __wt_connection_stats { int64_t page_sleep; int64_t page_del_rollback_blocked; int64_t child_modify_blocked_page; - int64_t txn_prepared_updates_count; + int64_t txn_prepared_updates; + int64_t txn_prepared_updates_committed; + int64_t txn_prepared_updates_key_repeated; + int64_t txn_prepared_updates_rolledback; int64_t txn_prepare; int64_t txn_prepare_commit; int64_t txn_prepare_active; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 7dbc17b9063..07f1599aca8 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -281,6 +281,9 @@ struct __wt_txn { WT_TXN_OP *mod; size_t mod_alloc; u_int mod_count; +#ifdef HAVE_DIAGNOSTIC + u_int prepare_count; +#endif /* Scratch buffer for in-memory log records. */ WT_ITEM *logrec; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index a4a1b584b35..c21c3dac748 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -480,6 +480,8 @@ struct __wt_cursor { * (as it partially depends on the underlying file configuration), but * is always a small number of bytes less than 4GB. * + * The WT_CURSOR::insert method can only be used at snapshot isolation. + * * @param cursor the cursor handle * @errors * In particular, if \c overwrite=false is configured and a record with @@ -491,10 +493,8 @@ struct __wt_cursor { int __F(insert)(WT_CURSOR *cursor); /*! - * Modify an existing record. - * - * Both the key and value must be set and the record must already exist; - * the record will be updated. + * Modify an existing record. Both the key and value must be set and the record must + * already exist. * * Modifications are specified in WT_MODIFY structures. Modifications * are applied in order and later modifications can update earlier ones. @@ -503,9 +503,6 @@ struct __wt_cursor { * \c S), or raw byte arrays accessed using a WT_ITEM structure (value * format type \c u). * - * The WT_CURSOR::modify method can only be called from within an - * explicit transaction configured at the snapshot isolation level. 
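The documentation changes here add the same restriction to several cursor operations: they can only be used at snapshot isolation. A minimal sketch of WT_CURSOR::modify inside an explicit snapshot-isolation transaction, assuming an open session and a cursor on a string-valued table whose record for "key" already exists (error handling trimmed; names are illustrative):

#include <wiredtiger.h>

static int
patch_value(WT_SESSION *session, WT_CURSOR *cursor)
{
    WT_MODIFY entries[1];
    int ret;

    /* Replace 3 bytes of the existing value, starting at byte offset 5, with "abc". */
    entries[0].data.data = "abc";
    entries[0].data.size = 3;
    entries[0].offset = 5;
    entries[0].size = 3;

    if ((ret = session->begin_transaction(session, "isolation=snapshot")) != 0)
        return (ret);
    cursor->set_key(cursor, "key");
    if ((ret = cursor->modify(cursor, entries, 1)) != 0) {
        (void)session->rollback_transaction(session, NULL);
        return (ret);
    }
    return (session->commit_transaction(session, NULL));
}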
- * * The WT_CURSOR::modify method stores a change record in cache and * writes a change record to the log instead of the usual complete * values. Note that WT_CURSOR::modify is generally slower than the @@ -526,6 +523,8 @@ struct __wt_cursor { * (as it partially depends on the underlying file configuration), but * is always a small number of bytes less than 4GB. * + * The WT_CURSOR::modify method can only be used at snapshot isolation. + * * @param cursor the cursor handle * @param entries an array of modification data structures * @param nentries the number of modification data structures @@ -561,6 +560,8 @@ struct __wt_cursor { * (as it partially depends on the underlying file configuration), but * is always a small number of bytes less than 4GB. * + * The WT_CURSOR::update method can only be used at snapshot isolation. + * * @param cursor the cursor handle * @errors * In particular, if \c overwrite=false is configured and no record with @@ -594,6 +595,8 @@ struct __wt_cursor { * (that is, a store with an 'r' type key and 't' type value) is * identical to setting the record's value to 0. * + * The WT_CURSOR::remove method can only be used at snapshot isolation. + * * @param cursor the cursor handle * @errors */ @@ -1119,8 +1122,8 @@ struct __wt_session { * \c none.} * @config{format, the file format., a string\, chosen from the following options: \c * "btree"; default \c btree.} - * @config{huffman_key, This option is no longer supported. Retained for backward - * compatibility. See @ref huffman for more information., a string; default \c none.} + * @config{huffman_key, This option is no longer supported\, retained for backward + * compatibility., a string; default \c none.} * @config{huffman_value, configure Huffman encoding for values. Permitted values are \c * "none"\, \c "english"\, \c "utf8<file>" or \c "utf16<file>". See @ref huffman for more * information., a string; default \c none.} @@ -1140,6 +1143,8 @@ struct __wt_session { * @config{ repair, whether to reconstruct the metadata from * the raw file content., a boolean flag; default \c false.} * @config{ ),,} + * @config{internal_item_max, This option is no longer supported\, retained for backward + * compatibility., an integer greater than or equal to 0; default \c 0.} * @config{internal_key_max, the largest key stored in an internal node\, in bytes. If * set\, keys larger than the specified size are stored as overflow items (which may require * additional I/O to access). The default and the maximum allowed value are both one-tenth @@ -1158,6 +1163,10 @@ struct __wt_session { * use WT_ITEM structures to manipulate raw byte arrays. By default\, records are stored in * row-store files: keys of type \c 'r' are record numbers and records referenced by record * number are stored in column-store files., a format string; default \c u.} + * @config{key_gap, This option is no longer supported\, retained for backward + * compatibility., an integer greater than or equal to 0; default \c 10.} + * @config{leaf_item_max, This option is no longer supported\, retained for backward + * compatibility., an integer greater than or equal to 0; default \c 0.} * @config{leaf_key_max, the largest key stored in a leaf node\, in bytes. If set\, keys * larger than the specified size are stored as overflow items (which may require additional * I/O to access). The default value is one-tenth the size of a newly split leaf page., an @@ -1511,16 +1520,18 @@ struct __wt_session { * contains. 
* @snippet ex_all.c Truncate a range * - * Any specified cursors end with no position, and subsequent calls to - * the WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the - * beginning (end) of the table. - * * When a range truncate is in progress, and another transaction inserts * a key into that range, the behavior is not well defined - a conflict * may be detected or both transactions may be permitted to commit. If * they do commit, and if there is a crash and recovery runs, the result * may be different than what was in cache before the crash. * + * The WT_CURSOR::truncate range truncate operation can only be used at snapshot isolation. + * + * Any specified cursors end with no position, and subsequent calls to + * the WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the + * beginning (end) of the table. + * * Truncate a backup cursor. This operation removes all log files that * have been returned by the backup cursor. It can be used to remove log * files after copying them during @ref backup_incremental. @@ -1783,6 +1794,8 @@ struct __wt_session { /*! * Set a timestamp on a transaction. * + * The WT_SESSION.timestamp_transaction method can only be used at snapshot isolation. + * * @snippet ex_all.c transaction timestamp * * @requires_transaction @@ -1811,6 +1824,8 @@ struct __wt_session { /*! * Query the session's transaction timestamp state. * + * The WT_SESSION.query_timestamp method can only be used at snapshot isolation. + * * @param session the session handle * @param[out] hex_timestamp a buffer that will be set to the * hexadecimal encoding of the timestamp being queried. Must be large @@ -4778,35 +4793,49 @@ struct __wt_storage_source { * @param storage_source the WT_STORAGE_SOURCE * @param session the current WiredTiger session * @param bucket_name the name of the bucket. Use of '/' is implementation dependent. - * @param prefix a prefix for each file. If used, the prefix will be added to the - * name of each object created or otherwise accessed in the bucket. Also, only - * objects with this prefix will be visible, and the prefix will be removed when - * listed. Prefixes may contain '/' as a separator. * @param auth_token the authorization identifier. - * @param config additional configuration, currently must be NULL. + * @param config additional configuration. The only allowable value is \c cache_directory, + * the name of a directory holding cached objects. Its default is + * \c "<home>/cache-<bucket>" with \c <home> replaced by the @ref home, and + * \c <bucket> replaced by the bucket_name. * @param[out] file_system the customized file system returned */ int (*ss_customize_file_system)(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, - const char *bucket_name, const char *prefix, const char *auth_token, const char *config, + const char *bucket_name, const char *auth_token, const char *config, WT_FILE_SYSTEM **file_system); /*! - * Flush any existing objects that match the location and name from - * local storage to shared object storage. The implementation guarantees - * that all objects that are in a created state (see WT_STORAGE_SOURCE::ss_open_object) - * at the beginning of this call have been transferred when this call returns. + * Copy a file from the default file system to an object name in shared object storage. * * @errors * * @param storage_source the WT_STORAGE_SOURCE * @param session the current WiredTiger session - * @param file_system if NULL, all objects are considered, otherwise only objects - * managed by the given file system. 
- * @param name the name of the object to flush (or NULL for all) + * @param file_system the destination bucket and credentials + * @param source the name of the source input file + * @param object the name of the destination object * @param config additional configuration, currently must be NULL */ int (*ss_flush)(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, - WT_FILE_SYSTEM *file_system, const char *name, const char *config); + WT_FILE_SYSTEM *file_system, const char *source, const char *object, + const char *config); + + /*! + * After a flush, rename the source file from the default file system to be cached in + * the shared object storage. + * + * @errors + * + * @param storage_source the WT_STORAGE_SOURCE + * @param session the current WiredTiger session + * @param file_system the destination bucket and credentials + * @param source the name of the source input file + * @param object the name of the destination object + * @param config additional configuration, currently must be NULL + */ + int (*ss_flush_finish)(WT_STORAGE_SOURCE *storage_source, WT_SESSION *session, + WT_FILE_SYSTEM *file_system, const char *source, const char *object, + const char *config); /*! * A callback performed when the storage source is closed and will no @@ -5619,445 +5648,451 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); /*! thread-yield: page reconciliation yielded due to child modification */ #define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1310 /*! transaction: Number of prepared updates */ -#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1311 +#define WT_STAT_CONN_TXN_PREPARED_UPDATES 1311 +/*! transaction: Number of prepared updates committed */ +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COMMITTED 1312 +/*! transaction: Number of prepared updates repeated on the same key */ +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_KEY_REPEATED 1313 +/*! transaction: Number of prepared updates rolled back */ +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_ROLLEDBACK 1314 /*! transaction: prepared transactions */ -#define WT_STAT_CONN_TXN_PREPARE 1312 +#define WT_STAT_CONN_TXN_PREPARE 1315 /*! transaction: prepared transactions committed */ -#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1313 +#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1316 /*! transaction: prepared transactions currently active */ -#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1314 +#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1317 /*! transaction: prepared transactions rolled back */ -#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1315 +#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1318 /*! transaction: query timestamp calls */ -#define WT_STAT_CONN_TXN_QUERY_TS 1316 +#define WT_STAT_CONN_TXN_QUERY_TS 1319 /*! transaction: rollback to stable calls */ -#define WT_STAT_CONN_TXN_RTS 1317 +#define WT_STAT_CONN_TXN_RTS 1320 /*! transaction: rollback to stable pages visited */ -#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1318 +#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1321 /*! transaction: rollback to stable tree walk skipping pages */ -#define WT_STAT_CONN_TXN_RTS_TREE_WALK_SKIP_PAGES 1319 +#define WT_STAT_CONN_TXN_RTS_TREE_WALK_SKIP_PAGES 1322 /*! transaction: rollback to stable updates aborted */ -#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1320 +#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1323 /*! transaction: sessions scanned in each walk of concurrent sessions */ -#define WT_STAT_CONN_TXN_SESSIONS_WALKED 1321 +#define WT_STAT_CONN_TXN_SESSIONS_WALKED 1324 /*! 
transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1322 +#define WT_STAT_CONN_TXN_SET_TS 1325 /*! transaction: set timestamp durable calls */ -#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1323 +#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1326 /*! transaction: set timestamp durable updates */ -#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1324 +#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1327 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1325 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1328 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1326 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1329 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1327 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1330 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1328 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1331 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1329 +#define WT_STAT_CONN_TXN_BEGIN 1332 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1330 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1333 /*! * transaction: transaction checkpoint currently running for history * store file */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING_HS 1331 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING_HS 1334 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1332 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1335 /*! * transaction: transaction checkpoint history store file duration * (usecs) */ -#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1333 +#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1336 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1334 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1337 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1335 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1338 /*! * transaction: transaction checkpoint most recent duration for gathering * all handles (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION 1336 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION 1339 /*! * transaction: transaction checkpoint most recent duration for gathering * applied handles (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_APPLY 1337 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_APPLY 1340 /*! * transaction: transaction checkpoint most recent duration for gathering * skipped handles (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_SKIP 1338 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_DURATION_SKIP 1341 /*! transaction: transaction checkpoint most recent handles applied */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_APPLIED 1339 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_APPLIED 1342 /*! transaction: transaction checkpoint most recent handles skipped */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_SKIPPED 1340 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_SKIPPED 1343 /*! transaction: transaction checkpoint most recent handles walked */ -#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_WALKED 1341 +#define WT_STAT_CONN_TXN_CHECKPOINT_HANDLE_WALKED 1344 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1342 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1345 /*! 
transaction: transaction checkpoint prepare currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1343 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1346 /*! transaction: transaction checkpoint prepare max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1344 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1347 /*! transaction: transaction checkpoint prepare min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1345 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1348 /*! transaction: transaction checkpoint prepare most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1346 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1349 /*! transaction: transaction checkpoint prepare total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1347 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1350 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1348 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1351 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1349 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1352 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1350 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1353 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1351 +#define WT_STAT_CONN_TXN_CHECKPOINT 1354 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1352 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1355 /*! transaction: transaction failures due to history store */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1353 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1356 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1354 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1357 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1355 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1358 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1356 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1359 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1357 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1360 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1358 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1361 /*! transaction: transaction range of timestamps pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1359 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1362 /*! * transaction: transaction range of timestamps pinned by the oldest * active read timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1360 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1363 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1361 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1364 /*! 
transaction: transaction read timestamp of the oldest active reader */ -#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1362 +#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1365 /*! transaction: transaction rollback to stable currently running */ -#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE_RUNNING 1363 +#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE_RUNNING 1366 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1364 +#define WT_STAT_CONN_TXN_SYNC 1367 /*! transaction: transaction walk of concurrent sessions */ -#define WT_STAT_CONN_TXN_WALK_SESSIONS 1365 +#define WT_STAT_CONN_TXN_WALK_SESSIONS 1368 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1366 +#define WT_STAT_CONN_TXN_COMMIT 1369 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1367 +#define WT_STAT_CONN_TXN_ROLLBACK 1370 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1368 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1371 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1369 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1372 /*! cache: bytes currently in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INUSE 1370 +#define WT_STAT_CONN_CACHE_BYTES_INUSE 1373 /*! cache: bytes dirty in the cache cumulative */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY_TOTAL 1371 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY_TOTAL 1374 /*! cache: bytes read into cache */ -#define WT_STAT_CONN_CACHE_BYTES_READ 1372 +#define WT_STAT_CONN_CACHE_BYTES_READ 1375 /*! cache: bytes written from cache */ -#define WT_STAT_CONN_CACHE_BYTES_WRITE 1373 +#define WT_STAT_CONN_CACHE_BYTES_WRITE 1376 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1374 +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1377 /*! * cache: checkpoint of history store file blocked non-history store page * eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_BLOCKED_CHECKPOINT_HS 1375 +#define WT_STAT_CONN_CACHE_EVICTION_BLOCKED_CHECKPOINT_HS 1378 /*! cache: eviction walk target pages histogram - 0-9 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1376 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1379 /*! cache: eviction walk target pages histogram - 10-31 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1377 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1380 /*! cache: eviction walk target pages histogram - 128 and higher */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1378 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1381 /*! cache: eviction walk target pages histogram - 32-63 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1379 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1382 /*! cache: eviction walk target pages histogram - 64-128 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1380 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1383 /*! * cache: eviction walk target pages reduced due to history store cache * pressure */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_REDUCED 1381 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_REDUCED 1384 /*! cache: eviction walks abandoned */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1382 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1385 /*! cache: eviction walks gave up because they restarted their walk twice */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1383 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1386 /*! 
* cache: eviction walks gave up because they saw too many pages and * found no candidates */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1384 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1387 /*! * cache: eviction walks gave up because they saw too many pages and * found too few candidates */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1385 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1388 /*! cache: eviction walks reached end of tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1386 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1389 /*! cache: eviction walks restarted */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_RESTART 1387 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_RESTART 1390 /*! cache: eviction walks started from root of tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1388 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1391 /*! cache: eviction walks started from saved location in tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1389 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1392 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1390 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1393 /*! cache: history store table insert calls */ -#define WT_STAT_CONN_CACHE_HS_INSERT 1391 +#define WT_STAT_CONN_CACHE_HS_INSERT 1394 /*! cache: history store table insert calls that returned restart */ -#define WT_STAT_CONN_CACHE_HS_INSERT_RESTART 1392 +#define WT_STAT_CONN_CACHE_HS_INSERT_RESTART 1395 /*! * cache: history store table out-of-order resolved updates that lose * their durable timestamp */ -#define WT_STAT_CONN_CACHE_HS_ORDER_LOSE_DURABLE_TIMESTAMP 1393 +#define WT_STAT_CONN_CACHE_HS_ORDER_LOSE_DURABLE_TIMESTAMP 1396 /*! * cache: history store table out-of-order updates that were fixed up by * reinserting with the fixed timestamp */ -#define WT_STAT_CONN_CACHE_HS_ORDER_REINSERT 1394 +#define WT_STAT_CONN_CACHE_HS_ORDER_REINSERT 1397 /*! cache: history store table reads */ -#define WT_STAT_CONN_CACHE_HS_READ 1395 +#define WT_STAT_CONN_CACHE_HS_READ 1398 /*! cache: history store table reads missed */ -#define WT_STAT_CONN_CACHE_HS_READ_MISS 1396 +#define WT_STAT_CONN_CACHE_HS_READ_MISS 1399 /*! cache: history store table reads requiring squashed modifies */ -#define WT_STAT_CONN_CACHE_HS_READ_SQUASH 1397 +#define WT_STAT_CONN_CACHE_HS_READ_SQUASH 1400 /*! * cache: history store table truncation by rollback to stable to remove * an unstable update */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS_UNSTABLE 1398 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS_UNSTABLE 1401 /*! * cache: history store table truncation by rollback to stable to remove * an update */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS 1399 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_RTS 1402 /*! cache: history store table truncation to remove an update */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE 1400 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE 1403 /*! * cache: history store table truncation to remove range of updates due * to key being removed from the data page during reconciliation */ -#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_ONPAGE_REMOVAL 1401 +#define WT_STAT_CONN_CACHE_HS_KEY_TRUNCATE_ONPAGE_REMOVAL 1404 /*! * cache: history store table truncation to remove range of updates due * to out-of-order timestamp update on data page */ -#define WT_STAT_CONN_CACHE_HS_ORDER_REMOVE 1402 +#define WT_STAT_CONN_CACHE_HS_ORDER_REMOVE 1405 /*! 
cache: history store table writes requiring squashed modifies */ -#define WT_STAT_CONN_CACHE_HS_WRITE_SQUASH 1403 +#define WT_STAT_CONN_CACHE_HS_WRITE_SQUASH 1406 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1404 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1407 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1405 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1408 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1406 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1409 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1407 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1410 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1408 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1411 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1409 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1412 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1410 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1413 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1411 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1414 /*! cache: page written requiring history store records */ -#define WT_STAT_CONN_CACHE_WRITE_HS 1412 +#define WT_STAT_CONN_CACHE_WRITE_HS 1415 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1413 +#define WT_STAT_CONN_CACHE_READ 1416 /*! cache: pages read into cache after truncate */ -#define WT_STAT_CONN_CACHE_READ_DELETED 1414 +#define WT_STAT_CONN_CACHE_READ_DELETED 1417 /*! cache: pages read into cache after truncate in prepare state */ -#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1415 +#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1418 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1416 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1419 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1417 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1420 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1418 +#define WT_STAT_CONN_CACHE_WRITE 1421 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1419 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1422 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1420 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1423 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1421 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1424 /*! checkpoint-cleanup: pages added for eviction */ -#define WT_STAT_CONN_CC_PAGES_EVICT 1422 +#define WT_STAT_CONN_CC_PAGES_EVICT 1425 /*! checkpoint-cleanup: pages removed */ -#define WT_STAT_CONN_CC_PAGES_REMOVED 1423 +#define WT_STAT_CONN_CC_PAGES_REMOVED 1426 /*! checkpoint-cleanup: pages skipped during tree walk */ -#define WT_STAT_CONN_CC_PAGES_WALK_SKIPPED 1424 +#define WT_STAT_CONN_CC_PAGES_WALK_SKIPPED 1427 /*! checkpoint-cleanup: pages visited */ -#define WT_STAT_CONN_CC_PAGES_VISITED 1425 +#define WT_STAT_CONN_CC_PAGES_VISITED 1428 /*! cursor: Total number of entries skipped by cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT_SKIP_TOTAL 1426 +#define WT_STAT_CONN_CURSOR_NEXT_SKIP_TOTAL 1429 /*! 
cursor: Total number of entries skipped by cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV_SKIP_TOTAL 1427 +#define WT_STAT_CONN_CURSOR_PREV_SKIP_TOTAL 1430 /*! * cursor: Total number of entries skipped to position the history store * cursor */ -#define WT_STAT_CONN_CURSOR_SKIP_HS_CUR_POSITION 1428 +#define WT_STAT_CONN_CURSOR_SKIP_HS_CUR_POSITION 1431 /*! * cursor: Total number of times a search near has exited due to prefix * config */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR_PREFIX_FAST_PATHS 1429 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR_PREFIX_FAST_PATHS 1432 /*! * cursor: cursor next calls that skip due to a globally visible history * store tombstone */ -#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1430 +#define WT_STAT_CONN_CURSOR_NEXT_HS_TOMBSTONE 1433 /*! * cursor: cursor next calls that skip greater than or equal to 100 * entries */ -#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1431 +#define WT_STAT_CONN_CURSOR_NEXT_SKIP_GE_100 1434 /*! cursor: cursor next calls that skip less than 100 entries */ -#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1432 +#define WT_STAT_CONN_CURSOR_NEXT_SKIP_LT_100 1435 /*! * cursor: cursor prev calls that skip due to a globally visible history * store tombstone */ -#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1433 +#define WT_STAT_CONN_CURSOR_PREV_HS_TOMBSTONE 1436 /*! * cursor: cursor prev calls that skip greater than or equal to 100 * entries */ -#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1434 +#define WT_STAT_CONN_CURSOR_PREV_SKIP_GE_100 1437 /*! cursor: cursor prev calls that skip less than 100 entries */ -#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1435 +#define WT_STAT_CONN_CURSOR_PREV_SKIP_LT_100 1438 /*! cursor: open cursor count */ -#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1436 +#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1439 /*! reconciliation: approximate byte size of timestamps in pages written */ -#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1437 +#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TS 1440 /*! * reconciliation: approximate byte size of transaction IDs in pages * written */ -#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1438 +#define WT_STAT_CONN_REC_TIME_WINDOW_BYTES_TXN 1441 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1439 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1442 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1440 +#define WT_STAT_CONN_REC_PAGES 1443 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1441 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1444 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1442 +#define WT_STAT_CONN_REC_PAGE_DELETE 1445 /*! * reconciliation: pages written including an aggregated newest start * durable timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1443 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_START_DURABLE_TS 1446 /*! * reconciliation: pages written including an aggregated newest stop * durable timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1444 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_DURABLE_TS 1447 /*! * reconciliation: pages written including an aggregated newest stop * timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1445 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TS 1448 /*! 
* reconciliation: pages written including an aggregated newest stop * transaction ID */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1446 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_STOP_TXN 1449 /*! * reconciliation: pages written including an aggregated newest * transaction ID */ -#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1447 +#define WT_STAT_CONN_REC_TIME_AGGR_NEWEST_TXN 1450 /*! * reconciliation: pages written including an aggregated oldest start * timestamp */ -#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1448 +#define WT_STAT_CONN_REC_TIME_AGGR_OLDEST_START_TS 1451 /*! reconciliation: pages written including an aggregated prepare */ -#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1449 +#define WT_STAT_CONN_REC_TIME_AGGR_PREPARED 1452 /*! * reconciliation: pages written including at least one start durable * timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1450 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_START_TS 1453 /*! * reconciliation: pages written including at least one start transaction * ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1451 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_START_TXN 1454 /*! * reconciliation: pages written including at least one stop durable * timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1452 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_DURABLE_STOP_TS 1455 /*! reconciliation: pages written including at least one stop timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1453 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TS 1456 /*! * reconciliation: pages written including at least one stop transaction * ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1454 +#define WT_STAT_CONN_REC_TIME_WINDOW_PAGES_STOP_TXN 1457 /*! reconciliation: records written including a start durable timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1455 +#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_START_TS 1458 /*! reconciliation: records written including a start timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1456 +#define WT_STAT_CONN_REC_TIME_WINDOW_START_TS 1459 /*! reconciliation: records written including a start transaction ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1457 +#define WT_STAT_CONN_REC_TIME_WINDOW_START_TXN 1460 /*! reconciliation: records written including a stop durable timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1458 +#define WT_STAT_CONN_REC_TIME_WINDOW_DURABLE_STOP_TS 1461 /*! reconciliation: records written including a stop timestamp */ -#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1459 +#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TS 1462 /*! reconciliation: records written including a stop transaction ID */ -#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1460 +#define WT_STAT_CONN_REC_TIME_WINDOW_STOP_TXN 1463 /*! session: tiered storage local retention time (secs) */ -#define WT_STAT_CONN_TIERED_RETENTION 1461 +#define WT_STAT_CONN_TIERED_RETENTION 1464 /*! session: tiered storage object size */ -#define WT_STAT_CONN_TIERED_OBJECT_SIZE 1462 +#define WT_STAT_CONN_TIERED_OBJECT_SIZE 1465 /*! transaction: race to read prepared update retry */ -#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1463 +#define WT_STAT_CONN_TXN_READ_RACE_PREPARE_UPDATE 1466 /*! 
* transaction: rollback to stable history store records with stop * timestamps older than newer records */ -#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1464 +#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1467 /*! transaction: rollback to stable inconsistent checkpoint */ -#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1465 +#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1468 /*! transaction: rollback to stable keys removed */ -#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1466 +#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1469 /*! transaction: rollback to stable keys restored */ -#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1467 +#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1470 /*! transaction: rollback to stable restored tombstones from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1468 +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1471 /*! transaction: rollback to stable restored updates from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1469 +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1472 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1470 +#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1473 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1471 +#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1474 /*! transaction: transaction checkpoints due to obsolete pages */ -#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1472 +#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1475 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1473 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1476 /*! * @} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index d2093a26ecb..4c3aa2a3204 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -1281,10 +1281,9 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) } /* - * Prefer larger cursors. There are two reasons: (1) we expect - * prefix searches to be a common case (as in our own indices); - * and (2) we need a way to unambiguously know we have the - * "closest" result. + * Prefer larger cursors. There are two reasons: (1) we expect prefix searches to be a + * common case (as in our own indices); and (2) we need a way to unambiguously know we have + * the "closest" result. */ if (cmp < 0) { if ((ret = c->next(c)) == WT_NOTFOUND) { diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index 96ed8cb72a4..b5d0ae0b7a1 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -14,7 +14,7 @@ static int __ckpt_load(WT_SESSION_IMPL *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT static int __ckpt_named(WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *); static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *, bool); static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *); - +static int __meta_blk_mods_load(WT_SESSION_IMPL *, const char *, WT_CKPT *, WT_CKPT *, bool); /* * __ckpt_load_blk_mods -- * Load the block information from the config string. @@ -472,18 +472,52 @@ __ckpt_valid_blk_mods(WT_SESSION_IMPL *session, WT_CKPT *ckpt, bool rename) } /* - * __wt_meta_blk_mods_load -- - * Load the block mods for a given checkpoint and set up all the information to store. 
+ * __ckpt_copy_blk_mods -- + * Copy the block mods from a source checkpoint to the destination checkpoint. */ -int -__wt_meta_blk_mods_load(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt, bool rename) +static int +__ckpt_copy_blk_mods(WT_SESSION_IMPL *session, WT_CKPT *src_ckpt, WT_CKPT *dst_ckpt) +{ + uint64_t i; + + for (i = 0; i < WT_BLKINCR_MAX; ++i) { + WT_RET(__wt_strdup( + session, src_ckpt->backup_blocks[i].id_str, &dst_ckpt->backup_blocks[i].id_str)); + WT_RET(__wt_buf_set(session, &dst_ckpt->backup_blocks[i].bitstring, + src_ckpt->backup_blocks[i].bitstring.data, src_ckpt->backup_blocks[i].bitstring.size)); + dst_ckpt->backup_blocks[i].nbits = src_ckpt->backup_blocks[i].nbits; + dst_ckpt->backup_blocks[i].offset = src_ckpt->backup_blocks[i].offset; + dst_ckpt->backup_blocks[i].granularity = src_ckpt->backup_blocks[i].granularity; + dst_ckpt->backup_blocks[i].flags = src_ckpt->backup_blocks[i].flags; + } + + return (0); +} + +/* + * __meta_blk_mods_load -- + * Load the block mods for a given checkpoint and set up all the information to store. Load from + * either the metadata or from a base checkpoint. + */ +static int +__meta_blk_mods_load( + WT_SESSION_IMPL *session, const char *config, WT_CKPT *base_ckpt, WT_CKPT *ckpt, bool rename) { /* - * Load most recent checkpoint backup blocks to this checkpoint. + * Load most recent checkpoint backup blocks to this checkpoint, either from metadata or from a + * previous checkpoint. */ - WT_RET(__ckpt_load_blk_mods(session, config, ckpt)); - - WT_RET(__wt_meta_block_metadata(session, config, ckpt)); + if (config != NULL) { + /* Load from metadata. */ + WT_RET(__ckpt_load_blk_mods(session, config, ckpt)); + WT_RET(__wt_meta_block_metadata(session, config, ckpt)); + } else { + /* Load from an existing base checkpoint. */ + WT_ASSERT(session, base_ckpt != NULL); + WT_RET(__ckpt_copy_blk_mods(session, base_ckpt, ckpt)); + WT_RET(__wt_strndup(session, base_ckpt->block_metadata, strlen(base_ckpt->block_metadata), + &ckpt->block_metadata)); + } /* * Set the add-a-checkpoint flag, and if we're doing incremental backups, request a list of the @@ -503,7 +537,7 @@ __wt_meta_blk_mods_load(WT_SESSION_IMPL *session, const char *config, WT_CKPT *c */ int __wt_meta_ckptlist_get( - WT_SESSION_IMPL *session, const char *fname, bool update, WT_CKPT **ckptbasep) + WT_SESSION_IMPL *session, const char *fname, bool update, WT_CKPT **ckptbasep, size_t *allocated) { WT_DECL_RET; char *config; @@ -511,34 +545,216 @@ __wt_meta_ckptlist_get( config = NULL; WT_ERR(__wt_metadata_search(session, fname, &config)); - WT_ERR(__wt_meta_ckptlist_get_from_config(session, update, ckptbasep, config)); + WT_ERR(__wt_meta_ckptlist_get_from_config(session, update, ckptbasep, allocated, config)); err: __wt_free(session, config); return (ret); } +#ifdef HAVE_DIAGNOSTIC +/* + * __assert_ckpt_matches -- + * Assert that given two checkpoints match. + */ +static void +__assert_ckpt_matches(WT_SESSION_IMPL *session, WT_CKPT *ckpt_a, WT_CKPT *ckpt_b) +{ + /* + * We are not checking checkpoint time, because there could be a minute difference depending + * upon when the checkpoint information was generated. This is acceptable. 
+ */ + WT_ASSERT(session, + (ckpt_a->name == NULL && ckpt_b->name == NULL) || + (ckpt_a->name != NULL && ckpt_b->name != NULL && strcmp(ckpt_a->name, ckpt_b->name) == 0)); + WT_ASSERT(session, ckpt_a->order == ckpt_b->order); + WT_ASSERT(session, ckpt_a->size == ckpt_b->size); + WT_ASSERT(session, ckpt_a->write_gen == ckpt_b->write_gen); + WT_ASSERT(session, ckpt_a->run_write_gen == ckpt_b->run_write_gen); + WT_ASSERT(session, + ckpt_a->ta.newest_start_durable_ts == ckpt_b->ta.newest_start_durable_ts && + ckpt_a->ta.newest_stop_durable_ts == ckpt_b->ta.newest_stop_durable_ts && + ckpt_a->ta.oldest_start_ts == ckpt_b->ta.oldest_start_ts && + ckpt_a->ta.newest_txn == ckpt_b->ta.newest_txn && + ckpt_a->ta.newest_stop_ts == ckpt_b->ta.newest_stop_ts && + ckpt_a->ta.newest_stop_txn == ckpt_b->ta.newest_stop_txn && + ckpt_a->ta.prepare == ckpt_b->ta.prepare); + /* + * The two WT_CKPT structures are created through different paths, specifically in one path the + * WT_CKPT.addr and WT_CKPT.raw fields are taken from a configuration file as strings including + * a training nul byte. Use the minimum size of the data to ignore that nul byte. Passing nul + * pointers to memcmp is undefined, so handle that separately. + */ + WT_ASSERT(session, + (ckpt_a->addr.data == NULL && ckpt_b->addr.data == NULL) || + (ckpt_a->addr.data != NULL && ckpt_b->addr.data != NULL && + memcmp(ckpt_a->addr.data, ckpt_b->addr.data, + WT_MIN(ckpt_a->addr.size, ckpt_b->addr.size)) == 0)); + WT_ASSERT(session, + (ckpt_a->raw.data == NULL && ckpt_b->raw.data == NULL) || + (ckpt_a->raw.data != NULL && ckpt_b->raw.data != NULL && + memcmp(ckpt_a->raw.data, ckpt_b->raw.data, WT_MIN(ckpt_a->raw.size, ckpt_b->raw.size)) == + 0)); + WT_ASSERT(session, ckpt_a->bpriv == NULL && ckpt_b->bpriv == NULL); + WT_ASSERT(session, ckpt_a->flags == ckpt_b->flags); +} + +/* + * __assert_checkpoint_list_matches -- + * Assert that two given checkpoint lists match. + */ +static void +__assert_checkpoint_list_matches(WT_SESSION_IMPL *session, WT_CKPT *saved_list, WT_CKPT *new_list) +{ + WT_CKPT *ckpt_saved, *ckpt_new; + + for (ckpt_saved = saved_list, ckpt_new = new_list; + ckpt_saved != NULL && ckpt_saved->order != 0 && ckpt_new != NULL && ckpt_new->order != 0; + ckpt_saved++, ckpt_new++) + __assert_ckpt_matches(session, ckpt_saved, ckpt_new); + + WT_ASSERT(session, + (ckpt_saved == NULL && ckpt_new == NULL) || + ((ckpt_saved != NULL && ckpt_saved->order == 0) && + (ckpt_new != NULL && ckpt_new->order == 0))); +} +#endif + +/* + * __meta_ckptlist_allocate_new_ckpt -- + * Provided a checkpoint list, allocate a new checkpoint. Either use the last checkpoint in the + * list or the file metadata to initialize this new checkpoint. + */ +static int +__meta_ckptlist_allocate_new_ckpt( + WT_SESSION_IMPL *session, WT_CKPT **ckptbasep, size_t *allocated, const char *config) +{ + WT_CKPT *ckptbase, *ckpt; + WT_CONNECTION_IMPL *conn; + size_t slot; + uint64_t most_recent; + + ckptbase = *ckptbasep; + conn = S2C(session); + slot = 0; + + if (ckptbase != NULL) + WT_CKPT_FOREACH (ckptbase, ckpt) + slot++; + + /* Either we have a configuration or an existing checkpoint to initialize with. */ + WT_ASSERT(session, config != NULL || slot != 0); + + /* + * If we are using an existing checkpoint, we must have the associated metadata. Otherwise we + * will have to go slow path and read the metadata. 
+ */ + if (config == NULL && ckptbase[slot - 1].block_metadata == NULL) + return (WT_NOTFOUND); + + /* + * This isn't clean, but there's necessary cooperation between the schema layer (that maintains + * the list of checkpoints), the btree layer (that knows when the root page is written, creating + * a new checkpoint), and the block manager (which actually creates the checkpoint). All of that + * cooperation is handled in the array of checkpoint structures referenced from the WT_BTREE + * structure. + * + * Allocate a slot for a new value, plus a slot to mark the end. + */ + WT_RET(__wt_realloc_def(session, allocated, slot + 2, &ckptbase)); + *ckptbasep = ckptbase; + + ckpt = &ckptbase[slot]; + ckpt->order = (slot == 0) ? 1 : ckptbase[slot - 1].order + 1; + __wt_seconds(session, &ckpt->sec); + /* + * Update time value for most recent checkpoint, not letting it move backwards. It is possible + * to race here, so use atomic CAS. This code relies on the fact that anyone we race with will + * only increase (never decrease) the most recent checkpoint time value. + */ + for (;;) { + WT_ORDERED_READ(most_recent, conn->ckpt_most_recent); + if (ckpt->sec <= most_recent || + __wt_atomic_cas64(&conn->ckpt_most_recent, most_recent, ckpt->sec)) + break; + } + + /* Either load block mods from the config, or from the previous checkpoint. */ + WT_RET( + __meta_blk_mods_load(session, config, (slot == 0 ? NULL : &ckptbase[slot - 1]), ckpt, false)); + WT_ASSERT(session, ckpt->block_metadata != NULL); + + return (0); +} + +/* + * __wt_meta_saved_ckptlist_get -- + * Append the ckptlist with a new checkpoint to be added. + */ +int +__wt_meta_saved_ckptlist_get(WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep) +{ + WT_BTREE *btree; +#ifdef HAVE_DIAGNOSTIC + WT_CKPT *ckptbase_comp; +#endif + WT_DECL_RET; + + *ckptbasep = NULL; + + btree = S2BT(session); + + /* If we do not have a saved ckptlist, return not found. */ + if (btree->ckpt == NULL) + return (WT_NOTFOUND); + + WT_ERR( + __meta_ckptlist_allocate_new_ckpt(session, &btree->ckpt, &btree->ckpt_bytes_allocated, NULL)); + +#ifdef HAVE_DIAGNOSTIC + /* + * Sanity check: Let's compare to a list generated from metadata. There should be no + * differences. + */ + if ((ret = __wt_meta_ckptlist_get(session, fname, true, &ckptbase_comp, NULL)) == 0) + __assert_checkpoint_list_matches(session, btree->ckpt, ckptbase_comp); + __wt_meta_ckptlist_free(session, &ckptbase_comp); + WT_ERR(ret); +#else + WT_UNUSED(fname); +#endif + + /* Return the array to our caller. */ + *ckptbasep = btree->ckpt; + + if (0) { +err: + __wt_meta_saved_ckptlist_free(session); + } + + return (ret); +} + /* * __wt_meta_ckptlist_get_from_config -- * Provided a metadata config, load all available checkpoint information for a file. */ int -__wt_meta_ckptlist_get_from_config( - WT_SESSION_IMPL *session, bool update, WT_CKPT **ckptbasep, const char *config) +__wt_meta_ckptlist_get_from_config(WT_SESSION_IMPL *session, bool update, WT_CKPT **ckptbasep, + size_t *allocatedp, const char *config) { WT_CKPT *ckpt, *ckptbase; WT_CONFIG ckptconf; WT_CONFIG_ITEM k, v; - WT_CONNECTION_IMPL *conn; WT_DECL_RET; size_t allocated, slot; - uint64_t most_recent; *ckptbasep = NULL; + if (allocatedp != NULL) + *allocatedp = 0; ckptbase = NULL; allocated = slot = 0; - conn = S2C(session); /* Load any existing checkpoints into the array. */ if ((ret = __wt_config_getones(session, config, "checkpoint", &v)) == 0) { @@ -560,38 +776,14 @@ __wt_meta_ckptlist_get_from_config( /* Sort in creation-order. 
*/ __wt_qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order); - if (update) { - /* - * This isn't clean, but there's necessary cooperation between the schema layer (that - * maintains the list of checkpoints), the btree layer (that knows when the root page is - * written, creating a new checkpoint), and the block manager (which actually creates the - * checkpoint). All of that cooperation is handled in the array of checkpoint structures - * referenced from the WT_BTREE structure. - * - * Allocate a slot for a new value, plus a slot to mark the end. - */ - WT_ERR(__wt_realloc_def(session, &allocated, slot + 2, &ckptbase)); - - /* The caller may be adding a value, initialize it. */ - ckpt = &ckptbase[slot]; - ckpt->order = (slot == 0) ? 1 : ckptbase[slot - 1].order + 1; - __wt_seconds(session, &ckpt->sec); - /* - * Update time value for most recent checkpoint, not letting it move backwards. It is - * possible to race here, so use atomic CAS. This code relies on the fact that anyone we - * race with will only increase (never decrease) the most recent checkpoint time value. - */ - for (;;) { - WT_ORDERED_READ(most_recent, conn->ckpt_most_recent); - if (ckpt->sec <= most_recent || - __wt_atomic_cas64(&conn->ckpt_most_recent, most_recent, ckpt->sec)) - break; - } - WT_ERR(__wt_meta_blk_mods_load(session, config, ckpt, false)); - } + /* The caller might be asking for a new checkpoint to be allocated. */ + if (update) + WT_ERR(__meta_ckptlist_allocate_new_ckpt(session, &ckptbase, &allocated, config)); /* Return the array to our caller. */ *ckptbasep = ckptbase; + if (allocatedp != NULL) + *allocatedp = allocated; if (0) { err: @@ -932,7 +1124,6 @@ __wt_meta_ckptlist_set( bool has_lsn; WT_RET(__wt_scr_alloc(session, 1024, &buf)); - WT_ERR(__wt_meta_ckptlist_to_meta(session, ckptbase, buf)); /* Add backup block modifications for any added checkpoint. */ WT_CKPT_FOREACH (ckptbase, ckpt) @@ -963,12 +1154,31 @@ __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT **ckptbasep) if ((ckptbase = *ckptbasep) == NULL) return; - WT_CKPT_FOREACH (ckptbase, ckpt) + /* + * Sometimes the checkpoint list has a checkpoint which has not been named yet, but carries an + * order number. + */ + WT_CKPT_FOREACH_NAME_OR_ORDER (ckptbase, ckpt) __wt_meta_checkpoint_free(session, ckpt); __wt_free(session, *ckptbasep); } /* + * __wt_meta_saved_ckptlist_free -- + * Discard the saved checkpoint list. + */ +void +__wt_meta_saved_ckptlist_free(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + __wt_meta_ckptlist_free(session, &btree->ckpt); + btree->ckpt_bytes_allocated = 0; +} + +/* * __wt_meta_checkpoint_free -- * Clean up a single checkpoint structure. */ @@ -1112,3 +1322,28 @@ __ckpt_version_chk(WT_SESSION_IMPL *session, const char *fname, const char *conf WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX); return (0); } + +/* + * __wt_reset_blkmod -- + * Reset the incremental backup information, and recreate incremental backup information to + * indicate copying the entire file. + */ +int +__wt_reset_blkmod(WT_SESSION_IMPL *session, const char *orig_config, WT_ITEM *buf) +{ + WT_CKPT ckpt; + WT_DECL_RET; + + WT_CLEAR(ckpt); + /* + * Replace the old file entries with new file entries. We need to recreate the incremental + * backup information to indicate copying the entire file in its bitmap. + */ + /* First load any existing backup information into a temp checkpoint structure. 
*/ + WT_RET(__meta_blk_mods_load(session, orig_config, NULL, &ckpt, true)); + + /* Take the checkpoint structure and generate the metadata string. */ + ret = __wt_ckpt_blkmod_to_meta(session, buf, &ckpt); + __wt_meta_checkpoint_free(session, &ckpt); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/meta/meta_ext.c b/src/third_party/wiredtiger/src/meta/meta_ext.c index 11e97ad9c45..aa9b6954683 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ext.c +++ b/src/third_party/wiredtiger/src/meta/meta_ext.c @@ -88,7 +88,7 @@ int __wt_metadata_get_ckptlist(WT_SESSION *session, const char *name, WT_CKPT **ckptbasep) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - return (__wt_meta_ckptlist_get((WT_SESSION_IMPL *)session, name, false, ckptbasep)); + return (__wt_meta_ckptlist_get((WT_SESSION_IMPL *)session, name, false, ckptbasep, NULL)); } /* diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index 0739175c3a1..88186deff68 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -426,8 +426,8 @@ __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key) WT_ERR(__wt_strdup(session, key, &trk->a)); /* - * If there was a previous value, keep it around -- if not, then this - * "update" is really an insert. + * If there was a previous value, keep it around -- if not, then this "update" is really an + * insert. */ if ((ret = __wt_metadata_search(session, key, &trk->b)) == WT_NOTFOUND) { trk->op = WT_ST_REMOVE; diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index cc9d2c08ca1..1ae6259e5d8 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -49,18 +49,15 @@ __posix_sync(WT_SESSION_IMPL *session, int fd, const char *name, const char *fun #if defined(F_FULLFSYNC) /* - * OS X fsync documentation: - * "Note that while fsync() will flush all data from the host to the - * drive (i.e. the "permanent storage device"), the drive itself may - * not physically write the data to the platters for quite some time - * and it may be written in an out-of-order sequence. For applications - * that require tighter guarantees about the integrity of their data, - * Mac OS X provides the F_FULLFSYNC fcntl. The F_FULLFSYNC fcntl asks - * the drive to flush all buffered data to permanent storage." + * OS X fsync documentation: "Note that while fsync() will flush all data from the host to the + * drive (i.e. the "permanent storage device"), the drive itself may not physically write the + * data to the platters for quite some time and it may be written in an out-of-order sequence. + * For applications that require tighter guarantees about the integrity of their data, Mac OS X + * provides the F_FULLFSYNC fcntl. The F_FULLFSYNC fcntl asks the drive to flush all buffered + * data to permanent storage." * - * OS X F_FULLFSYNC fcntl documentation: - * "This is currently implemented on HFS, MS-DOS (FAT), and Universal - * Disk Format (UDF) file systems." + * OS X F_FULLFSYNC fcntl documentation: "This is currently implemented on HFS, MS-DOS (FAT), + * and Universal Disk Format (UDF) file systems." * * See comment in __posix_sync(): sync cannot be retried or fail. 
*/ @@ -1032,11 +1029,10 @@ __wt_map_file(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session) * file while others might be reading or writing it: * * Every time someone reads or writes from the mapped region, they increment the "use" count via - * cas. If someone wants to change the file size, they set the "stop" flag. If a session sees - * the stop flag, it does not read via mmap, but resorts to the regular syscall. The session - * that set the stop flag spin-waits until the "use" count goes to zero. Then it changes the - * file size and remaps the region without synchronization. Once all that is done, it resets the - * "stop" flag. + * cas. If someone wants to change the file size, they set the "stop" flag. If a session sees the + * stop flag, it does not read via mmap, but resorts to the regular syscall. The session that set + * the stop flag spin-waits until the "use" count goes to zero. Then it changes the file size and + * remaps the region without synchronization. Once all that is done, it resets the "stop" flag. */ /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index db4bb56c976..bc4174e084a 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -18,29 +18,24 @@ __rec_key_state_update(WT_RECONCILE *r, bool ovfl_key) WT_ITEM *a; /* - * If writing an overflow key onto the page, don't update the "last key" - * value, and leave the state of prefix compression alone. (If we are - * currently doing prefix compression, we have a key state which will - * continue to work, we're just skipping the key just created because - * it's an overflow key and doesn't participate in prefix compression. - * If we are not currently doing prefix compression, we can't start, an - * overflow key doesn't give us any state.) + * If writing an overflow key onto the page, don't update the "last key" value, and leave the + * state of prefix compression alone. (If we are currently doing prefix compression, we have a + * key state which will continue to work, we're just skipping the key just created because it's + * an overflow key and doesn't participate in prefix compression. If we are not currently doing + * prefix compression, we can't start, an overflow key doesn't give us any state.) * - * Additionally, if we wrote an overflow key onto the page, turn off the - * suffix compression of row-store internal node keys. (When we split, - * "last key" is the largest key on the previous page, and "cur key" is - * the first key on the next page, which is being promoted. In some - * cases we can discard bytes from the "cur key" that are not needed to - * distinguish between the "last key" and "cur key", compressing the - * size of keys on internal nodes. If we just built an overflow key, - * we're not going to update the "last key", making suffix compression - * impossible for the next key. Alternatively, we could remember where - * the last key was on the page, detect it's an overflow key, read it - * from disk and do suffix compression, but that's too much work for an - * unlikely event.) + * Additionally, if we wrote an overflow key onto the page, turn off the suffix compression of + * row-store internal node keys. (When we split, "last key" is the largest key on the previous + * page, and "cur key" is the first key on the next page, which is being promoted. 
In some cases + * we can discard bytes from the "cur key" that are not needed to distinguish between the "last + * key" and "cur key", compressing the size of keys on internal nodes. If we just built an + * overflow key, we're not going to update the "last key", making suffix compression impossible + * for the next key. Alternatively, we could remember where the last key was on the page, detect + * it's an overflow key, read it from disk and do suffix compression, but that's too much work + * for an unlikely event.) * - * If we're not writing an overflow key on the page, update the last-key - * value and turn on both prefix and suffix compression. + * If we're not writing an overflow key on the page, update the last-key value and turn on both + * prefix and suffix compression. */ if (ovfl_key) r->key_sfx_compress = false; @@ -143,18 +138,27 @@ __rec_cell_build_leaf_key( break; /* - * Prefix compression may cost us CPU and memory when the page is re-loaded, don't do it - * unless there's reasonable gain. + * Prefix compression costs CPU and memory when the page is re-loaded, skip unless + * there's a reasonable gain. Also, if the previous key was prefix compressed, don't + * increase the prefix compression if we aren't getting a reasonable gain. (Groups of + * keys with the same prefix can be quickly built without needing to roll forward + * through intermediate keys or allocating memory so they can be built faster in the + * future, for that reason try and create big groups of keys with the same prefix.) */ if (pfx < btree->prefix_compression_min) pfx = 0; - else + else if (r->key_pfx_last != 0 && pfx > r->key_pfx_last && + pfx < r->key_pfx_last + WT_KEY_PREFIX_PREVIOUS_MINIMUM) + pfx = r->key_pfx_last; + + if (pfx != 0) WT_STAT_DATA_INCRV(session, rec_prefix_compression, pfx); } /* Copy the non-prefix bytes into the key buffer. */ WT_RET(__wt_buf_set(session, &key->buf, (uint8_t *)data + pfx, size - pfx)); } + r->key_pfx_last = pfx; /* Create an overflow object if the data won't fit. */ if (key->buf.size > btree->maxleafkey) { @@ -214,6 +218,7 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) */ if (r->key_pfx_compress_conf) { r->key_pfx_compress = false; + r->key_pfx_last = 0; if (!ovfl_key) WT_RET(__rec_cell_build_leaf_key(session, r, NULL, 0, &ovfl_key)); } @@ -582,6 +587,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) * Turn off prefix and suffix compression until a full key is written into the new page. */ r->key_pfx_compress = r->key_sfx_compress = false; + r->key_pfx_last = 0; continue; } @@ -633,6 +639,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) */ if (r->key_pfx_compress_conf) { r->key_pfx_compress = false; + r->key_pfx_last = 0; if (!ovfl_key) WT_RET(__rec_cell_build_leaf_key(session, r, NULL, 0, &ovfl_key)); } @@ -715,10 +722,13 @@ __wt_rec_row_leaf( WT_TIME_WINDOW tw; WT_UPDATE *upd; WT_UPDATE_SELECT upd_select; + size_t key_size; uint64_t slvg_skip; uint32_t i; + uint8_t key_prefix; bool dictionary, key_onpage_ovfl, ovfl_key; void *copy; + const void *key_data; btree = S2BT(session); hs_cursor = NULL; @@ -764,20 +774,19 @@ __wt_rec_row_leaf( dictionary = false; /* - * Figure out the key: set any cell reference (and unpack it), set any instantiated key - * reference. + * Figure out if the key is an overflow key, and in that case unpack the cell, we'll need it + * later. 
*/ copy = WT_ROW_KEY_COPY(rip); - WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, &ikey, &cell, NULL, NULL)); - if (cell == NULL) - kpack = NULL; - else { + __wt_row_leaf_key_info(page, copy, &ikey, &cell, &key_data, &key_size, &key_prefix); + kpack = NULL; + if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) { kpack = &_kpack; __wt_cell_unpack_kv(session, page->dsk, cell, kpack); } /* Unpack the on-page value cell. */ - __wt_row_leaf_value_cell(session, page, rip, NULL, vpack); + __wt_row_leaf_value_cell(session, page, rip, vpack); /* Look for an update. */ WT_ERR(__wt_rec_upd_select(session, r, NULL, rip, vpack, &upd_select)); @@ -890,7 +899,7 @@ __wt_rec_row_leaf( /* * Keys are part of the name-space, we can't remove them from the in-memory * tree; if an overflow key was deleted without being instantiated (for example, - * cursor-based truncation), do it now. + * cursor-based truncation), instantiate it now. */ if (ikey == NULL) WT_ERR(__wt_row_leaf_key(session, page, rip, tmpkey, true)); @@ -972,25 +981,41 @@ __wt_rec_row_leaf( * previous key (it's a fast path for simple, prefix-compressed keys), or by building * the key from scratch. */ - if (__wt_row_leaf_key_info(page, copy, NULL, &cell, &tmpkey->data, &tmpkey->size)) + __wt_row_leaf_key_info(page, copy, NULL, &cell, &key_data, &key_size, &key_prefix); + if (key_data == NULL) { + if (__wt_cell_type(cell) != WT_CELL_KEY) + goto slow; + kpack = &_kpack; + __wt_cell_unpack_kv(session, page->dsk, cell, kpack); + key_data = kpack->data; + key_size = kpack->size; + key_prefix = kpack->prefix; + } + if (key_prefix == 0) { + tmpkey->data = key_data; + tmpkey->size = key_size; goto build; + } - kpack = &_kpack; - __wt_cell_unpack_kv(session, page->dsk, cell, kpack); - if (kpack->type == WT_CELL_KEY && tmpkey->size >= kpack->prefix && tmpkey->size != 0) { - /* - * Grow the buffer as necessary, ensuring data data has been copied into local - * buffer space, then append the suffix to the prefix already in the buffer. - * - * Don't grow the buffer unnecessarily or copy data we don't need, truncate the - * item's data length to the prefix bytes. - */ - tmpkey->size = kpack->prefix; - WT_ERR(__wt_buf_grow(session, tmpkey, tmpkey->size + kpack->size)); - memcpy((uint8_t *)tmpkey->mem + tmpkey->size, kpack->data, kpack->size); - tmpkey->size += kpack->size; - } else + if (tmpkey->size == 0 || tmpkey->size < key_prefix) + goto slow; + + /* + * Grow the buffer as necessary as well as ensure data has been copied into local buffer + * space, then append the suffix to the prefix already in the buffer. Don't grow the + * buffer unnecessarily or copy data we don't need, truncate the item's CURRENT data + * length to the prefix bytes before growing the buffer. 
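To make the prefix arithmetic above concrete with invented values: if the previously built key was "keyabc" and the next on-page cell carries prefix 3 and suffix "def", the buffer is truncated to the 3 shared bytes and the suffix is appended, producing "keydef". A toy version of the same steps:

#include <cassert>
#include <string>

int main()
{
    std::string tmpkey = "keyabc";     // The key built on the previous iteration.
    const size_t key_prefix = 3;       // Bytes shared with the previous key, taken from the cell.
    const std::string suffix = "def";  // Suffix bytes stored in the cell.

    tmpkey.resize(key_prefix);         // Truncate to the shared prefix ("key").
    tmpkey += suffix;                  // Append the suffix.
    assert(tmpkey == "keydef");
    return 0;
}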
+ */ + tmpkey->size = key_prefix; + WT_ERR(__wt_buf_grow(session, tmpkey, key_prefix + key_size)); + memcpy((uint8_t *)tmpkey->mem + key_prefix, key_data, key_size); + tmpkey->size = key_prefix + key_size; + + if (0) { +slow: WT_ERR(__wt_row_leaf_key_copy(session, page, rip, tmpkey)); + } + build: WT_ERR(__rec_cell_build_leaf_key(session, r, tmpkey->data, tmpkey->size, &ovfl_key)); } @@ -1012,6 +1037,7 @@ build: */ if (r->key_pfx_compress_conf) { r->key_pfx_compress = false; + r->key_pfx_last = 0; if (!ovfl_key) WT_ERR(__rec_cell_build_leaf_key(session, r, NULL, 0, &ovfl_key)); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 97743d7c3ee..24fe6bb252d 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1217,15 +1217,13 @@ __wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len, bool done: /* - * Overflow values can be larger than the maximum page size but still be - * "on-page". If the next key/value pair is larger than space available - * after a split has happened (in other words, larger than the maximum - * page size), create a page sized to hold that one key/value pair. This - * generally splits the page into key/value pairs before a large object, - * the object, and key/value pairs after the object. It's possible other - * key/value pairs will also be aggregated onto the bigger page before - * or after, if the page happens to hold them, but it won't necessarily - * happen that way. + * Overflow values can be larger than the maximum page size but still be "on-page". If the next + * key/value pair is larger than space available after a split has happened (in other words, + * larger than the maximum page size), create a page sized to hold that one key/value pair. This + * generally splits the page into key/value pairs before a large object, the object, and + * key/value pairs after the object. It's possible other key/value pairs will also be aggregated + * onto the bigger page before or after, if the page happens to hold them, but it won't + * necessarily happen that way. 
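A small worked illustration of that rule, with invented numbers: if the maximum page size is 32KB, 8KB remain after a split, and the next pair is a 1MB overflow value, the page is grown to hold just that pair, while the surrounding pairs land on normally sized pages.

#include <algorithm>
#include <cstddef>

// Toy version of the sizing decision above; not the reconciliation code itself.
size_t next_page_size(size_t max_page, size_t space_avail, size_t next_len)
{
    return next_len > space_avail ? std::max(max_page, next_len) : max_page;
}
// next_page_size(32 * 1024, 8 * 1024, 1024 * 1024) yields a roughly 1MB page for the single oversized pair.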
*/ if (r->space_avail < next_len) WT_RET(__rec_split_grow(session, r, next_len)); diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c index bc76391db43..c814a32c935 100644 --- a/src/third_party/wiredtiger/src/schema/schema_create.c +++ b/src/third_party/wiredtiger/src/schema/schema_create.c @@ -62,7 +62,8 @@ __check_imported_ts(WT_SESSION_IMPL *session, const char *uri, const char *confi ckptbase = NULL; txn_global = &S2C(session)->txn_global; - WT_ERR_NOTFOUND_OK(__wt_meta_ckptlist_get_from_config(session, false, &ckptbase, config), true); + WT_ERR_NOTFOUND_OK( + __wt_meta_ckptlist_get_from_config(session, false, &ckptbase, NULL, config), true); if (ret == WT_NOTFOUND) WT_ERR_MSG(session, EINVAL, "%s: import could not find any checkpoint information in supplied metadata", uri); @@ -128,10 +129,11 @@ __create_file( WT_SESSION_IMPL *session, const char *uri, bool exclusive, bool import, const char *config) { WT_CONFIG_ITEM cval; + WT_DECL_ITEM(buf); WT_DECL_ITEM(val); WT_DECL_RET; const char *filename, **p, - *filecfg[] = {WT_CONFIG_BASE(session, file_meta), config, NULL, NULL, NULL}; + *filecfg[] = {WT_CONFIG_BASE(session, file_meta), config, NULL, NULL, NULL, NULL}; char *fileconf, *filemeta; uint32_t allocsize; bool exists, import_repair, is_metadata; @@ -140,6 +142,7 @@ __create_file( import_repair = false; is_metadata = strcmp(uri, WT_METAFILE_URI) == 0; + WT_ERR(__wt_scr_alloc(session, 1024, &buf)); filename = uri; WT_PREFIX_SKIP_REQUIRED(session, filename, "file:"); @@ -200,6 +203,12 @@ __create_file( } WT_ERR(__wt_strndup(session, cval.str, cval.len, &filemeta)); filecfg[2] = filemeta; + /* + * If there is a file metadata provided, reconstruct the incremental backup + * information as the imported file was not part of any backup. + */ + WT_ERR(__wt_reset_blkmod(session, config, buf)); + filecfg[3] = buf->mem; } else { /* * If there is no file metadata provided, the user should be specifying a "repair". @@ -217,14 +226,15 @@ __create_file( WT_ERR(__create_file_block_manager(session, uri, filename, allocsize)); /* - * If creating an ordinary file, update the file ID and current version numbers and strip the - * incremental backup information and checkpoint LSN from the extracted metadata. + * If creating an ordinary file, update the file ID and current version numbers and strip + * checkpoint LSN from the extracted metadata. If importing an existing file, incremental backup + * information is reconstructed inside import repair or when grabbing file metadata. 
*/ if (!is_metadata) { if (!import_repair) { WT_ERR(__wt_scr_alloc(session, 0, &val)); WT_ERR(__wt_buf_fmt(session, val, - "id=%" PRIu32 ",version=(major=%d,minor=%d),checkpoint_backup_info=,checkpoint_lsn=", + "id=%" PRIu32 ",version=(major=%d,minor=%d),checkpoint_lsn=", ++S2C(session)->next_file_id, WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX)); for (p = filecfg; *p != NULL; ++p) @@ -260,6 +270,7 @@ __create_file( WT_ERR(__wt_session_release_dhandle(session)); err: + __wt_scr_free(session, &buf); __wt_scr_free(session, &val); __wt_free(session, fileconf); __wt_free(session, filemeta); diff --git a/src/third_party/wiredtiger/src/schema/schema_plan.c b/src/third_party/wiredtiger/src/schema/schema_plan.c index bd2ae7fa45c..8bc330abac7 100644 --- a/src/third_party/wiredtiger/src/schema/schema_plan.c +++ b/src/third_party/wiredtiger/src/schema/schema_plan.c @@ -214,10 +214,9 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_SKIP)); } /* - * Now copy the value in / out. In the common case, - * where each value is used in one column, we do a - * "next" operation. If the value is used again, we do - * a "reuse" operation to avoid making another copy. + * Now copy the value in / out. In the common case, where each value is used in one + * column, we do a "next" operation. If the value is used again, we do a "reuse" + * operation to avoid making another copy. */ if (!have_it) { WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_NEXT)); diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c index 7145661420f..b0ce8fc3ed7 100644 --- a/src/third_party/wiredtiger/src/schema/schema_rename.c +++ b/src/third_party/wiredtiger/src/schema/schema_rename.c @@ -9,30 +9,6 @@ #include "wt_internal.h" /* - * __rename_blkmod -- - * Reset the incremental backup information for a rename. - */ -static int -__rename_blkmod(WT_SESSION_IMPL *session, const char *oldvalue, WT_ITEM *buf) -{ - WT_CKPT ckpt; - WT_DECL_RET; - - WT_CLEAR(ckpt); - /* - * Replace the old file entries with new file entries. We need to recreate the incremental - * backup information to indicate copying the entire file in its bitmap. - */ - /* First load any existing backup information into a temp checkpoint structure. */ - WT_RET(__wt_meta_blk_mods_load(session, oldvalue, &ckpt, true)); - - /* Take the checkpoint structure and generate the metadata string. */ - ret = __wt_ckpt_blkmod_to_meta(session, buf, &ckpt); - __wt_meta_checkpoint_free(session, &ckpt); - return (ret); -} - -/* * __rename_file -- * WT_SESSION::rename for a file. */ @@ -89,7 +65,7 @@ __rename_file(WT_SESSION_IMPL *session, const char *uri, const char *newuri) WT_ERR(__wt_metadata_remove(session, uri)); filecfg[0] = oldvalue; if (F_ISSET(S2C(session), WT_CONN_INCR_BACKUP)) { - WT_ERR(__rename_blkmod(session, oldvalue, buf)); + WT_ERR(__wt_reset_blkmod(session, oldvalue, buf)); filecfg[1] = buf->mem; } else filecfg[1] = NULL; @@ -135,8 +111,7 @@ __rename_tree(WT_SESSION_IMPL *session, WT_TABLE *table, const char *newuri, con /* * Create the new data source URI and update the schema value. * - * 'name' has the format (colgroup|index):<tablename>[:<suffix>]; - * we need the suffix. + * 'name' has the format (colgroup|index):<tablename>[:<suffix>]; we need the suffix. 
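For reference, extracting the optional suffix from a name of that shape is just a matter of skipping the first two ':'-separated fields. A minimal standalone helper, not the WiredTiger parser:

#include <string>

// "colgroup:mytable:cg1" -> "cg1"; "index:mytable" -> "" (no suffix present).
std::string name_suffix(const std::string &name)
{
    const size_t first = name.find(':');
    if (first == std::string::npos)
        return "";
    const size_t second = name.find(':', first + 1);
    return second == std::string::npos ? "" : name.substr(second + 1);
}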
*/ is_colgroup = WT_PREFIX_MATCH(name, "colgroup:"); if (!is_colgroup && !WT_PREFIX_MATCH(name, "index:")) diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c index eb342f32a5a..215d8713507 100644 --- a/src/third_party/wiredtiger/src/schema/schema_util.c +++ b/src/third_party/wiredtiger/src/schema/schema_util.c @@ -141,9 +141,9 @@ __wt_str_name_check(WT_SESSION_IMPL *session, const char *str) bool skip; /* - * Check if name is somewhere in the WiredTiger name space: it would be - * "bad" if the application truncated the metadata file. Skip any - * leading URI prefix if needed, check and then skip over a table name. + * Check if name is somewhere in the WiredTiger name space: it would be "bad" if the application + * truncated the metadata file. Skip any leading URI prefix if needed, check and then skip over + * a table name. */ name = str; skip = false; diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 338df444cd2..a6796946c99 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1397,7 +1397,7 @@ __session_truncate( WT_ERR(__wt_session_range_truncate(session, uri, start, stop)); err: - TXN_API_END_RETRY(session, ret, 0); + TXN_API_END(session, ret, false); if (ret != 0) WT_STAT_CONN_INCR(session, session_table_truncate_fail); diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c index 057e032a1f1..b4064eb1f69 100644 --- a/src/third_party/wiredtiger/src/session/session_compact.c +++ b/src/third_party/wiredtiger/src/session/session_compact.c @@ -213,7 +213,8 @@ __compact_checkpoint(WT_SESSION_IMPL *session) /* Checkpoints take a lot of time, check if we've run out. */ WT_RET(__wt_session_compact_check_timeout(session)); - if ((ret = __wt_txn_checkpoint(session, checkpoint_cfg, false)) == 0) + ret = __wt_txn_checkpoint(session, checkpoint_cfg, false); + if (ret == 0) return (0); WT_RET_BUSY_OK(ret); diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c index d7099586fc3..f4c20e02746 100644 --- a/src/third_party/wiredtiger/src/support/err.c +++ b/src/third_party/wiredtiger/src/support/err.c @@ -472,8 +472,8 @@ __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_ATTRIBUTE(( handler = session->event_handler; ret = handler->handle_message(handler, wt_session, buf->data); +err: __wt_scr_free(session, &buf); - return (ret); } @@ -501,8 +501,8 @@ __wt_ext_msg_printf(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char handler = session->event_handler; ret = handler->handle_message(handler, wt_session, buf->data); +err: __wt_scr_free(session, &buf); - return (ret); } diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c index c4644bb0aba..33c80a3a0ed 100644 --- a/src/third_party/wiredtiger/src/support/scratch.c +++ b/src/third_party/wiredtiger/src/support/scratch.c @@ -21,15 +21,23 @@ __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) /* * Maintain the existing data: there are 3 cases: - * No existing data: allocate the required memory, and initialize - * the data to reference it. - * Existing data local to the buffer: set the data to the same - * offset in the re-allocated memory. 
- * Existing data not-local to the buffer: copy the data into the - * buffer and set the data to reference it. + * + * 1. No existing data: allocate the required memory, and initialize the data to reference it. + * 2. Existing data local to the buffer: set the data to the same offset in the re-allocated + * memory. The offset in this case is likely a read of an overflow item, the data pointer + * is offset in the buffer in order to skip over the leading data block page header. For + * the same reason, take any offset in the buffer into account when calculating the size + * to allocate, it saves complex calculations in our callers to decide if the buffer is large + * enough in the case of buffers with offset data pointers. + * 3. Existing data not-local to the buffer: copy the data into the buffer and set the data to + * reference it. + * + * Take the offset of the data pointer in the buffer when calculating the size + * needed, overflow items use the data pointer to skip the leading data block page header */ if (WT_DATA_IN_ITEM(buf)) { offset = WT_PTRDIFF(buf->data, buf->mem); + size += offset; copy_data = false; } else { offset = 0; @@ -51,8 +59,14 @@ __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) buf->data = buf->mem; buf->size = 0; } else { - if (copy_data) + if (copy_data) { + /* + * It's easy to corrupt memory if you pass in the wrong size for the final buffer size, + * which is harder to debug than this assert. + */ + WT_ASSERT(session, buf->size <= buf->memsize); memcpy(buf->mem, buf->data, buf->size); + } buf->data = (uint8_t *)buf->mem + offset; } @@ -67,9 +81,12 @@ int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_ATTRIBUTE((format(printf, 3, 4))) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { + WT_DECL_RET; + WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, false); - return (0); +err: + return (ret); } /* @@ -80,6 +97,8 @@ int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_ATTRIBUTE((format(printf, 3, 4))) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { + WT_DECL_RET; + /* * If we're appending data to an existing buffer, any data field should point into the allocated * memory. (It wouldn't be insane to copy any previously existing data at this point, if data @@ -89,7 +108,8 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_VA_ARGS_BUF_FORMAT(session, buf, fmt, true); - return (0); +err: + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index d622d44589e..fb9f7870a54 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -1280,6 +1280,9 @@ static const char *const __stats_connection_desc[] = { "thread-yield: page delete rollback time sleeping for state change (usecs)", "thread-yield: page reconciliation yielded due to child modification", "transaction: Number of prepared updates", + "transaction: Number of prepared updates committed", + "transaction: Number of prepared updates repeated on the same key", + "transaction: Number of prepared updates rolled back", "transaction: prepared transactions", "transaction: prepared transactions committed", "transaction: prepared transactions currently active", @@ -1798,7 +1801,10 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->page_sleep = 0; stats->page_del_rollback_blocked = 0; stats->child_modify_blocked_page = 0; - stats->txn_prepared_updates_count = 0; + stats->txn_prepared_updates = 0; + stats->txn_prepared_updates_committed = 0; + stats->txn_prepared_updates_key_repeated = 0; + stats->txn_prepared_updates_rolledback = 0; stats->txn_prepare = 0; stats->txn_prepare_commit = 0; stats->txn_prepare_active = 0; @@ -2309,7 +2315,10 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS * to->page_sleep += WT_STAT_READ(from, page_sleep); to->page_del_rollback_blocked += WT_STAT_READ(from, page_del_rollback_blocked); to->child_modify_blocked_page += WT_STAT_READ(from, child_modify_blocked_page); - to->txn_prepared_updates_count += WT_STAT_READ(from, txn_prepared_updates_count); + to->txn_prepared_updates += WT_STAT_READ(from, txn_prepared_updates); + to->txn_prepared_updates_committed += WT_STAT_READ(from, txn_prepared_updates_committed); + to->txn_prepared_updates_key_repeated += WT_STAT_READ(from, txn_prepared_updates_key_repeated); + to->txn_prepared_updates_rolledback += WT_STAT_READ(from, txn_prepared_updates_rolledback); to->txn_prepare += WT_STAT_READ(from, txn_prepare); to->txn_prepare_commit += WT_STAT_READ(from, txn_prepare_commit); to->txn_prepare_active += WT_STAT_READ(from, txn_prepare_active); diff --git a/src/third_party/wiredtiger/src/tiered/tiered_config.c b/src/third_party/wiredtiger/src/tiered/tiered_config.c index 23eb24131cc..6971ec4b7b5 100644 --- a/src/third_party/wiredtiger/src/tiered/tiered_config.c +++ b/src/third_party/wiredtiger/src/tiered/tiered_config.c @@ -115,8 +115,8 @@ __wt_tiered_bucket_config( WT_ERR(__wt_strndup(session, prefix.str, prefix.len, &new->bucket_prefix)); storage = nstorage->storage_source; - WT_ERR(storage->ss_customize_file_system(storage, &session->iface, new->bucket, - new->bucket_prefix, new->auth_token, NULL, &new->file_system)); + WT_ERR(storage->ss_customize_file_system( + storage, &session->iface, new->bucket, new->auth_token, NULL, &new->file_system)); new->storage_source = storage; /* If we're creating a new bucket storage, parse the other settings into it. 
*/ diff --git a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c index db45db54f9e..c913f9b33ca 100644 --- a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c +++ b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c @@ -696,10 +696,9 @@ __curtiered_search_near(WT_CURSOR *cursor, int *exactp) } /* - * Prefer larger cursors. There are two reasons: (1) we expect - * prefix searches to be a common case (as in our own indices); - * and (2) we need a way to unambiguously know we have the - * "closest" result. + * Prefer larger cursors. There are two reasons: (1) we expect prefix searches to be a + * common case (as in our own indices); and (2) we need a way to unambiguously know we have + * the "closest" result. */ if (cmp < 0) { if ((ret = c->next(c)) == WT_NOTFOUND) { diff --git a/src/third_party/wiredtiger/src/tiered/tiered_handle.c b/src/third_party/wiredtiger/src/tiered/tiered_handle.c index a1bb6bc37a6..363a9c97140 100644 --- a/src/third_party/wiredtiger/src/tiered/tiered_handle.c +++ b/src/third_party/wiredtiger/src/tiered/tiered_handle.c @@ -146,15 +146,14 @@ __tiered_create_object(WT_SESSION_IMPL *session, WT_TIERED *tiered) WT_ERR(ret); } /* - * Create the name and metadata of the new shared object of the current local object. - * The data structure keeps this id so that we don't have to parse and manipulate strings. - * I.e. if we have file:example-000000002.wt we want object:example-000000002.wtobj. + * Create the name and metadata of the new shared object of the current local object. The data + * structure keeps this id so that we don't have to parse and manipulate strings. */ WT_ERR( __wt_tiered_name(session, &tiered->iface, tiered->current_id, WT_TIERED_NAME_OBJECT, &name)); cfg[0] = WT_CONFIG_BASE(session, object_meta); cfg[1] = tiered->obj_config; - cfg[2] = "readonly=true"; + cfg[2] = "flush=0,readonly=true"; WT_ASSERT(session, tiered->obj_config != NULL); WT_ERR(__wt_config_merge(session, cfg, NULL, (const char **)&config)); __wt_verbose( @@ -162,6 +161,13 @@ __tiered_create_object(WT_SESSION_IMPL *session, WT_TIERED *tiered) /* Create the new shared object. */ WT_ERR(__wt_schema_create(session, name, config)); +#if 0 + /* + * If we get here we have successfully created the object. It is ready to be fully flushed to + * the cloud. Push a work element to let the internal thread do that here. + */ +#endif + err: __wt_free(session, config); __wt_free(session, name); @@ -307,6 +313,10 @@ static int __tiered_switch(WT_SESSION_IMPL *session, const char *config) { WT_DECL_RET; +#if 0 + WT_FILE_SYSTEM *fs; + WT_STORAGE_SOURCE *storage_source; +#endif WT_TIERED *tiered; bool need_object, need_tree, tracking; @@ -357,10 +367,31 @@ __tiered_switch(WT_SESSION_IMPL *session, const char *config) /* We always need to create a local object. */ WT_ERR(__tiered_create_local(session, tiered)); +#if 0 /* - * Note that removal of overlapping local objects is not in the purview of this function. Some - * other mechanism will remove outdated tiers. Here's where it could be done though. + * We expect this part to be done asynchronously in its own thread. First flush the contents of + * the data file to the new cloud object. */ + storage_source = tiered->bstorage->storage_source; + fs = tiered->bucket_storage->file_system; + WT_ASSERT(session, storage_source != NULL); + + /* This call make take a while, and may fail due to network timeout. 
*/ + WT_ERR(storage_source->ss_flush(storage_source, &session->iface, + fs, old_filename, object_name, NULL)); + + /* + * The metadata for the old local object will be initialized with "flush=0". When the flush call + * completes, it can be marked as "flush=1". When that's done, we can finish the flush. The + * flush finish call moves the file from the home directory to the extension's cache. Then the + * extension will own it. + * + * We may need a way to restart flushes for those not completed (after a crash), or failed (due + * to previous network outage). + */ + WT_ERR(storage_source->ss_flush_finish(storage_source, &session->iface, + fs, old_filename, object_name, NULL)); +#endif /* Update the tiered: metadata to new object number and tiered array. */ WT_ERR(__tiered_update_metadata(session, tiered, config)); @@ -424,7 +455,7 @@ __wt_tiered_name( if (LF_ISSET(WT_TIERED_NAME_PREFIX)) WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-", name)); else - WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%010" PRIu64 ".wt", name, id)); + WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%010" PRIu64 ".wtobj", name, id)); } else if (LF_ISSET(WT_TIERED_NAME_OBJECT)) { if (LF_ISSET(WT_TIERED_NAME_PREFIX)) WT_ERR(__wt_buf_fmt(session, tmp, "object:%s-", name)); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 9aa2f085386..dabce04d12f 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -1054,6 +1054,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, #endif size_t not_used; uint32_t hs_btree_id; + char ts_string[3][WT_TS_INT_STRING_SIZE]; bool upd_appended; hs_cursor = NULL; @@ -1063,9 +1064,18 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, WT_RET(__txn_search_prepared_op(session, op, cursorp, &upd)); - __wt_verbose(session, WT_VERB_TRANSACTION, - "resolving prepared op for txnid: %" PRIu64 " that %s", txn->id, - commit ? "committed" : "roll backed"); + if (commit) + __wt_verbose(session, WT_VERB_TRANSACTION, + "commit resolving prepared transaction with txnid: %" PRIu64 + "and timestamp: %s to commit and durable timestamps: %s,%s", + txn->id, __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[0]), + __wt_timestamp_to_string(txn->commit_timestamp, ts_string[1]), + __wt_timestamp_to_string(txn->durable_timestamp, ts_string[2])); + else + __wt_verbose(session, WT_VERB_TRANSACTION, + "rollback resolving prepared transaction with txnid: %" PRIu64 "and timestamp:%s", + txn->id, __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[0])); + /* * Aborted updates can exist in the update chain of our transaction. Generally this will occur * due to a reserved update. As such we should skip over these updates. @@ -1082,7 +1092,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, * we rolled back all associated updates in the previous iteration of this function. */ if (upd == NULL || upd->prepare_state != WT_PREPARE_INPROGRESS) - return (0); + goto prepare_verify; WT_ERR(__txn_commit_timestamps_usage_check(session, op, upd)); /* @@ -1092,9 +1102,14 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, * updates first, the history search logic may race with other sessions modifying the same key * and checkpoint moving the new updates to the history store. * - * For prepared delete, we don't need to fix the history store. 
+ * For prepared delete commit, we don't need to fix the history store. Whereas for rollback, if + * the update is also from the same prepared transaction, restore the update from history store + * or remove the key. */ - if (F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS) && upd->type != WT_UPDATE_TOMBSTONE) { + if (F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS) && + (upd->type != WT_UPDATE_TOMBSTONE || + (!commit && upd->next != NULL && upd->durable_ts == upd->next->durable_ts && + upd->txnid == upd->next->txnid && upd->start_ts == upd->next->start_ts))) { cbt = (WT_CURSOR_BTREE *)(*cursorp); hs_btree_id = S2BT(session)->id; /* Open a history store table cursor. */ @@ -1140,6 +1155,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, if (!commit) { upd->txnid = WT_TXN_ABORTED; + WT_STAT_CONN_INCR(session, txn_prepared_updates_rolledback); continue; } @@ -1172,6 +1188,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, * Resolve the prepared update to be committed update. */ __txn_resolve_prepared_update(session, upd); + WT_STAT_CONN_INCR(session, txn_prepared_updates_committed); } /* @@ -1183,14 +1200,16 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, if (fix_upd != NULL) WT_ERR(__txn_fixup_prepared_update(session, hs_cursor, fix_upd, commit)); +prepare_verify: #ifdef HAVE_DIAGNOSTIC for (; head_upd != NULL; head_upd = head_upd->next) { /* - * Assert if we still have an update from the current transaction that hasn't been aborted. - * Only perform this check if aborting the prepared transaction. + * Assert if we still have an update from the current transaction that hasn't been resolved + * or aborted. */ - WT_ASSERT( - session, commit || head_upd->txnid == WT_TXN_ABORTED || head_upd->txnid != txn->id); + WT_ASSERT(session, + head_upd->txnid == WT_TXN_ABORTED || head_upd->prepare_state == WT_PREPARE_RESOLVED || + head_upd->txnid != txn->id); if (head_upd->txnid == WT_TXN_ABORTED) continue; @@ -1407,12 +1426,18 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) wt_timestamp_t candidate_durable_timestamp, prev_durable_timestamp; uint32_t fileid; u_int i; +#ifdef HAVE_DIAGNOSTIC + u_int prepare_count; +#endif bool locked, prepare, readonly, update_durable_ts; txn = session->txn; conn = S2C(session); cursor = NULL; txn_global = &conn->txn_global; +#ifdef HAVE_DIAGNOSTIC + prepare_count = 0; +#endif locked = false; prepare = F_ISSET(txn, WT_TXN_PREPARE); readonly = txn->mod_count == 0; @@ -1470,15 +1495,13 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval)); /* - * If the user chose the default setting, check whether sync is enabled - * for this transaction (either inherited or via begin_transaction). - * If sync is disabled, clear the field to avoid the log write being - * flushed. + * If the user chose the default setting, check whether sync is enabled for this transaction + * (either inherited or via begin_transaction). If sync is disabled, clear the field to avoid + * the log write being flushed. * - * Otherwise check for specific settings. We don't need to check for - * "on" because that is the default inherited from the connection. If - * the user set anything in begin_transaction, we only override with an - * explicit setting. + * Otherwise check for specific settings. We don't need to check for "on" because that is the + * default inherited from the connection. 
If the user set anything in begin_transaction, we only + * override with an explicit setting. */ if (cval.len == 0) { if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && !F_ISSET(txn, WT_TXN_SYNC_SET)) @@ -1572,6 +1595,9 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) */ if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) WT_ERR(__txn_resolve_prepared_op(session, op, true, &cursor)); +#ifdef HAVE_DIAGNOSTIC + ++prepare_count; +#endif } break; case WT_TXN_OP_REF_DELETE: @@ -1589,6 +1615,10 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_CLEAR(cursor->key); } txn->mod_count = 0; +#ifdef HAVE_DIAGNOSTIC + WT_ASSERT(session, txn->prepare_count == prepare_count); + txn->prepare_count = 0; +#endif if (cursor != NULL) { WT_ERR(cursor->close(cursor)); @@ -1680,11 +1710,10 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_OP *op; WT_UPDATE *upd, *tmp; - int64_t txn_prepared_updates_count; - u_int i; + u_int i, prepared_updates, prepared_updates_key_repeated; txn = session->txn; - txn_prepared_updates_count = 0; + prepared_updates = prepared_updates_key_repeated = 0; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR)); @@ -1749,7 +1778,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) break; } - ++txn_prepared_updates_count; + ++prepared_updates; /* Set prepare timestamp. */ upd->start_ts = txn->prepare_timestamp; @@ -1776,6 +1805,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) if (tmp->type != WT_UPDATE_RESERVE && !F_ISSET(tmp, WT_UPDATE_RESTORED_FAST_TRUNCATE)) { F_SET(op, WT_TXN_OP_KEY_REPEATED); + ++prepared_updates_key_repeated; break; } break; @@ -1788,7 +1818,11 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) break; } } - WT_STAT_CONN_INCR(session, txn_prepared_updates_count); + WT_STAT_CONN_INCRV(session, txn_prepared_updates, prepared_updates); + WT_STAT_CONN_INCRV(session, txn_prepared_updates_key_repeated, prepared_updates_key_repeated); +#ifdef HAVE_DIAGNOSTIC + txn->prepare_count = prepared_updates; +#endif /* Set transaction state to prepare. 
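The new statistics and the diagnostic prepare_count above track updates through the prepared-transaction lifecycle. For orientation, that lifecycle as seen from the public API looks roughly like this; the sketch assumes a string-keyed table, omits error handling, and uses arbitrary hexadecimal timestamps:

#include <wiredtiger.h>

// One prepared update, resolved by commit; rollback_transaction() would resolve it the other way.
void prepared_txn_sketch(WT_SESSION *session, WT_CURSOR *cursor)
{
    session->begin_transaction(session, "isolation=snapshot");
    cursor->set_key(cursor, "key1");
    cursor->set_value(cursor, "value1");
    cursor->insert(cursor);                 // Counted by txn_prepared_updates at prepare time.

    session->prepare_transaction(session, "prepare_timestamp=2a");

    // Commit and durable timestamps must not be earlier than the prepare timestamp.
    session->commit_transaction(session, "commit_timestamp=2b,durable_timestamp=2c");
}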
*/ F_SET(session->txn, WT_TXN_PREPARE); @@ -1819,10 +1853,16 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN_OP *op; WT_UPDATE *upd; u_int i; +#ifdef HAVE_DIAGNOSTIC + u_int prepare_count; +#endif bool prepare, readonly; cursor = NULL; txn = session->txn; +#ifdef HAVE_DIAGNOSTIC + prepare_count = 0; +#endif prepare = F_ISSET(txn, WT_TXN_PREPARE); readonly = txn->mod_count == 0; @@ -1874,6 +1914,9 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) */ if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) WT_TRET(__txn_resolve_prepared_op(session, op, false, &cursor)); +#ifdef HAVE_DIAGNOSTIC + ++prepare_count; +#endif } break; case WT_TXN_OP_REF_DELETE: @@ -1895,6 +1938,10 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) WT_CLEAR(cursor->key); } txn->mod_count = 0; +#ifdef HAVE_DIAGNOSTIC + WT_ASSERT(session, txn->prepare_count == prepare_count); + txn->prepare_count = 0; +#endif if (cursor != NULL) { WT_TRET(cursor->close(cursor)); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index dba739792a2..61720a8adaa 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -301,7 +301,6 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg)); WT_RET(ret); if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) { - WT_ASSERT(session, btree->ckpt == NULL); __checkpoint_update_generation(session); return (0); } @@ -1367,14 +1366,16 @@ __checkpoint_lock_dirty_tree( WT_CONFIG_ITEM cval, k, v; WT_DATA_HANDLE *dhandle; WT_DECL_RET; + size_t ckpt_bytes_allocated; uint64_t now; char *name_alloc; const char *name; - bool is_drop, is_wt_ckpt, skip_ckpt; + bool is_drop, is_wt_ckpt, seen_ckpt_add, skip_ckpt; btree = S2BT(session); ckpt = ckptbase = NULL; dhandle = session->dhandle; + ckpt_bytes_allocated = 0; name_alloc = NULL; /* @@ -1439,12 +1440,30 @@ __checkpoint_lock_dirty_tree( } } + /* + * Discard the saved list of checkpoints, and slow path if this is not a WiredTiger checkpoint + * or if checkpoint drops are involved. Also, if we do not have checkpoint array size, the + * regular checkpoint process did not create the array. It is safer to discard the array in such + * a case. + */ + if (!is_wt_ckpt || is_drop || btree->ckpt_bytes_allocated == 0) + __wt_meta_saved_ckptlist_free(session); + /* If we have to process this btree for any reason, reset the timer and obsolete pages flag. */ WT_BTREE_CLEAN_CKPT(session, btree, 0); F_CLR(btree, WT_BTREE_OBSOLETE_PAGES); - /* Get the list of checkpoints for this file. */ - WT_ERR(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase)); + /* + * Get the list of checkpoints for this file: We try to cache the ckptlist between the + * checkpoints. But there might not be one, as there are operations that can invalidate a + * ckptlist. So, use a cached ckptlist if there is one. Otherwise go through slow path of + * re-generating the ckptlist by reading the metadata. Also, we avoid using a cached checkpoint + * list for metadata. + */ + if (WT_IS_METADATA(dhandle) || + __wt_meta_saved_ckptlist_get(session, dhandle->name, &ckptbase) != 0) + WT_ERR( + __wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase, &ckpt_bytes_allocated)); /* We may be dropping specific checkpoints, check the configuration. 
*/ if (cfg != NULL) { @@ -1488,19 +1507,36 @@ __checkpoint_lock_dirty_tree( WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session, ret = __checkpoint_lock_dirty_tree_int(session, is_checkpoint, force, btree, ckpt, ckptbase)); WT_ERR(ret); - if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) - goto err; - WT_ASSERT(session, btree->ckpt == NULL && !F_ISSET(btree, WT_BTREE_SKIP_CKPT)); - btree->ckpt = ckptbase; + /* + * If we decided to skip checkpointing, we need to remove the new checkpoint entry we might have + * appended to the list. + */ + seen_ckpt_add = false; + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) { + WT_CKPT_FOREACH_NAME_OR_ORDER (ckptbase, ckpt) { + /* Checkpoint(s) to be added are always at the end of the list. */ + WT_ASSERT(session, !seen_ckpt_add || F_ISSET(ckpt, WT_CKPT_ADD)); + if (F_ISSET(ckpt, WT_CKPT_ADD)) { + seen_ckpt_add = true; + __wt_meta_checkpoint_free(session, ckpt); + } + } + } - if (0) { + if (ckptbase->name != NULL) { + btree->ckpt = ckptbase; + btree->ckpt_bytes_allocated = ckpt_bytes_allocated; + } else { + /* It is possible that we do not have any checkpoint in the list. */ err: __wt_meta_ckptlist_free(session, &ckptbase); + __wt_meta_saved_ckptlist_free(session); } skip: __wt_free(session, name_alloc); + WT_UNUSED(seen_ckpt_add); return (ret); } @@ -1642,6 +1678,64 @@ __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, WT_TIME_AGGREGAT } /* + * __checkpoint_save_ckptlist -- + * Post processing of the ckptlist to carry forward a cached list for the next checkpoint. + */ +static int +__checkpoint_save_ckptlist(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) +{ + WT_CKPT *ckpt, *ckpt_itr; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + ckpt_itr = ckptbase; + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_CKPT_FOREACH (ckptbase, ckpt) { + /* Remove any deleted checkpoints, by shifting the array. */ + if (F_ISSET(ckpt, WT_CKPT_DELETE)) { + __wt_meta_checkpoint_free(session, ckpt); + continue; + } + + /* Clean up block manager information. */ + __wt_free(session, ckpt->bpriv); + ckpt->bpriv = NULL; + + /* Update the internal checkpoints to their full names, with the generation count suffix. */ + if (strcmp(ckpt->name, WT_CHECKPOINT) == 0) { + WT_ERR(__wt_buf_fmt(session, tmp, "%s.%" PRId64, WT_CHECKPOINT, ckpt->order)); + __wt_free(session, ckpt->name); + WT_ERR(__wt_strdup(session, tmp->mem, &ckpt->name)); + } + + /* Reset the flags, and mark a checkpoint fake if there is no address. */ + ckpt->flags = 0; + if (ckpt->addr.size == 0) { + WT_ASSERT(session, ckpt->addr.data == NULL); + F_SET(ckpt, WT_CKPT_FAKE); + } + + /* Shift the valid checkpoints, if there are deleted checkpoints in the list. */ + if (ckpt_itr != ckpt) { + *ckpt_itr = *ckpt; + WT_CLEAR(*ckpt); + } + ckpt_itr++; + } + + /* + * Confirm that the last checkpoint has a metadata entry that we can use to base a new + * checkpoint on. + */ + ckpt_itr--; + WT_ASSERT(session, ckpt_itr->block_metadata != NULL); + +err: + __wt_scr_free(session, &tmp); + return (ret); +} + +/* * __checkpoint_tree -- * Checkpoint a single tree. Assumes all necessary locks have been acquired by the caller. */ @@ -1785,7 +1879,15 @@ err: conn->modified = true; } - __wt_meta_ckptlist_free(session, &btree->ckpt); + /* For a successful checkpoint, post process the ckptlist, to keep a cached copy around. 
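A simplified standalone rendering of what __checkpoint_save_ckptlist does to the cached list: drop deleted entries by compacting the array, expand internal checkpoint names with their generation suffix, and mark entries without an address as fake. The struct and function here are stand-ins for the WT_CKPT handling, not the real types:

#include <cstdint>
#include <string>
#include <vector>

struct ckpt_entry {
    std::string name;
    int64_t order;
    size_t addr_size;
    bool deleted, fake;
};

// Post-process the list kept for the next checkpoint, mirroring the description above.
void save_ckptlist_sketch(std::vector<ckpt_entry> &list)
{
    std::vector<ckpt_entry> kept;
    for (auto ckpt : list) {
        if (ckpt.deleted)
            continue;                              // Deleted checkpoints are removed.
        if (ckpt.name == "WiredTigerCheckpoint")   // Internal checkpoints get the full, suffixed name.
            ckpt.name += "." + std::to_string(ckpt.order);
        ckpt.fake = (ckpt.addr_size == 0);         // No address means the checkpoint is fake.
        kept.push_back(ckpt);
    }
    list.swap(kept);
}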
*/ + if (ret != 0 || WT_IS_METADATA(session->dhandle) || F_ISSET(conn, WT_CONN_CLOSING)) + __wt_meta_saved_ckptlist_free(session); + else { + ret = __checkpoint_save_ckptlist(session, btree->ckpt); + /* Discard the saved checkpoint list if processing the list did not work. */ + if (ret != 0) + __wt_meta_saved_ckptlist_free(session); + } return (ret); } @@ -1880,13 +1982,20 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) !WT_IS_METADATA(session->dhandle) || FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_METADATA)); + /* Discard the cached checkpoint list when checkpointing a single file by itself. */ + __wt_meta_saved_ckptlist_free(session); + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); force = cval.val != 0; WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg)); - WT_RET(ret); - if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT)) - return (0); - return (__checkpoint_tree(session, true, cfg)); + if (ret != 0 || F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT)) + goto done; + ret = __checkpoint_tree(session, true, cfg); + +done: + /* Do not store the cached checkpoint list when checkpointing a single file alone. */ + __wt_meta_saved_ckptlist_free(session); + return (ret); } /* @@ -1948,6 +2057,9 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) (!F_ISSET(S2C(session), WT_CONN_FILE_CLOSE_SYNC) && !metadata))) return (__wt_set_return(session, EBUSY)); + /* Discard the cached checkpoint list when checkpointing a single file by itself. */ + __wt_meta_saved_ckptlist_free(session); + /* * Make sure there isn't a potential race between backup copying the metadata and a checkpoint * changing the metadata. Backup holds both the checkpoint and schema locks. Checkpoint should @@ -1975,6 +2087,9 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT)) ret = __checkpoint_tree(session, false, NULL); + /* Do not store the cached checkpoint list when checkpointing a single file alone. */ + __wt_meta_saved_ckptlist_free(session); + if (need_tracking) WT_TRET(__wt_meta_track_off(session, true, ret != 0)); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index de2ff910072..5b5482524f9 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -300,7 +300,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page /* Get the full update value from the data store. */ unpack = &_unpack; - __wt_row_leaf_value_cell(session, page, rip, NULL, unpack); + __wt_row_leaf_value_cell(session, page, rip, unpack); } else { /* Unpack a column cell. 
*/ WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); @@ -588,7 +588,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_COL *cip, W WT_ASSERT(session, (rip != NULL && cip == NULL) || (rip == NULL && cip != NULL)); if (rip != NULL) - __wt_row_leaf_value_cell(session, page, rip, NULL, vpack); + __wt_row_leaf_value_cell(session, page, rip, vpack); else { kcell = WT_COL_PTR(page, cip); __wt_cell_unpack_kv(session, page->dsk, kcell, vpack); diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c index 36eea69648a..ba0b019453a 100644 --- a/src/third_party/wiredtiger/src/utilities/util_load.c +++ b/src/third_party/wiredtiger/src/utilities/util_load.c @@ -440,12 +440,12 @@ config_update(WT_SESSION *session, char **list) return (util_err(session, errno, NULL)); /* - * For each match, rewrite the dump configuration as described by any - * command-line configuration arguments. + * For each match, rewrite the dump configuration as described by any command-line configuration + * arguments. * - * New filenames will be chosen as part of the table load, remove all - * "filename=", "source=" and other configurations that foil loading - * from the values; we call an unpublished API to do the work. + * New filenames will be chosen as part of the table load, remove all "filename=", "source=" and + * other configurations that foil loading from the values; we call an unpublished API to do the + * work. */ for (listp = list; *listp != NULL; listp += 2) { cnt = 0; diff --git a/src/third_party/wiredtiger/src/utilities/util_load_json.c b/src/third_party/wiredtiger/src/utilities/util_load_json.c index 38801769e63..d1a92944a43 100644 --- a/src/third_party/wiredtiger/src/utilities/util_load_json.c +++ b/src/third_party/wiredtiger/src/utilities/util_load_json.c @@ -382,11 +382,9 @@ json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags) } /* - * Allow any ordering of 'config', 'colgroups', - * 'indices' before 'data', which must appear last. - * The non-'data' items build up a list of entries - * that created in our session before the data is - * inserted. + * Allow any ordering of 'config', 'colgroups', 'indices' before 'data', which must appear + * last. The non-'data' items build up a list of entries that created in our session before + * the data is inserted. 
*/ for (;;) { if (json_skip(session, ins, json_markers) != 0) diff --git a/src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt b/src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt index b46fed225eb..f5d5c916bdc 100644 --- a/src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt +++ b/src/third_party/wiredtiger/test/cppsuite/configs/config_example_test_default.txt @@ -1,48 +1,4 @@ -# Same parameters as config_poc_test_default -duration_seconds=10, -cache_size_mb=1000, -enable_logging=true, -runtime_monitor= -( - op_count=3, - interval=s, - stat_cache_size= - ( - enabled=true, - limit=100 - ) -), -timestamp_manager= -( - enabled=true, - oldest_lag=1, - stable_lag=1 -), -workload_generator= -( - collection_count=2, - key_count=5, - key_size=1, - ops_per_transaction= - ( - min=5, - max=50 - ), - read_threads=1, - update_threads=1, - value_size=10, - update_config= - ( - op_count=1, - interval=s - ), - insert_config= - ( - op_count=1, - interval=s - ) -), -workload_tracking= -( - enabled=true -) +# Example configuration file, as default are added automatically only non default configurations +# need to be defined. +duration_seconds=5, +cache_size_mb=250 diff --git a/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt b/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt index c677142234d..6caaa4d4456 100644 --- a/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt +++ b/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_default.txt @@ -3,23 +3,14 @@ # Used as a basic test for the framework. duration_seconds=10, cache_size_mb=1000, -enable_logging=true, runtime_monitor= ( - op_count=3, - interval=s, stat_cache_size= ( enabled=true, limit=100 ) ), -timestamp_manager= -( - enabled=true, - oldest_lag=1, - stable_lag=1 -), workload_generator= ( collection_count=2, @@ -31,20 +22,5 @@ workload_generator= max=50 ), read_threads=1, - update_threads=1, - update_config= - ( - op_count=1, - interval=s - ), - insert_config= - ( - op_count=1, - interval=s - ), - value_size=10 + update_threads=1 ), -workload_tracking= -( - enabled=true -) diff --git a/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt b/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt index 6067bea3983..6eeda0ab7c0 100644 --- a/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt +++ b/src/third_party/wiredtiger/test/cppsuite/configs/config_poc_test_stress.txt @@ -6,19 +6,12 @@ cache_size_mb=5000, enable_logging=true, runtime_monitor= ( - rate_per_second=3, stat_cache_size= ( enabled=true, limit=100 ) ), -timestamp_manager= -( - enabled=true, - oldest_lag=1, - stable_lag=1 -), workload_generator= ( collection_count=2, @@ -32,8 +25,4 @@ workload_generator= read_threads=1, update_threads=1, value_size=2000 -), -workload_tracking= -( - enabled=true -) +),
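These configuration files shrink because the test framework now merges the supplied test configuration with each test's defaults (see the configuration.h change further down); only non-default values need to be written out. Ignoring nested sub-configurations, the merge behaves like a map overlay, roughly:

#include <map>
#include <string>

// Flat-key illustration of the default/user merge; the real merge also recurses into "(...)" sub-configs.
std::map<std::string, std::string> merge_flat(
    std::map<std::string, std::string> defaults, const std::map<std::string, std::string> &user)
{
    for (const auto &kv : user)
        defaults[kv.first] = kv.second;   // User-supplied values override the defaults.
    return defaults;                      // Anything not supplied keeps its default value.
}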
\ No newline at end of file diff --git a/src/third_party/wiredtiger/test/cppsuite/create_test.sh b/src/third_party/wiredtiger/test/cppsuite/create_test.sh new file mode 100755 index 00000000000..91f506e39e9 --- /dev/null +++ b/src/third_party/wiredtiger/test/cppsuite/create_test.sh @@ -0,0 +1,81 @@ +#! /bin/bash + +# First argument needs to be the name of the script. +if [ $# -eq 0 ] + then + echo "Please give a name to your test i.e ./s_new_test my_test" + exit 128 +fi + +# Check the test name +if [[ $1 =~ ^[0-9a-zA-Z_-]+$ ]];then + echo "Generating test: $1..." +else + echo "Invalid test name. Only alphanumeric characters are allowed. \"_\" and \"-\" can be used too." + exit 128 +fi + +# Check if the test already exists. +FILE=tests/$1.cxx +if test -f "$FILE"; then + echo "$FILE cannot be created as it already exists." + exit 1 +fi + +# Check if default configuration associated to the test already exists. +CONFIG=configs/config_$1_default.txt +if test -f "$CONFIG"; then + echo "$CONFIG cannot be created as it already exists." + exit 1 +fi + +# Copy the default template. +cp tests/example_test.cxx $FILE +echo "Created $FILE." +cp configs/config_example_test_default.txt $CONFIG +echo "Created $CONFIG." + +# Replace example_test with the new test name. +SEARCH="example_test" +sed -i "s/$SEARCH/$1/" $FILE +echo "Updated $FILE." + +# Replace the first line of the configuration file. +REPLACE="# Configuration for $1." +sed -i "1s/.*/$REPLACE/" $CONFIG +echo "Updated $CONFIG." + +# Include the new test in run.cxx +FILE=tests/run.cxx +SEARCH="#include \"example_test.cxx\"" +VALUE="#include \"$1.cxx\"" +sed -i "/$SEARCH/a $VALUE" $FILE + +# Add the new test to the run_test() method +SEARCH="example_test(config, test_name).run()" +LINE_1="\ else if (test_name == \"$1\")\n" +LINE_2="\ $1(config, test_name).run();" +sed -i "/$SEARCH/a $LINE_1$LINE_2" $FILE + +# Add the new test to all existing tests. +SEARCH="all_tests = {\"example_test\"" +REPLACE="$SEARCH, \"$1\"" +sed -i "s/$SEARCH/$REPLACE/" $FILE +echo "Updated $FILE." + +# Add the new test to test_data.py +FILE=../../dist/test_data.py +SEARCH="example_test" +LINE_1="\ '$1' : Method(test_config)," +sed -i "/$SEARCH/a $LINE_1" $FILE +echo "Updated $FILE." + +# Trigger s_all +echo "Running s_all.." +cd ../../dist +./s_all + +# Last changes to be done manually +echo "Follow the next steps to execute your new test:" +echo "1. Start editing $1.cxx" +echo "2. 
Compile your changes, go to build_posix/test/cppsuite and run your test with ./run -t $1" diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h index 91b165d8f29..341932a0236 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/core/component.h @@ -88,7 +88,7 @@ class component { } bool - is_enabled() const + enabled() const { return _enabled; } diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h index c2b9494487f..7eaa96214cb 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/core/configuration.h @@ -29,7 +29,9 @@ #ifndef CONFIGURATION_H #define CONFIGURATION_H +#include <algorithm> #include <string> +#include <stack> extern "C" { #include "test_util.h" @@ -40,14 +42,22 @@ enum class types { BOOL, INT, STRING, STRUCT }; namespace test_harness { class configuration { public: - configuration(const std::string &test_config_name, const std::string &config) : _config(config) + configuration(const std::string &test_config_name, const std::string &config) { + const auto *config_entry = __wt_test_config_match(test_config_name.c_str()); + if (config_entry == nullptr) + testutil_die(EINVAL, "failed to match test config name"); + std::string default_config = std::string(config_entry->base); + /* Merge in the default configuration. */ + _config = merge_default_config(default_config, config); + debug_print("Running with enriched config: " + _config, DEBUG_INFO); + int ret = wiredtiger_test_config_validate( - nullptr, nullptr, test_config_name.c_str(), config.c_str()); + nullptr, nullptr, test_config_name.c_str(), _config.c_str()); if (ret != 0) testutil_die(EINVAL, "failed to validate given config, ensure test config exists"); ret = - wiredtiger_config_parser_open(nullptr, config.c_str(), config.size(), &_config_parser); + wiredtiger_config_parser_open(nullptr, _config.c_str(), _config.size(), &_config_parser); if (ret != 0) testutil_die(EINVAL, "failed to create configuration parser for provided config"); } @@ -173,6 +183,104 @@ class configuration { return func(value); } + /* + * Merge together two configuration strings, the user one and the default one. + */ + static std::string + merge_default_config(const std::string &default_config, const std::string &user_config) + { + std::string merged_config; + auto split_default_config = split_config(default_config); + auto split_user_config = split_config(user_config); + auto user_it = split_user_config.begin(); + for (auto default_it = split_default_config.begin(); + default_it != split_default_config.end(); ++default_it) { + if (user_it->first != default_it->first) + /* The default does not exist in the user configuration, add it. */ + merged_config += default_it->first + "=" + default_it->second; + else { + /* If we have a sub config merge it in. */ + if (user_it->second[0] == '(') + merged_config += default_it->first + "=(" + + merge_default_config(default_it->second, user_it->second) + ')'; + else + /* Add the user configuration as it exists. */ + merged_config += user_it->first + "=" + user_it->second; + ++user_it; + } + /* Add a comma after every item we add except the last one. 
*/ + if (split_default_config.end() - default_it != 1) + merged_config += ","; + } + return (merged_config); + } + + /* + * Split a config string into keys and values, taking care to not split incorrectly when we have + * a sub config. + */ + static std::vector<std::pair<std::string, std::string>> + split_config(const std::string &config) + { + std::string cut_config = config; + std::vector<std::pair<std::string, std::string>> split_config; + std::string key = "", value = ""; + bool in_subconfig = false; + bool expect_value = false; + std::stack<char> subconfig_parens; + + /* All configuration strings must be at least 2 characters. */ + testutil_assert(config.size() > 1); + + /* Remove prefix and trailing "()". */ + if (config[0] == '(') + cut_config = config.substr(1, config.size() - 2); + + size_t start = 0, len = 0; + for (size_t i = 0; i < cut_config.size(); ++i) { + if (cut_config[i] == '(') { + subconfig_parens.push(cut_config[i]); + in_subconfig = true; + } + if (cut_config[i] == ')') { + subconfig_parens.pop(); + in_subconfig = !subconfig_parens.empty(); + } + if (cut_config[i] == '=' && !in_subconfig) { + expect_value = true; + key = cut_config.substr(start, len); + start += len + 1; + len = 0; + continue; + } + if (cut_config[i] == ',' && !in_subconfig) { + expect_value = false; + if (start + len >= cut_config.size()) + break; + value = cut_config.substr(start, len); + start += len + 1; + len = 0; + split_config.push_back(std::make_pair(key, value)); + continue; + } + ++len; + } + if (expect_value) { + value = cut_config.substr(start, len); + split_config.push_back(std::make_pair(key, value)); + } + + /* We have to sort the config here otherwise we will match incorrectly while merging. */ + std::sort(split_config.begin(), split_config.end(), comparator); + return (split_config); + } + + static bool + comparator(std::pair<std::string, std::string> a, std::pair<std::string, std::string> b) + { + return (a.first < b.first); + } + std::string _config; WT_CONFIG_PARSER *_config_parser = nullptr; }; diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h index b7897eb39f1..bc559a03104 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/runtime_monitor.h @@ -64,7 +64,7 @@ class statistic { virtual ~statistic() {} bool - is_enabled() const + enabled() const { return _enabled; } @@ -154,7 +154,7 @@ class runtime_monitor : public component { do_work() { for (const auto &it : _stats) { - if (it->is_enabled()) + if (it->enabled()) it->check(_cursor); } } diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/test.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/test.h index a753e131f0f..f5049df074d 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/test.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/test.h @@ -102,7 +102,7 @@ class test : public database_operation { run() { int64_t cache_size_mb, duration_seconds; - bool enable_logging, is_success = true; + bool enable_logging; /* Build the database creation config string. */ std::string db_create_config = CONNECTION_CREATE; @@ -124,6 +124,10 @@ class test : public database_operation { for (const auto &it : _components) _thread_manager->add_thread(&component::run, it); + /* The initial population phase needs to be finished before starting the actual test. 
*/ + while (_workload_generator->enabled() && !_workload_generator->db_populated()) + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + /* The test will run for the duration as defined in the config. */ duration_seconds = _config->get_int(DURATION_SECONDS); testutil_assert(duration_seconds >= 0); @@ -135,13 +139,13 @@ class test : public database_operation { _thread_manager->join(); /* Validation stage. */ - if (_workload_tracking->is_enabled()) { + if (_workload_tracking->enabled()) { workload_validation wv; - is_success = wv.validate(_workload_tracking->get_operation_table_name(), + wv.validate(_workload_tracking->get_operation_table_name(), _workload_tracking->get_schema_table_name(), _workload_generator->get_database()); } - debug_print(is_success ? "SUCCESS" : "FAILED", DEBUG_INFO); + debug_print("SUCCESS", DEBUG_INFO); connection_manager::instance().close(); } diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h index da09a08c9d8..a2694f6987c 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/util/debug_utils.h @@ -34,7 +34,6 @@ /* Define helpful functions related to debugging. */ namespace test_harness { -#define DEBUG_ABORT -1 #define DEBUG_ERROR 0 #define DEBUG_INFO 1 #define DEBUG_TRACE 2 diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h index 07e7c007ea7..c2a7ed9f6a6 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_model.h @@ -42,9 +42,6 @@ struct key_t { bool exists; }; -/* Iterator type used to iterate over keys that are stored in the data model. */ -typedef std::map<test_harness::key_value_t, test_harness::key_t>::const_iterator keys_iterator_t; - /* Representation of a value. */ struct value_t { key_value_t value; @@ -59,18 +56,6 @@ struct collection_t { /* Representation of the collections in memory. */ class database { public: - const keys_iterator_t - get_collection_keys_begin(const std::string &collection_name) const - { - return (collections.at(collection_name).keys.begin()); - } - - const keys_iterator_t - get_collection_keys_end(const std::string &collection_name) const - { - return (collections.at(collection_name).keys.end()); - } - const std::vector<std::string> get_collection_names() const { diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h index 7a88ed9b662..fc97c1e381c 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/database_operation.h @@ -46,7 +46,7 @@ class database_operation { * - Open a cursor on each collection. * - Insert m key/value pairs in each collection. Values are random strings which size is * defined by the configuration. - * - Store in memory the created collections and the generated keys that were inserted. + * - Store in memory the created collections. 
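/*
 * A reduced sketch of the synchronization added to test::run() above: the main thread sleeps in
 * short increments until the workload generator reports that the initial population is complete,
 * and only then starts the timed phase of the test. The atomic flag below is a stand-in for the
 * workload generator's state, not the harness member itself.
 */
#include <atomic>
#include <chrono>
#include <thread>

static std::atomic<bool> population_done{false};

static void
wait_for_population_sketch()
{
    while (!population_done.load())
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
}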
*/ virtual void populate(database &database, timestamp_manager *timestamp_manager, configuration *config, @@ -58,7 +58,7 @@ class database_operation { int64_t collection_count, key_count, key_cpt, key_size, value_size; std::string collection_name, cfg, home; key_value_t generated_key, generated_value; - bool ts_enabled = timestamp_manager->is_enabled(); + bool ts_enabled = timestamp_manager->enabled(); cursor = nullptr; collection_count = key_count = key_size = value_size = 0; @@ -67,13 +67,14 @@ class database_operation { session = connection_manager::instance().create_session(); /* Create n collections as per the configuration and store each collection name. */ collection_count = config->get_int(COLLECTION_COUNT); - for (int i = 0; i < collection_count; ++i) { + for (size_t i = 0; i < collection_count; ++i) { collection_name = "table:collection" + std::to_string(i); database.collections[collection_name] = {}; testutil_check( session->create(session, collection_name.c_str(), DEFAULT_FRAMEWORK_SCHEMA)); ts = timestamp_manager->get_next_ts(); - testutil_check(tracking->save(tracking_operation::CREATE, collection_name, 0, "", ts)); + tracking->save_schema_operation( + tracking_operation::CREATE_COLLECTION, collection_name, ts); } debug_print(std::to_string(collection_count) + " collections created", DEBUG_TRACE); @@ -89,11 +90,13 @@ class database_operation { for (const auto &it_collections : database.collections) { collection_name = it_collections.first; key_cpt = 0; - /* WiredTiger lets you open a cursor on a collection using the same pointer. When a - * session is closed, WiredTiger APIs close the cursors too. */ + /* + * WiredTiger lets you open a cursor on a collection using the same pointer. When a + * session is closed, WiredTiger APIs close the cursors too. + */ testutil_check( session->open_cursor(session, collection_name.c_str(), NULL, NULL, &cursor)); - for (size_t j = 0; j < key_count; ++j) { + for (size_t i = 0; i < key_count; ++i) { /* Generation of a unique key. */ generated_key = number_to_string(key_size, key_cpt); ++key_cpt; @@ -106,16 +109,12 @@ class database_operation { ts = timestamp_manager->get_next_ts(); if (ts_enabled) testutil_check(session->begin_transaction(session, "")); - testutil_check(insert(cursor, tracking, collection_name, generated_key.c_str(), - generated_value.c_str(), ts)); + insert(cursor, tracking, collection_name, generated_key.c_str(), + generated_value.c_str(), ts); if (ts_enabled) { cfg = std::string(COMMIT_TS) + "=" + timestamp_manager->decimal_to_hex(ts); testutil_check(session->commit_transaction(session, cfg.c_str())); } - /* Update the memory representation of the collections. */ - database.collections[collection_name].keys[generated_key].exists = true; - /* Values are not stored here. */ - database.collections[collection_name].values = nullptr; } } debug_print("Populate stage done", DEBUG_TRACE); @@ -150,13 +149,15 @@ class database_operation { virtual void update_operation(thread_context &context, WT_SESSION *session) { + WT_DECL_RET; WT_CURSOR *cursor; wt_timestamp_t ts; std::vector<WT_CURSOR *> cursors; - std::string collection_name; std::vector<std::string> collection_names = context.get_collection_names(); - key_value_t generated_value, key; - int64_t cpt, value_size = context.get_value_size(); + key_value_t key, generated_value; + const char *key_tmp; + int64_t value_size = context.get_value_size(); + uint64_t i; testutil_assert(session != nullptr); /* Get a cursor for each collection in collection_names. 
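/*
 * A hedged sketch of the per-key insert pattern populate() uses when timestamps are enabled: each
 * insert runs in its own transaction committed at the next timestamp handed out by the timestamp
 * manager. Error handling is reduced to returning the WiredTiger code; the hexadecimal commit
 * timestamp string mirrors what the harness builds from its COMMIT_TS constant.
 */
#include <cinttypes>
#include <cstdio>
#include <wiredtiger.h>

static int
timestamped_insert_sketch(
  WT_SESSION *session, WT_CURSOR *cursor, const char *key, const char *value, uint64_t commit_ts)
{
    char cfg[64];
    int ret;

    if ((ret = session->begin_transaction(session, nullptr)) != 0)
        return (ret);
    cursor->set_key(cursor, key);
    cursor->set_value(cursor, value);
    if ((ret = cursor->insert(cursor)) != 0) {
        (void)session->rollback_transaction(session, nullptr);
        return (ret);
    }
    /* WiredTiger expects timestamps as hexadecimal strings. */
    std::snprintf(cfg, sizeof(cfg), "commit_timestamp=%" PRIx64, commit_ts);
    return (session->commit_transaction(session, cfg));
}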
*/ @@ -165,17 +166,31 @@ class database_operation { cursors.push_back(cursor); } - cpt = 0; - /* Walk each cursor. */ - for (const auto &it : cursors) { - collection_name = collection_names[cpt]; - /* Walk each key. */ - for (keys_iterator_t iter_key = context.get_collection_keys_begin(collection_name); - iter_key != context.get_collection_keys_end(collection_name); ++iter_key) { - /* Do not process removed keys. */ - if (!iter_key->second.exists) - continue; - + /* + * Update each collection while the test is running. + */ + i = 0; + while (context.is_running() && !collection_names.empty()) { + if (i >= collection_names.size()) + i = 0; + ret = cursors[i]->next(cursors[i]); + /* If we have reached the end of the collection, reset. */ + if (ret == WT_NOTFOUND) { + testutil_check(cursors[i]->reset(cursors[i])); + ++i; + } else if (ret != 0) + /* Stop updating in case of an error. */ + testutil_die(DEBUG_ERROR, "update_operation: cursor->next() failed: %d", ret); + else { + testutil_check(cursors[i]->get_key(cursors[i], &key_tmp)); + /* + * The retrieved key needs to be passed inside the update function. However, the + * update API doesn't guarantee our buffer will still be valid once it is called, as + * such we copy the buffer and then pass it into the API. + */ + key = key_value_t(key_tmp); + generated_value = + random_generator::random_generator::instance().generate_string(value_size); ts = context.get_timestamp_manager()->get_next_ts(); /* Start a transaction if possible. */ @@ -183,17 +198,15 @@ class database_operation { context.begin_transaction(session, ""); context.set_commit_timestamp(session, ts); } - generated_value = - random_generator::random_generator::instance().generate_string(value_size); - testutil_check(update(context.get_tracking(), it, collection_name, - iter_key->first.c_str(), generated_value.c_str(), ts)); + + update(context.get_tracking(), cursors[i], collection_names[i], key.c_str(), + generated_value.c_str(), ts); /* Commit the current transaction if possible. */ context.increment_operation_count(); if (context.can_commit_transaction()) context.commit_transaction(session, ""); } - ++cpt; } /* @@ -211,48 +224,34 @@ class database_operation { private: /* WiredTiger APIs wrappers for single operations. 
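/*
 * An illustrative reduction of the cursor walk the rewritten update_operation performs:
 * round-robin over one cursor per collection, advancing each cursor until WT_NOTFOUND says the end
 * was reached, then resetting it and moving to the next collection. The key returned by get_key is
 * copied into a std::string before it is reused, because the underlying buffer is only valid until
 * the next operation on that cursor.
 */
#include <atomic>
#include <string>
#include <vector>
#include <wiredtiger.h>

static void
round_robin_walk_sketch(const std::vector<WT_CURSOR *> &cursors, const std::atomic<bool> &running)
{
    size_t i = 0;

    while (running.load() && !cursors.empty()) {
        if (i >= cursors.size())
            i = 0;
        WT_CURSOR *c = cursors[i];
        int ret = c->next(c);
        if (ret == WT_NOTFOUND) {
            /* End of this collection: reset and move on. */
            (void)c->reset(c);
            ++i;
            continue;
        }
        if (ret != 0)
            break;
        const char *key_tmp = nullptr;
        if (c->get_key(c, &key_tmp) != 0)
            break;
        /* Copy the key before the buffer can be invalidated. */
        std::string key(key_tmp);
        /* ... generate a fresh value and update "key" inside a transaction here ... */
        (void)key;
    }
}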
*/ template <typename K, typename V> - int + void insert(WT_CURSOR *cursor, workload_tracking *tracking, const std::string &collection_name, const K &key, const V &value, wt_timestamp_t ts) { - int error_code; - testutil_assert(cursor != nullptr); + cursor->set_key(cursor, key); cursor->set_value(cursor, value); - error_code = cursor->insert(cursor); + testutil_check(cursor->insert(cursor)); + debug_print("key/value inserted", DEBUG_TRACE); - if (error_code == 0) { - debug_print("key/value inserted", DEBUG_TRACE); - error_code = - tracking->save(tracking_operation::INSERT, collection_name, key, value, ts); - } else - debug_print("key/value insertion failed", DEBUG_ERROR); - - return (error_code); + tracking->save_operation(tracking_operation::INSERT, collection_name, key, value, ts); } template <typename K, typename V> - static int + static void update(workload_tracking *tracking, WT_CURSOR *cursor, const std::string &collection_name, K key, V value, wt_timestamp_t ts) { - int error_code; - testutil_assert(tracking != nullptr); testutil_assert(cursor != nullptr); + cursor->set_key(cursor, key); cursor->set_value(cursor, value); - error_code = cursor->update(cursor); - - if (error_code == 0) { - debug_print("key/value update", DEBUG_TRACE); - error_code = - tracking->save(tracking_operation::UPDATE, collection_name, key, value, ts); - } else - debug_print("key/value update failed", DEBUG_ERROR); + testutil_check(cursor->update(cursor)); + debug_print("key/value updated", DEBUG_TRACE); - return (error_code); + tracking->save_operation(tracking_operation::UPDATE, collection_name, key, value, ts); } /* diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h index e5275bc7819..2cf20066504 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/thread_context.h @@ -69,18 +69,6 @@ class thread_context { return (_database.get_collection_names()); } - const keys_iterator_t - get_collection_keys_begin(const std::string &collection_name) const - { - return (_database.get_collection_keys_begin(collection_name)); - } - - const keys_iterator_t - get_collection_keys_end(const std::string &collection_name) const - { - return (_database.get_collection_keys_end(collection_name)); - } - thread_operation get_thread_operation() const { @@ -132,7 +120,7 @@ class thread_context { void begin_transaction(WT_SESSION *session, const std::string &config) { - if (!_in_txn && _timestamp_manager->is_enabled()) { + if (!_in_txn && _timestamp_manager->enabled()) { testutil_check( session->begin_transaction(session, config.empty() ? nullptr : config.c_str())); /* This randomizes the number of operations to be executed in one transaction. 
*/ @@ -154,7 +142,7 @@ class thread_context { bool can_commit_transaction() const { - return (_timestamp_manager->is_enabled() && _in_txn && + return (_timestamp_manager->enabled() && _in_txn && (!_running || (_current_op_count > _max_op_count))); } @@ -180,7 +168,7 @@ class thread_context { void set_commit_timestamp(WT_SESSION *session, wt_timestamp_t ts) { - if (!_timestamp_manager->is_enabled()) + if (!_timestamp_manager->enabled()) return; std::string config = std::string(COMMIT_TS) + "=" + _timestamp_manager->decimal_to_hex(ts); diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h index 4d1b2d755a8..41efadb440b 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_tracking.h @@ -49,7 +49,7 @@ namespace test_harness { /* Tracking operations. */ -enum class tracking_operation { CREATE, DELETE_COLLECTION, DELETE_KEY, INSERT, UPDATE }; +enum class tracking_operation { CREATE_COLLECTION, DELETE_COLLECTION, DELETE_KEY, INSERT, UPDATE }; /* Class used to track operations performed on collections */ class workload_tracking : public component { @@ -107,41 +107,49 @@ class workload_tracking : public component { /* Does not do anything. */ } - template <typename K, typename V> - int - save(const tracking_operation &operation, const std::string &collection_name, const K &key, - const V &value, wt_timestamp_t ts) + void + save_schema_operation( + const tracking_operation &operation, const std::string &collection_name, wt_timestamp_t ts) { - WT_CURSOR *cursor; - int error_code = 0; + std::string error_message; if (!_enabled) - return (error_code); - - /* Select the correct cursor to save in the collection associated to specific operations. 
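/*
 * The thread_context changes above keep the same batching rule: a transaction accumulates
 * operations and becomes committable once a randomly chosen operation budget is exhausted (or the
 * thread is asked to stop). A simplified, self-contained restatement of that rule; the class is
 * illustrative, not the harness type.
 */
#include <cstdint>

class txn_batcher_sketch {
  public:
    explicit txn_batcher_sketch(int64_t max_op_count) : _max_op_count(max_op_count) {}

    void
    begin()
    {
        _in_txn = true;
        _current_op_count = 0;
    }

    void
    record_op()
    {
        ++_current_op_count;
    }

    /* Mirrors can_commit_transaction(): in a transaction, and either stopping or over budget. */
    bool
    can_commit(bool running) const
    {
        return (_in_txn && (!running || _current_op_count > _max_op_count));
    }

    void
    commit()
    {
        _in_txn = false;
    }

  private:
    int64_t _current_op_count = 0;
    int64_t _max_op_count;
    bool _in_txn = false;
};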
*/ - switch (operation) { - case tracking_operation::CREATE: - case tracking_operation::DELETE_COLLECTION: - cursor = _cursor_schema; - cursor->set_key(cursor, collection_name.c_str(), ts); - cursor->set_value(cursor, static_cast<int>(operation)); - break; - - default: - cursor = _cursor_operations; - cursor->set_key(cursor, collection_name.c_str(), key, ts); - cursor->set_value(cursor, static_cast<int>(operation), value); - break; + return; + + if (operation == tracking_operation::CREATE_COLLECTION || + operation == tracking_operation::DELETE_COLLECTION) { + _cursor_schema->set_key(_cursor_schema, collection_name.c_str(), ts); + _cursor_schema->set_value(_cursor_schema, static_cast<int>(operation)); + testutil_check(_cursor_schema->insert(_cursor_schema)); + } else { + error_message = "save_schema_operation: invalid operation " + + std::to_string(static_cast<int>(operation)); + testutil_die(EINVAL, error_message.c_str()); } + debug_print("save_schema_operation: workload tracking saved operation.", DEBUG_TRACE); + } - error_code = cursor->insert(cursor); + template <typename K, typename V> + void + save_operation(const tracking_operation &operation, const std::string &collection_name, + const K &key, const V &value, wt_timestamp_t ts) + { + std::string error_message; - if (error_code == 0) - debug_print("Workload tracking saved operation.", DEBUG_TRACE); - else - debug_print("Workload tracking failed to save operation !", DEBUG_ERROR); + if (!_enabled) + return; - return error_code; + if (operation == tracking_operation::CREATE_COLLECTION || + operation == tracking_operation::DELETE_COLLECTION) { + error_message = + "save_operation: invalid operation " + std::to_string(static_cast<int>(operation)); + testutil_die(EINVAL, error_message.c_str()); + } else { + _cursor_operations->set_key(_cursor_operations, collection_name.c_str(), key, ts); + _cursor_operations->set_value(_cursor_operations, static_cast<int>(operation), value); + testutil_check(_cursor_operations->insert(_cursor_operations)); + } + debug_print("save_operation: workload tracking saved operation.", DEBUG_TRACE); } private: diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h index 5ef7992e773..aaab9ad25a9 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload/workload_validation.h @@ -45,95 +45,113 @@ namespace test_harness { class workload_validation { public: /* - * Validate the on disk data against what has been tracked during the test. - * - The first step is to replay the tracked operations so a representation in memory of the - * collections is created. This representation is then compared to what is on disk. - * - The second step is to go through what has been saved on disk and make sure the memory - * representation has the same data. - * operation_table_name is the collection that contains all the operations about the key/value - * pairs in the different collections used during the test. schema_table_name is the collection - * that contains all the operations about the creation or deletion of collections during the - * test. + * Validate the on disk data against what has been tracked during the test. This is done by + * replaying the tracked operations so a representation in memory of the collections is created. + * This representation is then compared to what is on disk. 
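/*
 * A sketch of the tracking layout implied by the split above: schema events are keyed by
 * (collection, timestamp) in the schema table, data events by (collection, key, timestamp) in the
 * operations table, with the operation type (and, for data events, the value) stored in the row.
 * The cursor parameters stand in for _cursor_schema and _cursor_operations; error handling is
 * omitted here.
 */
#include <cstdint>
#include <string>
#include <wiredtiger.h>

static void
track_schema_event_sketch(WT_CURSOR *schema_cursor, const std::string &collection, uint64_t ts,
  int op)
{
    schema_cursor->set_key(schema_cursor, collection.c_str(), ts);
    schema_cursor->set_value(schema_cursor, op);
    (void)schema_cursor->insert(schema_cursor);
}

static void
track_data_event_sketch(WT_CURSOR *op_cursor, const std::string &collection, const char *key,
  uint64_t ts, int op, const char *value)
{
    op_cursor->set_key(op_cursor, collection.c_str(), key, ts);
    op_cursor->set_value(op_cursor, op, value);
    (void)op_cursor->insert(op_cursor);
}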
operation_table_name: collection + * that contains all the operations about the key/value pairs in the different collections used + * during the test. schema_table_name: collection that contains all the operations about the + * creation or deletion of collections during the test. */ - bool + void validate(const std::string &operation_table_name, const std::string &schema_table_name, database &database) { + WT_DECL_RET; + WT_CURSOR *cursor; WT_SESSION *session; - std::string collection_name; - /* Existing collections after the test. */ + wt_timestamp_t key_timestamp; std::vector<std::string> created_collections, deleted_collections; - bool is_valid = true; + const char *key, *key_collection_name, *value; + int value_operation_type; + std::string collection_name; session = connection_manager::instance().create_session(); /* Retrieve the collections that were created and deleted during the test. */ - collection_name = schema_table_name; parse_schema_tracking_table( - session, collection_name, created_collections, deleted_collections); - - /* Make sure they exist in memory. */ - for (auto const &it : created_collections) { - if (database.collections.count(it) == 0) { - debug_print("Collection missing in memory: " + it, DEBUG_ERROR); - is_valid = false; - break; - } - } - - if (!is_valid) - return (is_valid); + session, schema_table_name, created_collections, deleted_collections); - /* Make sure they don't exist in memory nor on disk. */ + /* + * Make sure the deleted collections do not exist on disk. The created collections are + * checked in check_reference. + */ for (auto const &it : deleted_collections) { - if (database.collections.count(it) > 0) { - debug_print( - "Collection present in memory while it has been tracked as deleted: " + it, - DEBUG_ERROR); - is_valid = false; - break; - } - if (!verify_collection_state(session, it, false)) { - debug_print( - "Collection present on disk while it has been tracked as deleted: " + it, - DEBUG_ERROR); - is_valid = false; - break; - } + if (!verify_collection_state(session, it, false)) + testutil_die(DEBUG_ERROR, + "validate: collection %s present on disk while it has been tracked as deleted.", + it.c_str()); } - for (auto const &collection_name : database.get_collection_names()) { - if (!is_valid) - break; - - /* Get the values associated to the different keys in the current collection. */ - parse_operation_tracking_table( - session, operation_table_name, collection_name, database); - /* Check all tracked operations in memory against the database on disk. */ - if (!check_reference(session, collection_name, database)) { - debug_print( - "check_reference failed for collection " + collection_name, DEBUG_ERROR); - is_valid = false; - } - /* Check what has been saved on disk against what has been tracked. */ - else if (!check_disk_state(session, collection_name, database)) { - debug_print( - "check_disk_state failed for collection " + collection_name, DEBUG_ERROR); - is_valid = false; + /* Parse the tracking table. 
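/*
 * The deleted-collection check above boils down to "can a cursor be opened on this URI?". A compact
 * restatement of that test, close to the harness's verify_collection_state(); only the return code
 * is inspected here.
 */
#include <string>
#include <wiredtiger.h>

static bool
collection_state_matches_sketch(WT_SESSION *session, const std::string &uri, bool expect_exists)
{
    WT_CURSOR *cursor = nullptr;
    int ret = session->open_cursor(session, uri.c_str(), nullptr, nullptr, &cursor);

    if (ret == 0)
        (void)cursor->close(cursor);
    return (expect_exists ? ret == 0 : ret != 0);
}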
*/ + testutil_check( + session->open_cursor(session, operation_table_name.c_str(), NULL, NULL, &cursor)); + while ((ret = cursor->next(cursor)) == 0) { + testutil_check(cursor->get_key(cursor, &key_collection_name, &key, &key_timestamp)); + testutil_check(cursor->get_value(cursor, &value_operation_type, &value)); + + debug_print("Collection name is " + std::string(key_collection_name), DEBUG_TRACE); + debug_print("Key is " + std::string(key), DEBUG_TRACE); + debug_print("Timestamp is " + std::to_string(key_timestamp), DEBUG_TRACE); + debug_print("Operation type is " + std::to_string(value_operation_type), DEBUG_TRACE); + debug_print("Value is " + std::string(value), DEBUG_TRACE); + + /* + * If the cursor points to values from a collection that has been created during the + * test, update the data model. + */ + if (std::find(created_collections.begin(), created_collections.end(), + key_collection_name) != created_collections.end()) + update_data_model(static_cast<tracking_operation>(value_operation_type), + key_collection_name, key, value, database); + /* + * The collection should be part of the deleted collections if it has not be found in + * the created ones. + */ + else if (std::find(deleted_collections.begin(), deleted_collections.end(), + key_collection_name) == deleted_collections.end()) + testutil_die(DEBUG_ERROR, + "validate: The collection %s is not part of the created or deleted collections.", + key_collection_name); + + if (collection_name.empty()) + collection_name = key_collection_name; + else if (collection_name != key_collection_name) { + /* + * The data model is now fully updated for the last read collection. It can be + * checked. + */ + check_reference(session, collection_name, database.collections.at(collection_name)); + /* Clear memory. */ + delete database.collections[collection_name].values; + database.collections[collection_name].values = nullptr; + + collection_name = key_collection_name; } + }; + + /* The value of ret should be WT_NOTFOUND once the cursor has read all rows. */ + if (ret != WT_NOTFOUND) + testutil_die(DEBUG_ERROR, "validate: cursor->next() %d.", ret); + + /* + * Once the cursor has read the entire table, the last parsed collection has not been + * checked yet. We still have to make sure collection_name has been updated. It will remain + * empty if there is no collections to check after the end of the test (no collections + * created or all deleted). + */ + if (!collection_name.empty()) { + check_reference(session, collection_name, database.collections.at(collection_name)); /* Clear memory. */ delete database.collections[collection_name].values; database.collections[collection_name].values = nullptr; } - - return (is_valid); } private: /* * Read the tracking table to retrieve the created and deleted collections during the test. - * collection_name is the collection that contains the operations on the different collections - * during the test. + * collection_name: collection that contains the operations on the different collections during + * the test. 
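/*
 * The replay loop above depends on the operations table being ordered by collection: rows are
 * applied in order, and whenever the collection name changes, the in-memory model built so far is
 * checked and released before the next collection starts. A minimal restatement of that grouping
 * pattern over plain records instead of a WT_CURSOR.
 */
#include <functional>
#include <string>
#include <vector>

struct tracked_row_sketch {
    std::string collection;
    std::string key;
    std::string value;
};

static void
replay_grouped_sketch(const std::vector<tracked_row_sketch> &rows,
  const std::function<void(const std::string &)> &check_collection)
{
    std::string current;

    for (const auto &row : rows) {
        if (current.empty())
            current = row.collection;
        else if (current != row.collection) {
            /* The previous collection is fully replayed; check it now. */
            check_collection(current);
            current = row.collection;
        }
        /* ... apply the row to the in-memory model here ... */
    }
    /* The last collection read has not been checked inside the loop. */
    if (!current.empty())
        check_collection(current);
}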
*/ void parse_schema_tracking_table(WT_SESSION *session, const std::string &collection_name, @@ -155,7 +173,7 @@ class workload_validation { debug_print("Operation type is " + std::to_string(value_operation_type), DEBUG_TRACE); if (static_cast<tracking_operation>(value_operation_type) == - tracking_operation::CREATE) { + tracking_operation::CREATE_COLLECTION) { deleted_collections.erase(std::remove(deleted_collections.begin(), deleted_collections.end(), key_collection_name), deleted_collections.end()); @@ -170,211 +188,95 @@ class workload_validation { } } - /* - * Parse the tracked operations to build a representation in memory of the collections at the - * end of the test. tracking_collection_name is the tracking collection used to save the - * operations performed on the collections during the test. collection_name is the collection - * that needs to be represented in memory. - */ + /* Update the data model. */ void - parse_operation_tracking_table(WT_SESSION *session, const std::string &tracking_collection_name, - const std::string &collection_name, database &database) + update_data_model(const tracking_operation &operation, const std::string &collection_name, + const char *key, const char *value, database &database) { - WT_CURSOR *cursor; - wt_timestamp_t key_timestamp; - int exact, value_operation_type; - const char *key, *key_collection_name, *value; - std::vector<key_value_t> collection_keys; - std::string key_str; - - /* Retrieve all keys from the given collection. */ - for (auto const &it : database.collections.at(collection_name).keys) - collection_keys.push_back(it.first); - /* There must be at least a key. */ - testutil_assert(!collection_keys.empty()); - /* Sort keys. */ - std::sort(collection_keys.begin(), collection_keys.end()); - /* Use the first key as a parameter for search_near. */ - key_str = collection_keys[0]; - - testutil_check( - session->open_cursor(session, tracking_collection_name.c_str(), NULL, NULL, &cursor)); - - cursor->set_key(cursor, collection_name.c_str(), key_str.c_str()); - testutil_check(cursor->search_near(cursor, &exact)); - /* - * Since the timestamp which is part of the key is not provided, exact cannot be 0. If it is - * -1, we need to go to the next key. - */ - testutil_assert(exact != 0); - if (exact < 0) - testutil_check(cursor->next(cursor)); - - do { - testutil_check(cursor->get_key(cursor, &key_collection_name, &key, &key_timestamp)); - testutil_check(cursor->get_value(cursor, &value_operation_type, &value)); - - debug_print("Collection name is " + std::string(key_collection_name), DEBUG_TRACE); - debug_print("Key is " + std::string(key), DEBUG_TRACE); - debug_print("Timestamp is " + std::to_string(key_timestamp), DEBUG_TRACE); - debug_print("Operation type is " + std::to_string(value_operation_type), DEBUG_TRACE); - debug_print("Value is " + std::string(value), DEBUG_TRACE); - + switch (operation) { + case tracking_operation::DELETE_KEY: /* - * If the cursor is reading an operation for a different collection, we know all the - * operations have been parsed for the collection we were interested in. + * Operations are parsed from the oldest to the most recent one. It is safe to assume + * the key has been inserted previously in an existing collection and can be safely + * deleted. */ - if (std::string(key_collection_name) != collection_name) - break; - - /* Replay the current operation. 
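/*
 * A reduced sketch of the bookkeeping parse_schema_tracking_table() performs while replaying schema
 * events: a create moves a collection name into the created list (and out of the deleted one), a
 * delete does the opposite. The harness keeps vectors and uses std::remove; sets are used here only
 * to keep the example short.
 */
#include <set>
#include <string>

enum class schema_op_sketch { create_collection, delete_collection };

static void
apply_schema_event_sketch(schema_op_sketch op, const std::string &name,
  std::set<std::string> &created, std::set<std::string> &deleted)
{
    if (op == schema_op_sketch::create_collection) {
        deleted.erase(name);
        created.insert(name);
    } else {
        created.erase(name);
        deleted.insert(name);
    }
}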
*/ - switch (static_cast<tracking_operation>(value_operation_type)) { - case tracking_operation::DELETE_KEY: - /* - * Operations are parsed from the oldest to the most recent one. It is safe to - * assume the key has been inserted previously in an existing collection and can be - * safely deleted. - */ - database.collections.at(key_collection_name).keys.at(std::string(key)).exists = - false; - delete database.collections.at(key_collection_name).values; - database.collections.at(key_collection_name).values = nullptr; - break; - case tracking_operation::INSERT: { - /* Keys are unique, it is safe to assume the key has not been encountered before. */ - database.collections[key_collection_name].keys[std::string(key)].exists = true; - if (database.collections[key_collection_name].values == nullptr) { - database.collections[key_collection_name].values = - new std::map<key_value_t, value_t>(); - } - value_t v; - v.value = key_value_t(value); - std::pair<key_value_t, value_t> pair(key_value_t(key), v); - database.collections[key_collection_name].values->insert(pair); - break; - } - case tracking_operation::UPDATE: - database.collections[key_collection_name].values->at(key).value = - key_value_t(value); - break; - default: - testutil_die(DEBUG_ABORT, "Unexpected operation in the tracking table: %d", - value_operation_type); - break; - } - - } while (cursor->next(cursor) == 0); - - if (cursor->reset(cursor) != 0) - debug_print("Cursor could not be reset !", DEBUG_ERROR); + database.collections.at(collection_name).keys.at(key).exists = false; + delete database.collections.at(collection_name).values; + database.collections.at(collection_name).values = nullptr; + break; + case tracking_operation::INSERT: { + /* + * Keys are unique, it is safe to assume the key has not been encountered before. + */ + database.collections[collection_name].keys[key].exists = true; + if (database.collections[collection_name].values == nullptr) + database.collections[collection_name].values = new std::map<key_value_t, value_t>(); + value_t v; + v.value = key_value_t(value); + std::pair<key_value_t, value_t> pair(key_value_t(key), v); + database.collections[collection_name].values->insert(pair); + break; + } + case tracking_operation::UPDATE: + database.collections[collection_name].values->at(key).value = key_value_t(value); + break; + default: + testutil_die(DEBUG_ERROR, "Unexpected operation in the tracking table: %d", + static_cast<tracking_operation>(operation)); + break; + } } /* - * Compare the tracked operations against what has been saved on disk. database is the - * representation in memory of the collections after the test according to the tracking table. + * Compare the tracked operations against what has been saved on disk. collection: + * representation in memory of the collection values and keys according to the tracking table. */ - bool + void check_reference( - WT_SESSION *session, const std::string &collection_name, const database &database) + WT_SESSION *session, const std::string &collection_name, const collection_t &collection) { bool is_valid; - collection_t collection; key_t key; key_value_t key_str; /* Check the collection exists on disk. */ - is_valid = verify_collection_state(session, collection_name, true); - - if (is_valid) { - collection = database.collections.at(collection_name); - /* Walk through each key/value pair of the current collection. */ - for (const auto &keys : collection.keys) { - key_str = keys.first; - key = keys.second; - /* The key/value pair exists. 
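/*
 * A self-contained sketch of the replay performed by update_data_model() above: inserts add a key
 * and its value, updates overwrite the value, and key deletions mark the key as absent. The struct
 * is a simplified stand-in for the harness data model, which manages its values map separately.
 */
#include <map>
#include <string>

struct model_entry_sketch {
    std::string value;
    bool exists = false;
};

enum class data_op_sketch { insert, update, delete_key };

static void
apply_data_event_sketch(std::map<std::string, model_entry_sketch> &collection, data_op_sketch op,
  const std::string &key, const std::string &value)
{
    switch (op) {
    case data_op_sketch::insert:
    case data_op_sketch::update:
        collection[key].value = value;
        collection[key].exists = true;
        break;
    case data_op_sketch::delete_key:
        collection[key].exists = false;
        break;
    }
}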
*/ - if (key.exists) - is_valid = (is_key_present(session, collection_name, key_str.c_str()) == true); - /* The key has been deleted. */ - else - is_valid = (is_key_present(session, collection_name, key_str.c_str()) == false); - - /* Check the associated value is valid. */ - if (is_valid && key.exists) { - testutil_assert(collection.values != nullptr); - is_valid = verify_value(session, collection_name, key_str.c_str(), - collection.values->at(key_str).value); - } - - if (!is_valid) { - debug_print("check_reference failed for key " + key_str, DEBUG_ERROR); - break; - } - } - } - - if (!is_valid) - debug_print("check_reference failed for collection " + collection_name, DEBUG_ERROR); - - return (is_valid); - } - - /* Check what is present on disk against what has been tracked. */ - bool - check_disk_state( - WT_SESSION *session, const std::string &collection_name, const database &database) - { - WT_CURSOR *cursor; - collection_t collection; - bool is_valid = true; - /* Key/value pairs on disk. */ - const char *key_on_disk, *value_on_disk; - key_value_t key_str, value_str; - - testutil_check(session->open_cursor(session, collection_name.c_str(), NULL, NULL, &cursor)); - - collection = database.collections.at(collection_name); - - /* Read the collection on disk. */ - while (is_valid && (cursor->next(cursor) == 0)) { - testutil_check(cursor->get_key(cursor, &key_on_disk)); - testutil_check(cursor->get_value(cursor, &value_on_disk)); + if (!verify_collection_state(session, collection_name, true)) + testutil_die(DEBUG_ERROR, + "check_reference: collection %s not present on disk while it has been tracked as " + "created.", + collection_name.c_str()); + + /* Walk through each key/value pair of the current collection. */ + for (const auto &keys : collection.keys) { + key_str = keys.first; + key = keys.second; + /* The key/value pair exists. */ + if (key.exists) + is_valid = (is_key_present(session, collection_name, key_str.c_str()) == true); + /* The key has been deleted. */ + else + is_valid = (is_key_present(session, collection_name, key_str.c_str()) == false); - key_str = std::string(key_on_disk); - - debug_print("Key on disk is " + key_str, DEBUG_TRACE); - debug_print("Value on disk is " + std::string(value_on_disk), DEBUG_TRACE); + if (!is_valid) + testutil_die(DEBUG_ERROR, "check_reference: failed for key %s in collection %s.", + key_str.c_str(), collection_name.c_str()); - /* Check the key on disk has been saved in memory too. */ - if ((collection.keys.count(key_str) > 0) && collection.keys.at(key_str).exists) { - /* Memory should be allocated for values. */ + /* Check the associated value is valid. */ + if (key.exists) { testutil_assert(collection.values != nullptr); - value_str = collection.values->at(key_str).value; - /* - * Check the key/value pair on disk matches the one in memory from the tracked - * operations. 
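/*
 * The per-key check in check_reference() relies on a point lookup. The body of is_key_present() is
 * not shown in this hunk; a plausible minimal version, assuming a cursor positioned with set_key()
 * and a search() whose WT_NOTFOUND result means "absent":
 */
#include <wiredtiger.h>

static bool
key_present_sketch(WT_CURSOR *cursor, const char *key)
{
    /* Hypothetical lookup; the harness's is_key_present() may differ in detail. */
    cursor->set_key(cursor, key);
    int ret = cursor->search(cursor);

    (void)cursor->reset(cursor);
    return (ret == 0);
}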
- */ - is_valid = (value_str == key_value_t(value_on_disk)); - if (!is_valid) - debug_print(" Key/Value pair mismatch.\n Disk key: " + key_str + - "\n Disk value: " + std ::string(value_on_disk) + - "\n Tracking table key: " + key_str + "\n Tracking table value exists: " + - std::to_string(collection.keys.at(key_str).exists) + - "\n Tracking table value: " + value_str, - DEBUG_ERROR); - } else { - is_valid = false; - debug_print( - "The key " + std::string(key_on_disk) + " present on disk has not been tracked", - DEBUG_ERROR); + if (!verify_value(session, collection_name, key_str.c_str(), + collection.values->at(key_str).value)) + testutil_die(DEBUG_ERROR, + "check_reference: failed for key %s / value %s in collection %s.", + key_str.c_str(), collection.values->at(key_str).value.c_str(), + collection_name.c_str()); } } - - return (is_valid); } /* - * Check whether a collection exists on disk. collection_name is the collection to check. exists - * needs to be set to true if the collection is expected to be existing, false otherwise. + * Check whether a collection exists on disk. exists: needs to be set to true if the collection + * is expected to be existing, false otherwise. */ bool verify_collection_state( @@ -385,6 +287,7 @@ class workload_validation { return (exists ? (ret == 0) : (ret != 0)); } + /* Check whether a keys exists in a collection on disk. */ template <typename K> bool is_key_present(WT_SESSION *session, const std::string &collection_name, const K &key) diff --git a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h index 9413834ba31..5e084229123 100644 --- a/src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h +++ b/src/third_party/wiredtiger/test/cppsuite/test_harness/workload_generator.h @@ -72,6 +72,7 @@ class workload_generator : public component { /* Populate the database. */ _database_operation->populate(_database, _timestamp_manager, _config, _tracking); + _db_populated = true; /* Retrieve useful parameters from the test configuration. */ transaction_config = _config->get_subconfig(OPS_PER_TRANSACTION); @@ -87,7 +88,7 @@ class workload_generator : public component { testutil_assert(value_size >= 0); /* Generate threads to execute read operations on the collections. */ - for (int i = 0; i < read_threads; ++i) { + for (size_t i = 0; i < read_threads && _running; ++i) { thread_context *tc = new thread_context(_timestamp_manager, _tracking, _database, thread_operation::READ, max_operation_per_transaction, min_operation_per_transaction, value_size, throttle()); @@ -96,7 +97,7 @@ class workload_generator : public component { } /* Generate threads to execute update operations on the collections. */ - for (int i = 0; i < update_threads; ++i) { + for (size_t i = 0; i < update_threads && _running; ++i) { thread_context *tc = new thread_context(_timestamp_manager, _tracking, _database, thread_operation::UPDATE, max_operation_per_transaction, min_operation_per_transaction, value_size, throttle(update_config)); @@ -123,7 +124,13 @@ class workload_generator : public component { database & get_database() { - return _database; + return (_database); + } + + bool + db_populated() const + { + return (_db_populated); } /* Workload threaded operations. 
*/ @@ -148,7 +155,7 @@ class workload_generator : public component { db_operation.update_operation(context, session); break; default: - testutil_die(DEBUG_ABORT, "system: thread_operation is unknown : %d", + testutil_die(DEBUG_ERROR, "system: thread_operation is unknown : %d", static_cast<int>(context.get_thread_operation())); break; } @@ -161,6 +168,7 @@ class workload_generator : public component { timestamp_manager *_timestamp_manager; workload_tracking *_tracking; std::vector<thread_context *> _workers; + bool _db_populated = false; }; } // namespace test_harness diff --git a/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx b/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx index 5fe6641cc3b..67d77116cf1 100755 --- a/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx +++ b/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx @@ -140,7 +140,7 @@ main(int argc, char *argv[]) * -l : Trace level. * -t : Test to run. All tests are run if not specified. */ - for (int i = 1; (i < argc) && (error_code == 0); ++i) { + for (size_t i = 1; (i < argc) && (error_code == 0); ++i) { if (std::string(argv[i]) == "-h") { print_help(); return 0; diff --git a/src/third_party/wiredtiger/test/csuite/incr_backup/main.c b/src/third_party/wiredtiger/test/csuite/incr_backup/main.c index 9b535d7bc54..b09e1b44da4 100644 --- a/src/third_party/wiredtiger/test/csuite/incr_backup/main.c +++ b/src/third_party/wiredtiger/test/csuite/incr_backup/main.c @@ -74,8 +74,8 @@ static bool do_rename = true; } while (0) /* - * We keep an array of tables, each one may or may not be in use. - * "In use" means it has been created, and will be updated from time to time. + * We keep an array of tables, each one may or may not be in use. "In use" means it has been + * created, and will be updated from time to time. */ typedef struct { char *name; /* non-null entries represent tables in use */ @@ -189,8 +189,7 @@ key_value(uint64_t change_count, char *key, size_t key_size, WT_ITEM *item, OPER * is inserted, it is all the letter 'a'. When the value is updated, is it mostly 'b', with some * 'c' mixed in. When the value is to modified, we'll end up with a value with mostly 'b' and * 'M' mixed in, in different spots. Thus the modify operation will have both additions ('M') - * and - * subtractions ('c') from the previous version. + * and subtractions ('c') from the previous version. */ if (op_type == INSERT) ch = 'a'; diff --git a/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c b/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c index 5434aa191ef..27d1a58ccbd 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c @@ -183,9 +183,8 @@ main(int argc, char *argv[]) /* * A linear pass through the list, adding random elements. * - * WiredTiger configurations are usually "the last one set wins", but - * "shared_cache" and "cache_set" options aren't allowed in the same - * configuration string. + * WiredTiger configurations are usually "the last one set wins", but "shared_cache" and + * "cache_set" options aren't allowed in the same configuration string. 
*/ for (i = 0; i < WT_ELEMENTS(list); ++i) { p = list[i]; diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index b4ab0507399..6e8c3f01d11 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -2061,10 +2061,6 @@ tasks: - func: "format test" vars: extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row compression=zlib huffman_value=1 - # FIXME-WT-6668: temporarily disable lower isolation level test - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row isolation=random transaction_timestamps=0 - func: "format test" vars: extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row data_source=lsm bloom=1 diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c index 5af036d1495..c1decf24b6f 100644 --- a/src/third_party/wiredtiger/test/format/bulk.c +++ b/src/third_party/wiredtiger/test/format/bulk.c @@ -38,7 +38,8 @@ bulk_begin_transaction(WT_SESSION *session) uint64_t ts; char buf[64]; - wiredtiger_begin_transaction(session, "isolation=snapshot"); + /* Writes require snapshot isolation. */ + wiredtiger_begin_transaction(session, NULL); ts = __wt_atomic_addv64(&g.timestamp, 1); testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, ts)); testutil_check(session->timestamp_transaction(session, buf)); @@ -113,7 +114,6 @@ wts_load(void) bulk_begin_transaction(session); for (committed_keyno = keyno = 0; ++keyno <= g.c_rows;) { - key_gen(&key, keyno); val_gen(NULL, &value, keyno); switch (g.type) { @@ -132,6 +132,7 @@ wts_load(void) trace_msg("bulk %" PRIu32 " {%.*s}", keyno, (int)value.size, (char *)value.data); break; case ROW: + key_gen(&key, keyno); cursor->set_key(cursor, &key); cursor->set_value(cursor, &value); if (g.trace_all) @@ -188,22 +189,22 @@ wts_load(void) } } + if (g.c_txn_timestamps) + bulk_commit_transaction(session); + /* * Ideally, the insert loop runs until the number of rows plus one, in which case row counts are * correct. If the loop exited early, reset the counters and rewrite the CONFIG file (so reopens * aren't surprised). */ if (keyno != g.c_rows + 1) { - testutil_assert(committed_keyno > 0); + g.c_rows = g.c_txn_timestamps ? 
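/*
 * A sketch of what the bulk-load transaction setup above amounts to at the WT_SESSION level: no
 * per-transaction isolation override is passed any more (the new comment notes that writes require
 * snapshot isolation), and the read timestamp is supplied afterwards as a hexadecimal string
 * through timestamp_transaction(). The format test itself goes through its own
 * wiredtiger_begin_transaction() wrapper, so this is only an approximation.
 */
#include <cinttypes>
#include <cstdio>
#include <wiredtiger.h>

static int
bulk_begin_sketch(WT_SESSION *session, uint64_t read_ts)
{
    char buf[64];
    int ret;

    if ((ret = session->begin_transaction(session, nullptr)) != 0)
        return (ret);
    std::snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, read_ts);
    return (session->timestamp_transaction(session, buf));
}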
committed_keyno : (keyno - 1); + testutil_assert(g.c_rows > 0); + g.rows = g.c_rows; - g.rows = committed_keyno; - g.c_rows = (uint32_t)committed_keyno; config_print(false); } - if (g.c_txn_timestamps) - bulk_commit_transaction(session); - testutil_check(cursor->close(cursor)); trace_msg("%s", "=============== bulk load stop"); diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index 20431b3f1ab..f7321f77c99 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -51,8 +51,8 @@ static void config_map_checksum(const char *, u_int *); static void config_map_compression(const char *, u_int *); static void config_map_encryption(const char *, u_int *); static void config_map_file_type(const char *, u_int *); -static void config_map_isolation(const char *, u_int *); static void config_pct(void); +static void config_prefix(void); static void config_reset(void); static void config_transaction(void); @@ -202,6 +202,7 @@ config_run(void) config_compression("btree.compression"); config_compression("logging.compression"); config_encryption(); + config_prefix(); /* Configuration based on the configuration already chosen. */ config_directio(); @@ -876,23 +877,6 @@ config_pct(void) } /* - * Cursor modify isn't possible for anything besides snapshot isolation transactions. If both - * forced, it's an error. The run-time operations code converts modify operations into updates - * if we're in some other transaction type, but if we're never going to be able to do a modify, - * turn it off in the CONFIG output to avoid misleading debuggers. - */ - if (g.c_isolation_flag == ISOLATION_READ_COMMITTED || - g.c_isolation_flag == ISOLATION_READ_UNCOMMITTED) { - if (config_is_perm("transaction.isolation") && config_is_perm("ops.pct.modify") && - g.c_modify_pct != 0) - testutil_die( - EINVAL, "WT_CURSOR.modify only supported with snapshot isolation transactions"); - - list[CONFIG_MODIFY_ENTRY].order = 0; - *list[CONFIG_MODIFY_ENTRY].vp = 0; - } - - /* * Walk the list, allocating random numbers of operations in a random order. * * If the "order" field is non-zero, we need to create a value for this operation. Find the @@ -924,116 +908,70 @@ config_pct(void) } /* + * config_prefix -- + * Prefix configuration. + */ +static void +config_prefix(void) +{ + /* Add prefix compression if prefixes are configured and no explicit choice was made. */ + if (g.c_prefix != 0 && g.c_prefix_compression == 0 && + !config_is_perm("btree.prefix_compression")) + config_single("btree.prefix_compression=on", false); +} + +/* * config_transaction -- * Transaction configuration. */ static void config_transaction(void) { - /* - * WiredTiger cannot support relaxed isolation levels. Turn off everything but timestamps with - * snapshot isolation. - */ - if ((!g.c_txn_timestamps && config_is_perm("transaction.timestamps")) || - (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation"))) - testutil_die(EINVAL, "format limited to timestamp and snapshot-isolation testing"); - if (!g.c_txn_timestamps) - config_single("transaction.timestamps=on", false); - if (g.c_isolation_flag != ISOLATION_SNAPSHOT) - config_single("transaction.isolation=snapshot", false); - - /* - * Check the permanent configuration. We can't prepare a transaction if logging is configured or - * timestamps aren't configured. 
For repeatable reads to work in timestamp testing, all updates - * must be done in a snapshot isolation transaction. - */ + /* Transaction prepare requires timestamps and is incompatible with logging. */ if (g.c_prepare && config_is_perm("ops.prepare")) { if (g.c_logging && config_is_perm("logging")) testutil_die(EINVAL, "prepare is incompatible with logging"); if (!g.c_txn_timestamps && config_is_perm("transaction.timestamps")) testutil_die(EINVAL, "prepare requires transaction timestamps"); - if (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")) - testutil_die(EINVAL, "prepare requires snapshot isolation"); - if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency")) - testutil_die(EINVAL, "prepare requires transaction frequency set to 100"); } + + /* Transaction timestamps are incompatible with implicit transactions. */ if (g.c_txn_timestamps && config_is_perm("transaction.timestamps")) { - if (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")) - testutil_die(EINVAL, "timestamps require snapshot isolation"); - if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency")) - testutil_die(EINVAL, "timestamps require transaction frequency set to 100"); - } - if (g.c_logging && config_is_perm("logging") && g.c_prepare) - config_single("ops.prepare=off", false); + if (g.c_txn_implicit && config_is_perm("transaction.implicit")) + testutil_die( + EINVAL, "transaction.timestamps is incompatible with implicit transactions"); - /* FIXME-WT-6431: temporarily disable salvage with timestamps. */ - if (g.c_txn_timestamps && g.c_salvage) { - if (config_is_perm("ops.salvage")) - testutil_die(EINVAL, "salvage cannot run with timestamps"); - config_single("ops.salvage=off", false); + /* FIXME-WT-6431: temporarily disable salvage with timestamps. */ + if (g.c_salvage && config_is_perm("ops.salvage")) + testutil_die(EINVAL, "transaction.timestamps is incompatible with salvage"); } - if (g.c_isolation_flag == ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")) { - if (!g.c_txn_timestamps && config_is_perm("transaction.timestamps")) - testutil_die(EINVAL, "snapshot isolation requires timestamps"); - if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency")) - testutil_die(EINVAL, "snapshot isolation requires transaction frequency set to 100"); - } - if (g.c_txn_rollback_to_stable && config_is_perm("transaction.rollback_to_stable") && - g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")) - testutil_die(EINVAL, "rollback to stable requires snapshot isolation"); /* - * The permanent configuration has no incompatible settings, adjust the temporary configuration - * as necessary. Prepare overrides timestamps, overrides isolation, for no reason other than - * prepare is the least configured and timestamps are the option we want to test the most. + * Incompatible permanent configurations have been checked, now turn off any incompatible flags. + * The choices are inclined to prepare (it's only rarely configured), then timestamps. Note any + * of the options may still be set as required for the run, so we still have to check if that's + * the case until we run out of combinations (for example, prepare turns off logging, so by the + * time we check logging, logging must have been required by the run if both logging and prepare + * are still set, so we can just turn off prepare in that case). 
*/ if (g.c_prepare) { - if (g.c_logging) + if (!config_is_perm("logging")) config_single("logging=off", false); - if (!g.c_txn_timestamps) - config_single("transaction.timestamps=on", false); - if (g.c_isolation_flag != ISOLATION_SNAPSHOT) - config_single("transaction.isolation=snapshot", false); - if (g.c_txn_freq != 100) - config_single("transaction.frequency=100", false); - } - if (g.c_txn_rollback_to_stable) { - if (!g.c_txn_timestamps) + if (!config_is_perm("transaction.timestamps")) config_single("transaction.timestamps=on", false); } if (g.c_txn_timestamps) { - if (g.c_isolation_flag != ISOLATION_SNAPSHOT) - config_single("transaction.isolation=snapshot", false); - if (g.c_txn_freq != 100) - config_single("transaction.frequency=100", false); - } - if (g.c_isolation_flag == ISOLATION_NOT_SET) { - switch (mmrand(NULL, 1, 20)) { - case 1: /* 5% */ - config_single("transaction.isolation=random", false); - break; - case 2: /* 5% */ - config_single("transaction.isolation=read-uncommitted", false); - break; - case 3: /* 5% */ - config_single("transaction.isolation=read-committed", false); - break; - default: /* 85% */ - config_single("transaction.isolation=snapshot", false); - break; - } - if (g.c_isolation_flag == ISOLATION_SNAPSHOT) { - if (!g.c_txn_timestamps) - config_single("transaction.timestamps=on", false); - if (g.c_txn_freq != 100) - config_single("transaction.frequency=100", false); - } else { - if (g.c_prepare) - config_single("ops.prepare=off", false); - if (g.c_txn_timestamps) - config_single("transaction.timestamps=off", false); - } + if (!config_is_perm("transaction.implicit")) + config_single("transaction.implicit=0", false); + if (!config_is_perm("ops.salvage")) + config_single("ops.salvage=off", false); } + if (g.c_logging) + config_single("ops.prepare=off", false); + if (g.c_txn_implicit) + config_single("transaction.timestamps=off", false); + if (g.c_salvage) + config_single("transaction.timestamps=off", false); } /* @@ -1175,9 +1113,6 @@ config_reset(void) { CONFIG *cp; - if (!config_is_perm("transaction.isolation")) - g.c_isolation_flag = ISOLATION_NOT_SET; - /* Clear temporary allocated configuration data. */ for (cp = c; cp->name != NULL; ++cp) { F_CLR(cp, C_TEMP); @@ -1289,9 +1224,6 @@ config_single(const char *s, bool perm) } else if (strncmp(s, "runs.type", strlen("runs.type")) == 0) { config_map_file_type(equalp, &g.type); *cp->vstr = dstrdup(config_file_type(g.type)); - } else if (strncmp(s, "transaction.isolation", strlen("transaction.isolation")) == 0) { - config_map_isolation(equalp, &g.c_isolation_flag); - *cp->vstr = dstrdup(equalp); } else if (strncmp(s, "logging.compression", strlen("logging.compression")) == 0) { config_map_compression(equalp, &g.c_logging_compression_flag); *cp->vstr = dstrdup(equalp); @@ -1475,25 +1407,6 @@ config_map_encryption(const char *s, u_int *vp) } /* - * config_map_isolation -- - * Map an isolation configuration to a flag. - */ -static void -config_map_isolation(const char *s, u_int *vp) -{ - if (strcmp(s, "random") == 0) - *vp = ISOLATION_RANDOM; - else if (strcmp(s, "read-uncommitted") == 0) - *vp = ISOLATION_READ_UNCOMMITTED; - else if (strcmp(s, "read-committed") == 0) - *vp = ISOLATION_READ_COMMITTED; - else if (strcmp(s, "snapshot") == 0) - *vp = ISOLATION_SNAPSHOT; - else - testutil_die(EINVAL, "illegal isolation configuration: %s", s); -} - -/* * config_is_perm * Return if a specific configuration entry was permanently set. 
*/ diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index a06509b0dba..0feb22f202c 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -60,14 +60,14 @@ typedef struct { #define COMPRESSION_LIST " (none | lz4 | snappy | zlib | zstd)" static CONFIG c[] = { - /* 5% */ - {"assert.commit_timestamp", "assert commit_timestamp", C_BOOL, 5, 0, 0, - &g.c_assert_commit_timestamp, NULL}, - - /* 5% */ - {"assert.read_timestamp", "assert read_timestamp", C_BOOL, 5, 0, 0, &g.c_assert_read_timestamp, + /* 2% */ + {"assert.read_timestamp", "assert read_timestamp", C_BOOL, 2, 0, 0, &g.c_assert_read_timestamp, NULL}, + /* 2% */ + {"assert.write_timestamp", "set write_timestamp_usage and assert write_timestamp", C_BOOL, 2, 0, + 0, &g.c_assert_write_timestamp, NULL}, + /* 20% */ {"backup", "configure backups", C_BOOL, 20, 0, 0, &g.c_backups, NULL}, @@ -98,8 +98,6 @@ static CONFIG c[] = { {"btree.internal_page_max", "btree internal node maximum size", 0x0, 9, 17, 27, &g.c_intl_page_max, NULL}, - {"btree.key_gap", "btree page instantiated key gap", 0x0, 0, 20, 20, &g.c_key_gap, NULL}, - {"btree.key_max", "maximum key size", 0x0, 20, 128, MEGABYTE(10), &g.c_key_max, NULL}, /* @@ -113,6 +111,8 @@ static CONFIG c[] = { {"btree.memory_page_max", "maximum cache page size", 0x0, 1, 10, 128, &g.c_memory_page_max, NULL}, + {"btree.prefix", "common key prefix", C_BOOL, 3, 0, 0, &g.c_prefix, NULL}, + /* 80% */ {"btree.prefix_compression", "configure prefix compressed keys", C_BOOL, 80, 0, 0, &g.c_prefix_compression, NULL}, @@ -184,8 +184,8 @@ static CONFIG c[] = { /* * 0% - * FIXME-WT-7418 and FIXME-WT-7416: Temporarily disable import until WT_ROLLBACK error and - * interaction with backup thread is fixed. Should be 20% + * FIXME-WT-7418 and FIXME-WT-7510: Temporarily disable import until WT_ROLLBACK error and + * wt_copy_and_sync error is fixed. It should be (C_BOOL, 20, 0, 0). */ {"import", "import table from newly created database", C_BOOL, 0, 0, 0, &g.c_import, NULL}, @@ -340,19 +340,11 @@ static CONFIG c[] = { /* 2% */ {"stress.split_8", "stress splits (#8)", C_BOOL, 2, 0, 0, &g.c_timing_stress_split_8, NULL}, - {"transaction.frequency", "operations inside an explicit transaction (percentage)", 0x0, 1, 100, - 100, &g.c_txn_freq, NULL}, - - {"transaction.isolation", - "isolation level (random | read-uncommitted | read-committed | snapshot)", C_IGNORE | C_STRING, - 0, 0, 0, NULL, &g.c_isolation}, - - /* 0% - By default, turned off until fallout has been debugged. 
*/ - {"transaction.rollback_to_stable", "configure rollback_to_stable", C_BOOL, 0, 0, 0, - &g.c_txn_rollback_to_stable, NULL}, + {"transaction.implicit", "implicit, without timestamps, transactions (percentage)", 0x0, 0, 100, + 100, &g.c_txn_implicit, NULL}, /* 70% */ - {"transaction.timestamps", "configure transaction timestamps", C_BOOL, 70, 0, 0, + {"transaction.timestamps", "all transactions (or none), have timestamps", C_BOOL, 80, 0, 0, &g.c_txn_timestamps, NULL}, {"wiredtiger.config", "wiredtiger_open API configuration string", C_IGNORE | C_STRING, 0, 0, 0, diff --git a/src/third_party/wiredtiger/test/format/config_compat.c b/src/third_party/wiredtiger/test/format/config_compat.c index 4a5789bf854..2926d54ca4b 100644 --- a/src/third_party/wiredtiger/test/format/config_compat.c +++ b/src/third_party/wiredtiger/test/format/config_compat.c @@ -101,10 +101,6 @@ static const char *list[] = { "btree.internal_key_truncation", "internal_page_max=", "btree.internal_page_max", - "isolation=", - "transaction.isolation", - "key_gap=", - "btree.key_gap", "key_max=", "btree.key_max", "key_min=", diff --git a/src/third_party/wiredtiger/test/format/config_compat.sed b/src/third_party/wiredtiger/test/format/config_compat.sed index 0f43b19fc6a..b90b21332e8 100644 --- a/src/third_party/wiredtiger/test/format/config_compat.sed +++ b/src/third_party/wiredtiger/test/format/config_compat.sed @@ -8,7 +8,6 @@ s/^btree.dictionary=/dictionary=/ s/^btree.huffman_value=/huffman_value=/ s/^btree.internal_key_truncation=/internal_key_truncation=/ s/^btree.internal_page_max=/internal_page_max=/ -s/^btree.key_gap=/key_gap=/ s/^btree.key_max=/key_max=/ s/^btree.key_min=/key_min=/ s/^btree.leaf_page_max=/leaf_page_max=/ diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index 7aefc071396..cd46f43a781 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -140,8 +140,8 @@ typedef struct { uint32_t c_abort; /* Config values */ uint32_t c_alter; - uint32_t c_assert_commit_timestamp; uint32_t c_assert_read_timestamp; + uint32_t c_assert_write_timestamp; uint32_t c_auto_throttle; char *c_backup_incremental; uint32_t c_backup_incr_granularity; @@ -178,8 +178,6 @@ typedef struct { uint32_t c_insert_pct; uint32_t c_internal_key_truncation; uint32_t c_intl_page_max; - char *c_isolation; - uint32_t c_key_gap; uint32_t c_key_max; uint32_t c_key_min; uint32_t c_leaf_page_max; @@ -197,6 +195,7 @@ typedef struct { uint32_t c_mmap_all; uint32_t c_modify_pct; uint32_t c_ops; + uint32_t c_prefix; uint32_t c_prefix_compression; uint32_t c_prefix_compression_min; uint32_t c_prepare; @@ -228,8 +227,7 @@ typedef struct { uint32_t c_timing_stress_split_7; uint32_t c_timing_stress_split_8; uint32_t c_truncate; - uint32_t c_txn_freq; - uint32_t c_txn_rollback_to_stable; + uint32_t c_txn_implicit; uint32_t c_txn_timestamps; uint32_t c_value_max; uint32_t c_value_min; @@ -270,13 +268,6 @@ typedef struct { #define ENCRYPT_ROTN_7 2 u_int c_encryption_flag; /* Encryption flag value */ -#define ISOLATION_NOT_SET 0 -#define ISOLATION_RANDOM 1 -#define ISOLATION_READ_UNCOMMITTED 2 -#define ISOLATION_READ_COMMITTED 3 -#define ISOLATION_SNAPSHOT 4 - u_int c_isolation_flag; /* Isolation flag value */ - /* The page must be a multiple of the allocation size, and 512 always works. 
*/ #define BLOCK_ALLOCATION_SIZE 512 uint32_t intl_page_max; /* Maximum page sizes */ @@ -284,6 +275,7 @@ typedef struct { uint64_t rows; /* Total rows */ + uint32_t prefix_len; /* Common key prefix length */ uint32_t key_rand_len[1031]; /* Key lengths */ } GLOBAL; extern GLOBAL g; diff --git a/src/third_party/wiredtiger/test/format/format.sh b/src/third_party/wiredtiger/test/format/format.sh index 9d462aed0df..a2fcc71c93e 100755 --- a/src/third_party/wiredtiger/test/format/format.sh +++ b/src/third_party/wiredtiger/test/format/format.sh @@ -256,6 +256,37 @@ skip_known_errors() return 1 } +# Categorize the failures +# $1 Log file +categorize_failure() +{ + log=$1 + + # Add any important configs to be picked from the detailed failed configuration. + configs=("backup=" "runs.source" "runs.type" "transaction.isolation" "transaction.rollback_to_stable" + "ops.prepare" "transaction.timestamps") + count=${#configs[@]} + + search_string="" + + # now loop through the config array + for ((i=0; i<$count; i++)) + do + if [ $i == $(($count - 1)) ] + then + search_string+=${configs[i]} + else + search_string+="${configs[i]}|" + fi + done + + echo "############################################" + echo "test/format run configuration highlights" + echo "############################################" + grep -E "$search_string" $log + echo "############################################" +} + # Report a failure. # $1 directory name report_failure() @@ -288,6 +319,8 @@ report_failure() echo "$name: $dir/CONFIG:" sed 's/^/ /' < $dir/CONFIG + categorize_failure $log + echo "$name: failure status reported" > $dir/$status } diff --git a/src/third_party/wiredtiger/test/format/kv.c b/src/third_party/wiredtiger/test/format/kv.c index 04e9e0fc46c..32788b86ffb 100644 --- a/src/third_party/wiredtiger/test/format/kv.c +++ b/src/third_party/wiredtiger/test/format/kv.c @@ -75,6 +75,10 @@ key_init(void) for (i = 0; i < WT_ELEMENTS(g.key_rand_len); ++i) fprintf(fp, "%" PRIu32 "\n", g.key_rand_len[i]); fclose_and_clear(&fp); + + /* Fill in the common key prefix length (which is added to the key min/max). */ + if (g.c_prefix != 0) + g.prefix_len = mmrand(NULL, 15, 80); } /* @@ -87,7 +91,7 @@ key_gen_init(WT_ITEM *key) size_t i, len; char *p; - len = WT_MAX(KILOBYTE(100), g.c_key_max); + len = WT_MAX(KILOBYTE(100), g.c_key_max + g.prefix_len); p = dmalloc(len); for (i = 0; i < len; ++i) p[i] = "abcdefghijklmnopqrstuvwxyz"[i % 26]; @@ -111,45 +115,62 @@ key_gen_teardown(WT_ITEM *key) /* * key_gen_common -- - * Key generation code shared between normal and insert key generation. + * Row-store key generation code shared between normal and insert key generation. */ void key_gen_common(WT_ITEM *key, uint64_t keyno, const char *const suffix) { - int len; + uint64_t n; char *p; + const char *bucket; + + testutil_assert(g.type == ROW); p = key->mem; /* - * The key always starts with a 10-digit string (the specified row) followed by two digits, a - * random number between 1 and 15 if it's an insert, otherwise 00. + * The workload we're trying to mimic with a prefix is a long common prefix followed by a record + * number, the tricks are creating a prefix that won't re-order keys, and to change the prefix + * with some regularity to test prefix boundaries. Split the key space into power-of-2 buckets: + * that results in tiny runs of prefix strings at the beginning of the tree, and increasingly + * large common prefixes as the tree grows (with a testing sweet spot in the middle). After the + * bucket value, append a string of common bytes. 
The standard, zero-padded key itself sorts + * lexicographically, meaning the common key prefix will grow and shrink by a few bytes as the + * number increments, which is a good thing for testing. + */ + if (g.prefix_len > 0) { + bucket = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + for (n = keyno; n > 0; n >>= 1) { + if (*bucket == 'z') + break; + ++bucket; + } + p[0] = *bucket; + memset(p + 1, 'C', g.prefix_len - 1); + p += g.prefix_len; + } + + /* + * After any common prefix, the key starts with a 10-digit string (the specified row) followed + * by two digits (a random number between 1 and 15 if it's an insert, otherwise 00). */ - u64_to_string_zf(keyno, key->mem, 11); + u64_to_string_zf(keyno, p, 11); p[10] = '.'; p[11] = suffix[0]; p[12] = suffix[1]; - len = 13; + p[13] = '/'; /* - * In a column-store, the key isn't used, it doesn't need a random length. + * Because we're doing table lookup for key sizes, we can't set overflow key sizes in the table, + * the table isn't big enough to keep our hash from selecting too many big keys and blowing out + * the cache. Handle that here, use a really big key 1 in 2500 times. */ - if (g.type == ROW) { - p[len] = '/'; - - /* - * Because we're doing table lookup for key sizes, we weren't able to set really big keys - * sizes in the table, the table isn't big enough to keep our hash from selecting too many - * big keys and blowing out the cache. Handle that here, use a really big key 1 in 2500 - * times. - */ - len = keyno % 2500 == 0 && g.c_key_max < KILOBYTE(80) ? - KILOBYTE(80) : - (int)g.key_rand_len[keyno % WT_ELEMENTS(g.key_rand_len)]; - } - key->data = key->mem; - key->size = (size_t)len; + key->size = g.prefix_len; + key->size += keyno % 2500 == 0 && g.c_key_max < KILOBYTE(80) ? + KILOBYTE(80) : + g.key_rand_len[keyno % WT_ELEMENTS(g.key_rand_len)]; + testutil_assert(key->size <= key->memsize); } static char *val_base; /* Base/original value */ diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 0e5f8a30422..3fd5706efad 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -189,35 +189,20 @@ tinfo_teardown(void) } /* - * Command used before rollback to stable to save the interesting files so we can replay the command - * as necessary. - * - * Redirect the "cd" command to /dev/null so chatty cd implementations don't add the new working - * directory to our output. - */ -#define ROLLBACK_STABLE_COPY_CMD \ - "cd %s > /dev/null && " \ - "rm -rf ROLLBACK.copy && mkdir ROLLBACK.copy && " \ - "cp WiredTiger* wt* ROLLBACK.copy/" - -/* - * tinfo_rollback_to_stable_and_check -- - * Do a rollback to stable, then check that changes are correct from what we know in the worker - * thread structures. + * tinfo_rollback_to_stable -- + * Do a rollback to stable and verify operations. */ static void -tinfo_rollback_to_stable_and_check(WT_SESSION *session) +tinfo_rollback_to_stable(WT_SESSION *session) { WT_CURSOR *cursor; - WT_DECL_RET; - char cmd[512]; - testutil_check(__wt_snprintf(cmd, sizeof(cmd), ROLLBACK_STABLE_COPY_CMD, g.home)); - if ((ret = system(cmd)) != 0) - testutil_die(ret, "rollback to stable copy (\"%s\") failed", cmd); - trace_msg("%-10s ts=%" PRIu64, "rts", g.stable_timestamp); + /* Rollback-to-stable only makes sense for timestamps and on-disk stores. 
*/ + if (g.c_txn_timestamps == 0 || g.c_in_memory != 0) + return; - g.wts_conn->rollback_to_stable(g.wts_conn, NULL); + trace_msg("%-10s ts=%" PRIu64, "rts", g.stable_timestamp); + testutil_check(g.wts_conn->rollback_to_stable(g.wts_conn, NULL)); /* Check the saved snap operations for consistency. */ testutil_check(session->open_cursor(session, g.uri, NULL, NULL, &cursor)); @@ -402,8 +387,13 @@ operations(u_int ops_seconds, bool lastrun) trace_msg("%s", "=============== thread ops stop"); - if (g.c_txn_rollback_to_stable) - tinfo_rollback_to_stable_and_check(session); + /* + * The system should be quiescent at this point, call rollback to stable. Generally, we expect + * applications to do rollback-to-stable as part of the database open, but calling it outside of + * the open path is expected in the case of applications that are "restarting" but skipping the + * close/re-open pair. + */ + tinfo_rollback_to_stable(session); if (lastrun) { tinfo_teardown(); @@ -418,20 +408,16 @@ operations(u_int ops_seconds, bool lastrun) * Begin a timestamped transaction. */ static void -begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) +begin_transaction_ts(TINFO *tinfo) { TINFO **tlp; WT_DECL_RET; WT_SESSION *session; uint64_t ts; - const char *config; char buf[64]; session = tinfo->session; - config = "isolation=snapshot"; - *iso_configp = ISOLATION_SNAPSHOT; - /* * Transaction reads are normally repeatable, but WiredTiger timestamps allow rewriting commits, * that is, applications can specify at commit time the timestamp at which the commit happens. @@ -444,7 +430,7 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) for (ts = UINT64_MAX, tlp = tinfo_list; *tlp != NULL; ++tlp) ts = WT_MIN(ts, (*tlp)->commit_ts); if (ts != 0) { - wiredtiger_begin_transaction(session, config); + wiredtiger_begin_transaction(session, NULL); /* * If the timestamp has aged out of the system, we'll get EINVAL when we try and set it. @@ -463,7 +449,7 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) testutil_check(session->rollback_transaction(session, NULL)); } - wiredtiger_begin_transaction(session, config); + wiredtiger_begin_transaction(session, NULL); /* * Otherwise, pick a current timestamp. @@ -487,40 +473,19 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) /* * begin_transaction -- - * Choose an isolation configuration and begin a transaction. + * Begin a non-timestamp transaction. 
*/ static void -begin_transaction(TINFO *tinfo, u_int *iso_configp) +begin_transaction(TINFO *tinfo, const char *iso_config) { WT_SESSION *session; - u_int v; - const char *config; session = tinfo->session; - if ((v = g.c_isolation_flag) == ISOLATION_RANDOM) - v = mmrand(&tinfo->rnd, 1, 3); - switch (v) { - case 1: - v = ISOLATION_READ_UNCOMMITTED; - config = "isolation=read-uncommitted"; - break; - case 2: - v = ISOLATION_READ_COMMITTED; - config = "isolation=read-committed"; - break; - case 3: - default: - v = ISOLATION_SNAPSHOT; - config = "isolation=snapshot"; - break; - } - *iso_configp = v; - - wiredtiger_begin_transaction(session, config); + wiredtiger_begin_transaction(session, iso_config); snap_op_init(tinfo, WT_TS_NONE, false); - trace_op(tinfo, "begin %s", config); + trace_op(tinfo, "begin %s", iso_config); } /* @@ -641,7 +606,7 @@ prepare_transaction(TINFO *tinfo) #define OP_FAILED(notfound_ok) \ do { \ positioned = false; \ - if (intxn && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK || ret == WT_CACHE_FULL)) \ + if (intxn && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK)) \ goto rollback; \ testutil_assert( \ (notfound_ok && ret == WT_NOTFOUND) || ret == WT_CACHE_FULL || ret == WT_ROLLBACK); \ @@ -660,16 +625,6 @@ prepare_transaction(TINFO *tinfo) } while (0) /* - * When in a transaction on the live table with snapshot isolation, track operations for later - * repetition. - */ -#define SNAP_TRACK(tinfo, op) \ - do { \ - if (intxn && iso_config == ISOLATION_SNAPSHOT) \ - snap_track(tinfo, op); \ - } while (0) - -/* * ops_open_session -- * Create a new session/cursor pair for the thread. */ @@ -702,6 +657,21 @@ ops_open_session(TINFO *tinfo) tinfo->cursor = cursor; } +/* Isolation configuration. */ +typedef enum { + ISOLATION_READ_COMMITTED, + ISOLATION_READ_UNCOMMITTED, + ISOLATION_SNAPSHOT +} iso_level_t; + +/* When in an explicit snapshot isolation transaction, track operations for later + * repetition. */ +#define SNAP_TRACK(tinfo, op) \ + do { \ + if (intxn && iso_level == ISOLATION_SNAPSHOT) \ + snap_track(tinfo, op); \ + } while (0) + /* * ops -- * Per-thread operations. @@ -713,10 +683,12 @@ ops(void *arg) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION *session; + iso_level_t iso_level; thread_op op; uint64_t reset_op, session_op, truncate_op; uint32_t range, rnd; - u_int i, j, iso_config; + u_int i, j; + const char *iso_config; bool greater_than, intxn, next, positioned, prepared; tinfo = arg; @@ -733,7 +705,7 @@ ops(void *arg) else __wt_random_init(&tinfo->rnd); - iso_config = ISOLATION_RANDOM; /* -Wconditional-uninitialized */ + iso_level = ISOLATION_SNAPSHOT; /* -Wconditional-uninitialized */ /* Set the first operation where we'll create sessions and cursors. */ cursor = NULL; @@ -769,9 +741,9 @@ ops(void *arg) } /* - * If not in a transaction, reset the session now and then, just to make sure that operation - * gets tested. The test is not for equality, we have to do the reset outside of a - * transaction so we aren't likely to get an exact match. + * If not in a transaction, reset the session periodically to make sure that operation is + * tested. The test is not for equality, resets must be done outside of transactions so we + * aren't likely to get an exact match. */ if (!intxn && tinfo->ops > reset_op) { testutil_check(session->reset(session)); @@ -781,42 +753,66 @@ ops(void *arg) } /* - * If not in a transaction, have a live handle and running in a timestamp world, - * occasionally repeat a timestamped operation. 
+ * If not in a transaction and in a timestamp world, occasionally repeat a timestamped + * operation. */ if (!intxn && g.c_txn_timestamps && mmrand(&tinfo->rnd, 1, 15) == 1) { ++tinfo->search; snap_repeat_single(cursor, tinfo); } + /* If not in a transaction and in a timestamp world, start a transaction. */ + if (!intxn && g.c_txn_timestamps) { + iso_level = ISOLATION_SNAPSHOT; + begin_transaction_ts(tinfo); + intxn = true; + } + /* - * If not in a transaction and have a live handle, choose an isolation level and start a - * transaction some percentage of the time. + * If not in a transaction and not in a timestamp world, start a transaction some percentage + * of the time. */ - if (!intxn && (g.c_txn_timestamps || mmrand(&tinfo->rnd, 1, 100) <= g.c_txn_freq)) { - if (g.c_txn_timestamps) - begin_transaction_ts(tinfo, &iso_config); - else - begin_transaction(tinfo, &iso_config); + if (!intxn && mmrand(&tinfo->rnd, 1, 100) < g.c_txn_implicit) { + iso_level = ISOLATION_SNAPSHOT; + iso_config = "isolation=snapshot"; + + /* Occasionally do reads at an isolation level lower than snapshot. */ + switch (mmrand(NULL, 1, 20)) { + case 1: + iso_level = ISOLATION_READ_COMMITTED; /* 5% */ + iso_config = "isolation=read-committed"; + break; + case 2: + iso_level = ISOLATION_READ_UNCOMMITTED; /* 5% */ + iso_config = "isolation=read-uncommitted"; + break; + } + + begin_transaction(tinfo, iso_config); intxn = true; } - /* Select an operation. */ + /* + * Select an operation: all updates must be in snapshot isolation, modify must be in an + * explicit transaction. + */ op = READ; - i = mmrand(&tinfo->rnd, 1, 100); - if (i < g.c_delete_pct && tinfo->ops > truncate_op) { - op = TRUNCATE; - - /* Pick the next truncate operation. */ - truncate_op += mmrand(&tinfo->rnd, 20000, 100000); - } else if (i < g.c_delete_pct) - op = REMOVE; - else if (i < g.c_delete_pct + g.c_insert_pct) - op = INSERT; - else if (i < g.c_delete_pct + g.c_insert_pct + g.c_modify_pct) - op = MODIFY; - else if (i < g.c_delete_pct + g.c_insert_pct + g.c_modify_pct + g.c_write_pct) - op = UPDATE; + if (iso_level == ISOLATION_SNAPSHOT) { + i = mmrand(&tinfo->rnd, 1, 100); + if (i < g.c_delete_pct && tinfo->ops > truncate_op) { + op = TRUNCATE; + + /* Pick the next truncate operation. */ + truncate_op += mmrand(&tinfo->rnd, 20000, 100000); + } else if (i < g.c_delete_pct) + op = REMOVE; + else if (i < g.c_delete_pct + g.c_insert_pct) + op = INSERT; + else if (intxn && i < g.c_delete_pct + g.c_insert_pct + g.c_modify_pct) + op = MODIFY; + else if (i < g.c_delete_pct + g.c_insert_pct + g.c_modify_pct + g.c_write_pct) + op = UPDATE; + } /* Select a row. */ tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)g.rows); @@ -838,10 +834,10 @@ ops(void *arg) } /* - * Optionally reserve a row. Reserving a row before a read isn't all that sensible, but not - * unexpected, either. + * Optionally reserve a row, it's an update so it requires snapshot isolation. Reserving a + * row before a read isn't all that sensible, but not unexpected, either. */ - if (intxn && mmrand(&tinfo->rnd, 0, 20) == 1) { + if (intxn && iso_level == ISOLATION_SNAPSHOT && mmrand(&tinfo->rnd, 0, 20) == 1) { switch (g.type) { case ROW: ret = row_reserve(tinfo, cursor, positioned); @@ -853,8 +849,7 @@ ops(void *arg) } if (ret == 0) { positioned = true; - - __wt_yield(); /* Let other threads proceed. 
*/ + __wt_yield(); /* Encourage races */ } else WRITE_OP_FAILED(true); } @@ -888,13 +883,6 @@ ops(void *arg) WRITE_OP_FAILED(false); break; case MODIFY: - /* - * Change modify into update if not part of a snapshot isolation transaction, modify - * isn't supported in those cases. - */ - if (!intxn || iso_config != ISOLATION_SNAPSHOT) - goto update_instead_of_chosen_op; - ++tinfo->update; switch (g.type) { case ROW: @@ -1050,17 +1038,17 @@ update_instead_of_chosen_op: testutil_check(cursor->reset(cursor)); /* - * Continue if not in a transaction, else add more operations to the transaction half the - * time. + * No post-operation work is needed outside of a transaction. If in a transaction, add more + * operations to the transaction half the time. */ if (!intxn || (rnd = mmrand(&tinfo->rnd, 1, 10)) > 5) continue; /* - * Ending a transaction. If on a live handle and the transaction was configured for snapshot - * isolation, repeat the operations and confirm the results are unchanged. + * Ending a transaction. If the transaction was configured for snapshot isolation, repeat + * the operations and confirm the results are unchanged. */ - if (intxn && iso_config == ISOLATION_SNAPSHOT) { + if (intxn && iso_level == ISOLATION_SNAPSHOT) { __wt_yield(); /* Encourage races */ ret = snap_repeat_txn(cursor, tinfo); @@ -1069,13 +1057,10 @@ update_instead_of_chosen_op: goto rollback; } - /* - * If prepare configured, prepare the transaction 10% of the time. - */ + /* If prepare configured, prepare the transaction 10% of the time. */ prepared = false; if (g.c_prepare && mmrand(&tinfo->rnd, 1, 10) == 1) { - ret = prepare_transaction(tinfo); - if (ret != 0) + if ((ret = prepare_transaction(tinfo)) != 0) WRITE_OP_FAILED(false); __wt_yield(); /* Encourage races */ @@ -1083,7 +1068,8 @@ update_instead_of_chosen_op: } /* - * If we're in a transaction, commit 40% of the time and rollback 10% of the time. + * If we're in a transaction, commit 40% of the time and rollback 10% of the time (we + * continued to add operations to the transaction the remaining 50% of the time). */ switch (rnd) { case 1: @@ -1351,8 +1337,8 @@ order_error_col: * to the row's key.) Keys are strings with terminating '/' values, so absent key * corruption, we can simply do the underlying string conversion on the key string. */ - keyno_prev = strtoul(tinfo->key->data, NULL, 10); - keyno = strtoul(key.data, NULL, 10); + keyno_prev = strtoul((char *)tinfo->key->data + g.prefix_len, NULL, 10); + keyno = strtoul((char *)key.data + g.prefix_len, NULL, 10); if (incrementing) { if (keyno_prev != keyno && keyno_prev + 1 != keyno) goto order_error_row; diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c index bc2b58b2f3c..1c934a5d187 100644 --- a/src/third_party/wiredtiger/test/format/snap.c +++ b/src/third_party/wiredtiger/test/format/snap.c @@ -29,11 +29,6 @@ #include "format.h" /* - * Issue a warning when there enough consecutive unsuccessful checks for rollback to stable. - */ -#define WARN_RTS_NO_CHECK 5 - -/* * snap_init -- * Initialize the repeatable operation tracking. */ @@ -41,14 +36,16 @@ void snap_init(TINFO *tinfo) { /* - * We maintain two snap lists. The current one is indicated by tinfo->s, and keeps the most - * recent operations. The other one is used when we are running with rollback_to_stable. When - * each thread notices that the stable timestamp has changed, it stashes the current snap list - * and starts fresh with the other snap list. 
After we've completed a rollback_to_stable, we can - * the secondary snap list to see the state of keys/values seen and updated at the time of the - * rollback. + * We maintain two snap lists, where the current one is indicated by tinfo->s, and keeps the + * most recent operations. + * + * The other one is used when we are running timestamp transactions with rollback_to_stable. + * When each thread notices that the stable timestamp has changed, it stashes the current snap + * list and starts fresh with the other snap list. After we've completed a rollback_to_stable, + * we can the secondary snap list to see the state of keys/values seen and updated at the time + * of the rollback. */ - if (g.c_txn_rollback_to_stable) { + if (g.c_txn_timestamps) { tinfo->s = &tinfo->snap_states[1]; tinfo->snap_list = dcalloc(SNAP_LIST_SIZE, sizeof(SNAP_OPS)); tinfo->snap_end = &tinfo->snap_list[SNAP_LIST_SIZE]; @@ -113,7 +110,7 @@ snap_op_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads) ++tinfo->opid; - if (g.c_txn_rollback_to_stable) { + if (g.c_txn_timestamps) { /* * If the stable timestamp has changed and we've advanced beyond it, preserve the current * snapshot history up to this point, we'll use it verify rollback_to_stable. Switch our @@ -528,40 +525,45 @@ snap_repeat_update(TINFO *tinfo, bool committed) * Repeat one operation. */ static void -snap_repeat(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap, bool rollback_allowed) +snap_repeat(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) { WT_DECL_RET; WT_SESSION *session; +#define MAX_RETRY_ON_ROLLBACK 1000 + u_int max_retry; char buf[64]; session = cursor->session; - /* - * Start a new transaction. Set the read timestamp. Verify the record. Discard the transaction. - */ - wiredtiger_begin_transaction(session, "isolation=snapshot"); + trace_op(tinfo, "repeat %" PRIu64 " ts=%" PRIu64 " {%s}", snap->keyno, snap->ts, + trace_bytes(tinfo, snap->vdata, snap->vsize)); - /* - * If the timestamp has aged out of the system, we'll get EINVAL when we try and set it. - */ + /* Start a transaction with a read-timestamp and verify the record. */ testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, snap->ts)); - ret = session->timestamp_transaction(session, buf); - if (ret == 0) { - trace_op(tinfo, "repeat %" PRIu64 " ts=%" PRIu64 " {%s}", snap->keyno, snap->ts, - trace_bytes(tinfo, snap->vdata, snap->vsize)); - - /* The only expected error is rollback. */ - ret = snap_verify(cursor, tinfo, snap); + for (max_retry = 0; max_retry < MAX_RETRY_ON_ROLLBACK; ++max_retry, __wt_yield()) { + wiredtiger_begin_transaction(session, "isolation=snapshot"); - if (ret != 0 && (!rollback_allowed || (ret != WT_ROLLBACK && ret != WT_CACHE_FULL))) - testutil_check(ret); - } else if (ret == EINVAL) - snap_ts_clear(tinfo, snap->ts); - else + /* EINVAL means the timestamp has aged out of the system. */ + if ((ret = session->timestamp_transaction(session, buf)) == EINVAL) { + snap_ts_clear(tinfo, snap->ts); + break; + } testutil_check(ret); - /* Discard the transaction. */ + /* + * The only expected error is rollback (as a read-only transaction, cache-full shouldn't + * matter to us). Persist after rollback, as a repeatable read we should succeed, yield to + * let eviction catch up. 
+ */ + if ((ret = snap_verify(cursor, tinfo, snap)) == 0) + break; + testutil_assert(ret == WT_ROLLBACK); + + testutil_check(session->rollback_transaction(session, NULL)); + } + testutil_assert(max_retry < MAX_RETRY_ON_ROLLBACK); + testutil_check(session->rollback_transaction(session, NULL)); } @@ -593,7 +595,7 @@ snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo) if (count == 0) return; - snap_repeat(cursor, tinfo, snap, true); + snap_repeat(cursor, tinfo, snap); } /* @@ -626,9 +628,8 @@ snap_repeat_rollback(WT_CURSOR *cursor, TINFO **tinfo_array, size_t tinfo_count) for (statenum = 0; statenum < WT_ELEMENTS(tinfo->snap_states); statenum++) { state = &tinfo->snap_states[statenum]; for (snap = state->snap_state_list; snap < state->snap_state_end; ++snap) { - if (snap->repeatable && snap->ts <= g.stable_timestamp && - snap->ts >= g.oldest_timestamp) { - snap_repeat(cursor, tinfo, snap, false); + if (snap->repeatable && snap->ts <= g.stable_timestamp) { + snap_repeat(cursor, tinfo, snap); ++count; if (count % 100 == 0) { testutil_check(__wt_snprintf( @@ -646,6 +647,7 @@ snap_repeat_rollback(WT_CURSOR *cursor, TINFO **tinfo_array, size_t tinfo_count) __wt_snprintf(buf, sizeof(buf), "rollback_to_stable: %" PRIu32 " ops repeated", count)); track(buf, 0ULL, NULL); if (count == 0) { +#define WARN_RTS_NO_CHECK 5 if (++g.rts_no_check >= WARN_RTS_NO_CHECK) fprintf(stderr, "Warning: %" PRIu32 " consecutive runs with no rollback_to_stable checking\n", count); diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index f35b2a8416c..8c5efd007ee 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -282,7 +282,7 @@ timestamp_once(bool allow_lag, bool final) /* * If a lag is permitted, move the oldest timestamp half the way to the current - * "all_durable" timestamp. Move the stable timestamp to "all_durable". + * "all_durable" timestamp. Move the stable timestamp to "all_durable". */ if (allow_lag) g.oldest_timestamp = (all_durable + g.oldest_timestamp) / 2; diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c index 3b37b3a43d1..f95d7903c94 100644 --- a/src/third_party/wiredtiger/test/format/wts.c +++ b/src/third_party/wiredtiger/test/format/wts.c @@ -319,10 +319,8 @@ create_object(WT_CONNECTION *conn) CONFIG_APPEND(p, ",value_format=%" PRIu32 "t", g.c_bitcnt); break; case ROW: - if (g.c_prefix_compression) - CONFIG_APPEND(p, ",prefix_compression_min=%" PRIu32, g.c_prefix_compression_min); - else - CONFIG_APPEND(p, ",prefix_compression=false"); + CONFIG_APPEND(p, ",prefix_compression=%s,prefix_compression_min=%" PRIu32, + g.c_prefix_compression == 0 ? "false" : "true", g.c_prefix_compression_min); if (g.c_reverse) CONFIG_APPEND(p, ",collator=reverse"); /* FALLTHROUGH */ @@ -351,22 +349,16 @@ create_object(WT_CONNECTION *conn) if (g.c_compression_flag != COMPRESS_NONE) CONFIG_APPEND(p, ",block_compressor=\"%s\"", compressor(g.c_compression_flag)); - /* Configure Btree internal key truncation. */ + /* Configure Btree. */ CONFIG_APPEND(p, ",internal_key_truncate=%s", g.c_internal_key_truncation ? "true" : "false"); - - /* Configure Btree page key gap. */ - CONFIG_APPEND(p, ",key_gap=%" PRIu32, g.c_key_gap); - - /* Configure Btree split page percentage. */ CONFIG_APPEND(p, ",split_pct=%" PRIu32, g.c_split_pct); - /* - * Assertions. Assertions slow down the code for additional diagnostic checking. 
- */ - if (g.c_txn_timestamps && g.c_assert_commit_timestamp) - CONFIG_APPEND(p, ",write_timestamp_usage=key_consistent,assert=(write_timestamp=on)"); - if (g.c_txn_timestamps && g.c_assert_read_timestamp) - CONFIG_APPEND(p, ",assert=(read_timestamp=always)"); + /* Assertions: assertions slow down the code for additional diagnostic checking. */ + if (g.c_assert_read_timestamp) + CONFIG_APPEND(p, ",assert=(read_timestamp=%s)", g.c_txn_timestamps ? "always" : "never"); + if (g.c_assert_write_timestamp) + CONFIG_APPEND(p, ",assert=(write_timestamp=on),write_timestamp_usage=%s", + g.c_txn_timestamps ? "always" : "never"); /* Configure LSM. */ if (DATASOURCE("lsm")) { diff --git a/src/third_party/wiredtiger/test/suite/test_backup22.py b/src/third_party/wiredtiger/test/suite/test_backup22.py new file mode 100644 index 00000000000..06d1a81ef7c --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_backup22.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, os +from wtscenario import make_scenarios +from wtbackup import backup_base + +# test_backup22.py +# Test interaction between import and incremental backup. +# Test the functionality of importing dropped tables in incremental backup. +# +class test_backup22(backup_base): + create_config = 'allocation_size=512,key_format=i,value_format=i' + # Backup directory name + dir='backup.dir' + incr_dir = 'incr_backup.dir' + uri = 'test_backup22' + scenarios = make_scenarios([ + ('import_with_metadata', dict(repair=False,checkpoint=False)), + ('import_repair', dict(repair=True,checkpoint=False)), + ('import_with_metadata_ckpt', dict(repair=False,checkpoint=True)), + ('import_repair_ckpt', dict(repair=True,checkpoint=True)), + ]) + + def test_import_with_open_backup_cursor(self): + os.mkdir(self.dir) + os.mkdir(self.incr_dir) + + # Create and populate the table. + table_uri = 'table:' + self.uri + self.session.create(table_uri, self.create_config) + cursor = self.session.open_cursor(table_uri) + for i in range(1, 1000): + cursor[i] = i + cursor.close() + self.session.checkpoint() + + # Export the metadata for the file. 
+ file_uri = 'file:' + self.uri + '.wt' + c = self.session.open_cursor('metadata:', None, None) + original_db_table_config = c[table_uri] + original_db_file_config = c[file_uri] + c.close() + + config = 'incremental=(enabled,granularity=4k,this_id="ID1")' + bkup_c = self.session.open_cursor('backup:', None, config) + self.take_full_backup(self.dir, bkup_c) + bkup_c.close() + self.session.drop(table_uri, 'remove_files=false') + + # First construct the config string for the default or repair import scenario, + # then call create to import the table. + if self.repair: + import_config = 'import=(enabled,repair=true)' + else: + import_config = '{},import=(enabled,repair=false,file_metadata=({}))'.format( + original_db_table_config, original_db_file_config) + self.session.create(table_uri, import_config) + + if self.checkpoint: + self.session.checkpoint() + # Perform incremental backup with id 2 on empty directory. We want empty directory + # because we expect all files to be copied over in it's entirety. + self.take_incr_backup(self.incr_dir, 2) + self.compare_backups(self.uri, self.dir, self.incr_dir) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_prepare15.py b/src/third_party/wiredtiger/test/suite/test_prepare15.py new file mode 100644 index 00000000000..4c4ba49a182 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_prepare15.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wttest +from wiredtiger import WT_NOTFOUND +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +# test_prepare15.py +# Test that the prepare transaction rollback removes the on-disk key +# or replace it with history store and commit retains the changes when +# both insert and remove operations are from the same transaction. 
+class test_prepare15(wttest.WiredTigerTestCase): + in_memory_values = [ + ('no_inmem', dict(in_memory=False)), + ('inmem', dict(in_memory=True)) + ] + + key_format_values = [ + ('column', dict(key_format='r')), + ('integer_row', dict(key_format='i')), + ] + + txn_end_values = [ + ('commit', dict(commit=True)), + ('rollback', dict(commit=False)), + ] + + scenarios = make_scenarios(in_memory_values, key_format_values, txn_end_values) + + def conn_config(self): + config = 'cache_size=50MB' + if self.in_memory: + config += ',in_memory=true' + else: + config += ',in_memory=false' + return config + + def test_prepare_restore_hs_update(self): + # Prepare transactions for column store table is not yet supported. + if self.key_format == 'r': + self.skipTest('Prepare transactions for column store table is not yet supported') + + # Create a table without logging. + uri = "table:prepare15" + create_config = 'allocation_size=512,key_format=S,value_format=S' + self.session.create(uri, create_config) + + # Pin oldest and stable timestamps to 10. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) + + ',stable_timestamp=' + timestamp_str(10)) + + valuea = 'a' + valueb = 'a' + + # Perform an update and remove. + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[str(0)] = valuea + self.session.commit_transaction('commit_timestamp=' + timestamp_str(20)) + + self.session.begin_transaction() + cursor.set_key(str(0)) + cursor.remove() + self.session.commit_transaction('commit_timestamp=' + timestamp_str(30)) + cursor.close() + + # Perform an update and remove. + s = self.conn.open_session() + cursor = s.open_cursor(uri) + s.begin_transaction() + cursor[str(0)] = valueb + cursor.set_key(str(0)) + cursor.remove() + cursor.close() + s.prepare_transaction('prepare_timestamp=' + timestamp_str(40)) + + # Configure debug behavior on a cursor to evict the page positioned on when the reset API is used. + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + + # Search for the key so we position our cursor on the page that we want to evict. + self.session.begin_transaction('ignore_prepare = true') + evict_cursor.set_key(str(0)) + self.assertEquals(evict_cursor.search(), WT_NOTFOUND) + evict_cursor.reset() + evict_cursor.close() + self.session.commit_transaction() + + if self.commit: + # Commit the prepared transaction + s.timestamp_transaction('commit_timestamp=' + timestamp_str(50)) + s.timestamp_transaction('durable_timestamp=' + timestamp_str(60)) + s.commit_transaction() + else: + # Rollback the prepared transaction + s.rollback_transaction() + + # Configure debug behavior on a cursor to evict the page positioned on when the reset API is used. + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + + # Search for the key so we position our cursor on the page that we want to evict. + self.session.begin_transaction() + evict_cursor.set_key(str(0)) + self.assertEquals(evict_cursor.search(), WT_NOTFOUND) + evict_cursor.reset() + evict_cursor.close() + self.session.commit_transaction() + + self.session.begin_transaction('read_timestamp=' + timestamp_str(20)) + cursor2 = self.session.open_cursor(uri) + cursor2.set_key(str(0)) + self.assertEquals(cursor2.search(), 0) + self.assertEqual(cursor2.get_value(), valuea) + self.session.commit_transaction() + + def test_prepare_not_found(self): + # Prepare transactions for column store table is not yet supported. 
+ if self.key_format == 'r': + self.skipTest('Prepare transactions for column store table is not yet supported') + + # Create a table without logging. + uri = "table:prepare15" + create_config = 'allocation_size=512,key_format=S,value_format=S' + self.session.create(uri, create_config) + + # Pin oldest and stable timestamps to 10. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) + + ',stable_timestamp=' + timestamp_str(10)) + + value = 'a' + + # Perform an update and remove. + s = self.conn.open_session() + cursor = s.open_cursor(uri) + s.begin_transaction() + cursor[str(0)] = value + cursor.set_key(str(0)) + cursor.remove() + cursor.close() + s.prepare_transaction('prepare_timestamp=' + timestamp_str(20)) + + # Configure debug behavior on a cursor to evict the page positioned on when the reset API is used. + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + + # Search for the key so we position our cursor on the page that we want to evict. + self.session.begin_transaction("ignore_prepare = true") + evict_cursor.set_key(str(0)) + self.assertEquals(evict_cursor.search(), WT_NOTFOUND) + evict_cursor.reset() + evict_cursor.close() + self.session.commit_transaction() + + if self.commit: + # Commit the prepared transaction + s.timestamp_transaction('commit_timestamp=' + timestamp_str(30)) + s.timestamp_transaction('durable_timestamp=' + timestamp_str(40)) + s.commit_transaction() + else: + # Rollback the prepared transaction + s.rollback_transaction() + + # Configure debug behavior on a cursor to evict the page positioned on when the reset API is used. + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + + # Search for the key so we position our cursor on the page that we want to evict. + self.session.begin_transaction() + evict_cursor.set_key(str(0)) + self.assertEquals(evict_cursor.search(), WT_NOTFOUND) + evict_cursor.reset() + evict_cursor.close() + self.session.commit_transaction() + + self.session.begin_transaction() + cursor2 = self.session.open_cursor(uri) + cursor2.set_key(str(0)) + self.assertEquals(cursor2.search(), WT_NOTFOUND) + self.session.commit_transaction() diff --git a/src/third_party/wiredtiger/test/suite/test_tiered04.py b/src/third_party/wiredtiger/test/suite/test_tiered04.py index 0347647031f..75d4fac1b19 100755 --- a/src/third_party/wiredtiger/test/suite/test_tiered04.py +++ b/src/third_party/wiredtiger/test/suite/test_tiered04.py @@ -35,7 +35,7 @@ StorageSource = wiredtiger.StorageSource # easy access to constants class test_tiered04(wttest.WiredTigerTestCase): # If the 'uri' changes all the other names must change with it. - fileuri = 'file:test_tiered04-0000000001.wt' + fileuri = 'file:test_tiered04-0000000001.wtobj' objuri = 'object:test_tiered04-0000000001.wtobj' tiereduri = "tiered:test_tiered04" uri = "table:test_tiered04" diff --git a/src/third_party/wiredtiger/test/suite/test_tiered06.py b/src/third_party/wiredtiger/test/suite/test_tiered06.py index e0614cd8c1b..c797936a82b 100755 --- a/src/third_party/wiredtiger/test/suite/test_tiered06.py +++ b/src/third_party/wiredtiger/test/suite/test_tiered06.py @@ -64,7 +64,7 @@ class test_tiered06(wttest.WiredTigerTestCase): local = self.get_local_storage_source() os.mkdir("objects") - fs = local.ss_customize_file_system(session, "./objects", "cluster1-", "Secret", None) + fs = local.ss_customize_file_system(session, "./objects", "Secret", None) # The object doesn't exist yet. 
self.assertFalse(fs.fs_exist(session, 'foobar')) @@ -95,33 +95,29 @@ class test_tiered06(wttest.WiredTigerTestCase): fh.fh_lock(session, False) fh.close(session) - self.assertEquals(fs.fs_directory_list(session, '', ''), ['foobar']) + # Nothing is in the directory list until a flush. + self.assertEquals(fs.fs_directory_list(session, '', ''), []) - # Newly created objects are in the list. fh = fs.fs_open_file(session, 'zzz', FileSystem.open_file_type_data, FileSystem.open_create) - # TODO: tiered: the newly created file should be visible, but it is not yet. - # self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), ['foobar', 'zzz' ]) - # Sync merely syncs to the local disk. fh.fh_sync(session) fh.close(session) # zero length - self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), ['foobar', 'zzz' ]) + self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), []) # See that we can rename objects. fs.fs_rename(session, 'zzz', 'yyy', 0) - self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), ['foobar', 'yyy' ]) + self.assertEquals(sorted(fs.fs_directory_list(session, '', '')), []) # See that we can remove objects. fs.fs_remove(session, 'yyy', 0) - self.assertEquals(fs.fs_directory_list(session, '', ''), ['foobar']) - # TODO: tiered: flush tests disabled, as the interface - # for flushing will be changed. - return + # Nothing is in the directory list until a flush. + self.assertEquals(fs.fs_directory_list(session, '', ''), []) - # Flushing doesn't do anything that's visible. - local.ss_flush(session, fs, None, '') + # Flushing moves the file. + local.ss_flush(session, fs, 'foobar', 'foobar', None) + local.ss_flush_finish(session, fs, 'foobar', 'foobar', None) self.assertEquals(fs.fs_directory_list(session, '', ''), ['foobar']) # Files that have been flushed cannot be manipulated. @@ -145,7 +141,7 @@ class test_tiered06(wttest.WiredTigerTestCase): local = self.get_local_storage_source() os.mkdir("objects") - fs = local.ss_customize_file_system(session, "./objects", "cluster1-", "Secret", None) + fs = local.ss_customize_file_system(session, "./objects", "Secret", None) # We call these 4K chunks of data "blocks" for this test, but that doesn't # necessarily relate to WT block sizing. @@ -208,11 +204,23 @@ class test_tiered06(wttest.WiredTigerTestCase): cachedir1 = "./cache1" cachedir2 = "./cache2" - def check(self, fs, prefix, expect): + # Add a suffix to each in a list + def suffix(self, lst, sfx): + return [x + '.' + sfx for x in lst] + + def check_dirlist(self, fs, prefix, expect): # We don't require any sorted output for directory lists, # so we'll sort before comparing.' got = sorted(fs.fs_directory_list(self.session, '', prefix)) - expect = sorted(expect) + expect = sorted(self.suffix(expect, 'wtobj')) + self.assertEquals(got, expect) + + # Check for data files in the WiredTiger home directory. + def check_home(self, expect): + # Get list of all .wt files in home, prune out the WiredTiger produced ones + got = sorted(list(os.listdir(self.home))) + got = [x for x in got if not x.startswith('WiredTiger') and x.endswith('.wt')] + expect = sorted(self.suffix(expect, 'wt')) self.assertEquals(got, expect) # Check that objects are "in the cloud" after a flush. 
@@ -220,12 +228,25 @@ class test_tiered06(wttest.WiredTigerTestCase): # objectdir1 or objectdir2 def check_objects(self, expect1, expect2): got = sorted(list(os.listdir(self.objectdir1))) - expect = sorted(expect1) + expect = sorted(self.suffix(expect1, 'wtobj')) self.assertEquals(got, expect) got = sorted(list(os.listdir(self.objectdir2))) - expect = sorted(expect2) + expect = sorted(self.suffix(expect2, 'wtobj')) self.assertEquals(got, expect) + # Check that objects are in the cache directory after flush_finish. + def check_caches(self, expect1, expect2): + got = sorted(list(os.listdir(self.cachedir1))) + expect = sorted(self.suffix(expect1, 'wtobj')) + self.assertEquals(got, expect) + got = sorted(list(os.listdir(self.cachedir2))) + expect = sorted(self.suffix(expect2, 'wtobj')) + self.assertEquals(got, expect) + + def create_wt_file(self, name): + with open(name + '.wt', 'w') as f: + f.write('hello') + def test_local_file_systems(self): # Test using various buckets, hosts @@ -244,11 +265,11 @@ class test_tiered06(wttest.WiredTigerTestCase): errmsg = '/No such file or directory/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: local.ss_customize_file_system( - session, "./objects1", "pre1-", "k1", bad_config), errmsg) + session, "./objects1", "k1", bad_config), errmsg) self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: local.ss_customize_file_system( - session, "./objects_BAD", "pre1-", "k1", config1), errmsg) + session, "./objects_BAD", "k1", config1), errmsg) # Create an empty file, try to use it as a directory. with open("some_file", "w"): @@ -256,143 +277,75 @@ class test_tiered06(wttest.WiredTigerTestCase): errmsg = '/Invalid argument/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: local.ss_customize_file_system( - session, "some_file", "pre1-", "k1", config1), errmsg) + session, "some_file", "k1", config1), errmsg) # Now create some file systems that should succeed. # Use either different bucket directories or different prefixes, # so activity that happens in the various file systems should be independent. - fs1 = local.ss_customize_file_system(session, "./objects1", "pre1-", "k1", config1) - fs2 = local.ss_customize_file_system(session, "./objects2", "pre1-", "k2", config2) - fs3 = local.ss_customize_file_system(session, "./objects1", "pre2-", "k3", config1) - fs4 = local.ss_customize_file_system(session, "./objects2", "pre2-", "k4", config2) - - # Create files in the file systems with some name overlap - self.create_with_fs(fs1, 'alpaca') - self.create_with_fs(fs2, 'bear') - self.create_with_fs(fs3, 'crab') - self.create_with_fs(fs4, 'deer') + fs1 = local.ss_customize_file_system(session, "./objects1", "k1", config1) + fs2 = local.ss_customize_file_system(session, "./objects2", "k2", config2) + + # Create files in the wt home directory. 
for a in ['beagle', 'bird', 'bison', 'bat']: - self.create_with_fs(fs1, a) - for a in ['bird', 'bison', 'bat', 'badger']: - self.create_with_fs(fs2, a) - for a in ['bison', 'bat', 'badger', 'baboon']: - self.create_with_fs(fs3, a) - for a in ['bat', 'badger', 'baboon', 'beagle']: - self.create_with_fs(fs4, a) - - # Make sure we see the expected file names - self.check(fs1, '', ['alpaca', 'beagle', 'bird', 'bison', 'bat']) - self.check(fs1, 'a', ['alpaca']) - self.check(fs1, 'b', ['beagle', 'bird', 'bison', 'bat']) - self.check(fs1, 'c', []) - self.check(fs1, 'd', []) - - self.check(fs2, '', ['bear', 'bird', 'bison', 'bat', 'badger']) - self.check(fs2, 'a', []) - self.check(fs2, 'b', ['bear', 'bird', 'bison', 'bat', 'badger']) - self.check(fs2, 'c', []) - self.check(fs2, 'd', []) - - self.check(fs3, '', ['crab', 'bison', 'bat', 'badger', 'baboon']) - self.check(fs3, 'a', []) - self.check(fs3, 'b', ['bison', 'bat', 'badger', 'baboon']) - self.check(fs3, 'c', ['crab']) - self.check(fs3, 'd', []) - - self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle']) - self.check(fs4, 'a', []) - self.check(fs4, 'b', ['bat', 'badger', 'baboon', 'beagle']) - self.check(fs4, 'c', []) - self.check(fs4, 'd', ['deer']) - - # Flushing copies files to one of the subdirectories: - # "./objects1" (for fs1 and fs3) - # "./objects2" (for fs2 and fs4) - # - # After every flush, we'll check that the right objects appear in the right directory. - # check_objects takes two lists: objects expected to be in ./objects1, - # and objects expected to be in ./objects2 . + self.create_wt_file(a) + for a in ['cat', 'cougar', 'coyote', 'cub']: + self.create_wt_file(a) + + # Everything is in wt home, nothing in the file system yet. + self.check_home(['beagle', 'bird', 'bison', 'bat', 'cat', 'cougar', 'coyote', 'cub']) + self.check_dirlist(fs1, '', []) + self.check_dirlist(fs2, '', []) + self.check_caches([], []) self.check_objects([], []) - # TODO: tiered: flush tests disabled, as the interface - # for flushing will be changed. 
- enable_fs_flush_tests = False - if enable_fs_flush_tests: - local.ss_flush(session, fs4, None, '') - self.check_objects([], ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - local.ss_flush(session, fs3, 'badger', '') - self.check_objects(['pre2-badger'], - ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - #local.ss_flush(session, fs3, 'c', '') # make sure we don't flush prefixes - self.check_objects(['pre2-badger'], - ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - local.ss_flush(session, fs3, 'b', '') # or suffixes - self.check_objects(['pre2-badger'], - ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - local.ss_flush(session, fs3, 'crab', '') - self.check_objects(['pre2-crab', 'pre2-badger'], - ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - local.ss_flush(session, fs3, 'crab', '') # should do nothing - self.check_objects(['pre2-crab', 'pre2-badger'], - ['pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - local.ss_flush(session, None, None, '') # flush everything else - self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat', - 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'], - ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger', - 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - local.ss_flush(session, None, None, '') # should do nothing - self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat', - 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'], - ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger', - 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - self.create_with_fs(fs4, 'zebra') # should do nothing in the objects directories - self.create_with_fs(fs4, 'yeti') # should do nothing in the objects directories - self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat', - 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'], - ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger', - 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle']) - - # Try remove and rename, should be possible until we flush - self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'yeti', 'zebra']) - fs4.fs_remove(session, 'yeti', 0) - self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'zebra']) - fs4.fs_rename(session, 'zebra', 'okapi', 0) - self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'okapi']) - local.ss_flush(session, None, None, '') - self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'okapi']) - self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat', - 'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'], - ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger', - 'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle', - 'pre2-okapi']) - - errmsg = '/rename of flushed file not allowed/' - self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: fs4.fs_rename(session, 'okapi', 'zebra', 0), errmsg) - - # XXX - # At the moment, removal of flushed files is not allowed - as flushed files are immutable. 
-        # We may need to explicitly evict flushed files from cache directory via the API, if so,
-        # the API to do that might be on the local store object, not the file system.
-        errmsg = '/remove of flushed file not allowed/'
-        self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
-            lambda: fs4.fs_remove(session, 'okapi', 0), errmsg)
-
-        # No change since last time.
-        self.check(fs4, '', ['deer', 'bat', 'badger', 'baboon', 'beagle', 'okapi'])
-        self.check_objects(['pre1-alpaca', 'pre1-beagle', 'pre1-bird', 'pre1-bison', 'pre1-bat',
-            'pre2-crab', 'pre2-bison', 'pre2-bat', 'pre2-badger', 'pre2-baboon'],
-            ['pre1-bear', 'pre1-bird', 'pre1-bison', 'pre1-bat', 'pre1-badger',
-            'pre2-deer', 'pre2-bat', 'pre2-badger', 'pre2-baboon', 'pre2-beagle',
-            'pre2-okapi'])
+        # A flush copies to the cloud, nothing is removed.
+        local.ss_flush(session, fs1, 'beagle.wt', 'beagle.wtobj')
+        self.check_home(['beagle', 'bird', 'bison', 'bat', 'cat', 'cougar', 'coyote', 'cub'])
+        self.check_dirlist(fs1, '', [])
+        self.check_dirlist(fs2, '', [])
+        self.check_caches([], [])
+        self.check_objects(['beagle'], [])
+
+        # Bad file to flush
+        errmsg = '/No such file/'
+        self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+            lambda: local.ss_flush(session, fs1, 'bad.wt', 'bad.wtobj'), errmsg)
+
+        # It's okay to flush again, nothing changes
+        local.ss_flush(session, fs1, 'beagle.wt', 'beagle.wtobj')
+        self.check_home(['beagle', 'bird', 'bison', 'bat', 'cat', 'cougar', 'coyote', 'cub'])
+        self.check_dirlist(fs1, '', [])
+        self.check_dirlist(fs2, '', [])
+        self.check_caches([], [])
+        self.check_objects(['beagle'], [])
+
+        # When we flush_finish, the local file will move to the cache directory
+        local.ss_flush_finish(session, fs1, 'beagle.wt', 'beagle.wtobj')
+        self.check_home(['bird', 'bison', 'bat', 'cat', 'cougar', 'coyote', 'cub'])
+        self.check_dirlist(fs1, '', ['beagle'])
+        self.check_dirlist(fs2, '', [])
+        self.check_caches(['beagle'], [])
+        self.check_objects(['beagle'], [])
+
+        # Do some more in each file system
+        local.ss_flush(session, fs1, 'bison.wt', 'bison.wtobj')
+        local.ss_flush(session, fs2, 'cat.wt', 'cat.wtobj')
+        local.ss_flush(session, fs1, 'bat.wt', 'bat.wtobj')
+        local.ss_flush_finish(session, fs2, 'cat.wt', 'cat.wtobj')
+        local.ss_flush(session, fs2, 'cub.wt', 'cub.wtobj')
+        local.ss_flush_finish(session, fs1, 'bat.wt', 'bat.wtobj')
+
+        self.check_home(['bird', 'bison', 'cougar', 'coyote', 'cub'])
+        self.check_dirlist(fs1, '', ['beagle', 'bat'])
+        self.check_dirlist(fs2, '', ['cat'])
+        self.check_caches(['beagle', 'bat'], ['cat'])
+        self.check_objects(['beagle', 'bat', 'bison'], ['cat', 'cub'])
+
+        # Test directory listing prefixes
+        self.check_dirlist(fs1, '', ['beagle', 'bat'])
+        self.check_dirlist(fs1, 'ba', ['bat'])
+        self.check_dirlist(fs1, 'be', ['beagle'])
+        self.check_dirlist(fs1, 'x', [])

 if __name__ == '__main__':
     wttest.run()
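
To make the prefix-bucket scheme described in the kv.c change above concrete, here is a minimal standalone sketch of the key layout it produces. It is not part of the commit: make_key, PREFIX_LEN, and the fixed 'C' filler byte are illustrative stand-ins (test/format picks a random prefix length between 15 and 80 and randomizes key lengths), but the bucket-selection loop mirrors the code added to key_gen_common().

/*
 * Illustrative only -- not WiredTiger code. Builds keys the way the new
 * key_gen_common() does for row stores: a bucket character chosen by
 * halving the key number, a run of filler bytes, then the zero-padded
 * row number and the "00" suffix used for non-insert operations.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PREFIX_LEN 20 /* Assumed fixed here; format randomizes 15-80. */

static void
make_key(char *buf, size_t bufsize, uint64_t keyno)
{
    const char *bucket;
    uint64_t n;
    char *p;

    p = buf;

    /* Split the key space into power-of-2 buckets, capped at 'z'. */
    bucket = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    for (n = keyno; n > 0; n >>= 1) {
        if (*bucket == 'z')
            break;
        ++bucket;
    }
    p[0] = *bucket;
    memset(p + 1, 'C', PREFIX_LEN - 1); /* Common filler bytes. */
    p += PREFIX_LEN;

    /* 10-digit zero-padded row number, '.', "00" suffix, '/' terminator. */
    snprintf(p, bufsize - PREFIX_LEN, "%010" PRIu64 ".00/", keyno);
}

int
main(void)
{
    char buf[64];
    uint64_t keyno;

    /* Print a few sample keys; the bucket character grows with the row number. */
    for (keyno = 1; keyno <= 1000000; keyno *= 10) {
        make_key(buf, sizeof(buf), keyno);
        printf("%8" PRIu64 " -> %s\n", keyno, buf);
    }
    return (0);
}

The point of the layout is that the bucket character is non-decreasing in the row number, so adding the prefix never reorders keys, while neighbouring rows still share a long common prefix, which is what the prefix-compression and cursor-positioning changes in this commit are exercising.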