diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-03-02 13:34:29 +1100 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-03-02 13:40:59 +1100 |
commit | 086c21e2b4c87952273fde78ab8fb18f18e8fdc6 (patch) | |
tree | 23f231369745a7d2e724f6db1ad4473a85db36aa /src/third_party | |
parent | e73cfbddea6d3f5c1499f93878679a768aa97dff (diff) | |
download | mongo-086c21e2b4c87952273fde78ab8fb18f18e8fdc6.tar.gz |
Import wiredtiger: d6659de8d742b9562d08c1ba5138be881f8e24fa from branch mongodb-3.4
ref: 8d23249433..d6659de8d7
for: 3.4.3
SERVER-16796 Increase logging activity for journal recovery operations
WT-2402 Misaligned structure accesses lead to undefined behavior
WT-2771 Add a statistic to track per-btree dirty cache usage
WT-2790 Fix a text case false positive in test_sweep01
WT-2833 improvement: add projections to wt dump utility
WT-2898 Improve performance of eviction-heavy workloads by dynamically controlling the number of eviction threads
WT-2909 Create automatable test verifying checkpoint integrity after errors
WT-2994 Create documentation describing page sizes and relationships
WT-3080 Python test suite: add timestamp or elapsed time for tests
WT-3082 Python test suite: shorten default run to avoid pull request timeouts.
WT-3083 Fix a bug in wtperf config dump
WT-3086 Add transaction state information to cache stuck diagnostic information
WT-3088 bug: Don't evict a page with refs visible to readers after a split
WT-3091 Add stats to test_perf0001
WT-3092 Quiet a warning from autogen.sh
WT-3093 Padding the WT_RWLOCK structure grew the WT_PAGE structure.
WT-3097 Race on reconfigure or shutdown can lead to waiting for statistics log server
WT-3099 lint: static function declarations, non-text characters in documentation
WT-3100 test bug: format is weighted to delete, insert, then write operations.
WT-3104 Fix wtperf configs for eviction tests
WT-3105 Fix a deadlock caused by allocating eviction thread sessions dynamically
WT-3106 Add truncate support to command line wt utility
WT-3108 Also dump disk page size as part of metadata information
WT-3109 wording fix in transaction doc
WT-3110 Add more test cases for the WT command line utility
WT-3111 util_create() doesnt free memory assigned to "uri"
WT-3112 Handle list lock statistic not incremented in eviction server
WT-3113 Add a verbose mode to dump the cache when eviction is stuck
WT-3114 Avoid archiving log files immediately after recovery
WT-3115 Change the dhandle lock to a read/write lock
WT-3116 Python style testing in s_all may not execute correctly
WT-3118 Protect random-abort test against unexpectedly slow child start
WT-3120 Fix ordering problem in connection_close for filesystem loaded in an extension
WT-3121 In test suite create standard way to load extensions
WT-3126 Fix a bug in dist/s_all script
WT-3127 bug: CPU yield calls don't necessarily imply memory barriers
WT-3128 Fix a bug where wt printlog returns operation-not-supported if it doesn't find any log files
WT-3130 Proposal to change initialization of custom filesystem
WT-3134 Coverity scan reports 1368529 and 1368528
WT-3135 search_near() for index with custom collator
WT-3137 Hang in __log_slot_join/__log_slot_switch_internal
WT-3139 Enhance wtperf to support periodic table scans
WT-3143 Fix Coverity static analysis complaint in test program
WT-3144 bug fix: random cursor returns not-found when descending to an empty page
WT-3148 Improve eviction efficiency with many small trees
WT-3149 Change eviction to start new walks from a random place in the tree
WT-3150 Reduce impact of checkpoints on eviction server
WT-3152 Convert table lock from a spinlock to a read write lock
WT-3156 Assertion in log_write fires after write failure
WT-3157 checkpoint/transaction integrity issue when writes fail.
WT-3159 Incorrect key for index containing multiple variable sized entries
WT-3161 checkpoint hang after write failure injection.
WT-3164 Ensure all relevant btree fields are reset on checkpoint error
WT-3170 Clear the eviction walk point while populating from a tree
WT-3173 Add runtime detection for s390x CRC32 hardware support
WT-3174 Coverity/lint cleanup
WT-3175 New hang in internal page split
WT-3179 test bug: clang sanitizer failure in fail_fs
WT-3180 fault injection tests should only run as "long" tests and should not create core files
WT-3184 Problem duplicating index cursor with custom collator
WT-3186 Fix error path and panic detection in logging loops
WT-3187 Hang on shutdown with a busy cache pool
WT-3188 Fix error handling in logging where fatal errors could lead to a hang
WT-3189 Fix a segfault in the eviction server random positioning
Diffstat (limited to 'src/third_party')
260 files changed, 7718 insertions, 3078 deletions
diff --git a/src/third_party/wiredtiger/NEWS b/src/third_party/wiredtiger/NEWS index bdc84ed6ef5..268949b119f 100644 --- a/src/third_party/wiredtiger/NEWS +++ b/src/third_party/wiredtiger/NEWS @@ -1,3 +1,53 @@ +WiredTiger release 2.9.1, 2016-12-22 +------------------------------------ + +New features and API changes; refer to the API documentation for full details: + +* SERVER-26545 Remove fixed-size limitation on WiredTiger hazard pointers. See the upgrading documentation for details +* WT-283 Add a new WT_SESSION::alter method that can be used to reconfigure table metadata +* WT-2670 Change the default file system access pattern advice for data files from random to no advice. Add access_pattern_hint configuration option for WT_SESSION::create API that can be used to advise the file system of expected access semantics. See the upgrading documentation for details. +* WT-3034 Add support for including updates when reading from named snapshots + +Significant changes and bug fixes: + +* WT-2960 Reduce likelihood of using the lookaside file, especially when inserting multi-megabyte values +* WT-3056 Allow projected table and join cursors to use primary keys +* WT-3070 Fix a bug in search_near on indexes + +Other noteworthy changes since the previous release: + +* WT-2336 Add a test validating schema operations via file system call monitoring +* WT-2402 Pad structures to avoid cache line sharing +* WT-2771 Add a statistic to track per-btree dirty cache usage +* WT-2833 Add projections to wt dump utility +* WT-2969 Possible snapshot corruption during compaction +* WT-3014 Add GCC/clang support for ELF symbol visibility +* WT-3021 Fixes for java log example, raw mode in java, and raw mode in log cursors +* WT-3025 Fix error path in log_force_sync +* WT-3028 Don't check for blocked eviction with in-memory workloads +* WT-3030 Fix a race between scans and splits reading the index hint +* WT-3037 Clean up some log slot comments +* WT-3048 WiredTiger maximum size warning uses the wrong format +* WT-3051 Remove external __wt_hex symbol +* WT-3052 Improve search if index hint is wrong +* WT-3053 Make Python use internal memory allocation again +* WT-3054 Make a PackOutputStream constructor that is compatible with the previous interface. +* WT-3055 When an AsyncOp is created, cache the whether the cursor is "raw" +* WT-3057 WiredTiger hazard pointers should use the WT_REF, not the WT_PAGE +* WT-3061 Syscall testing should support pwrite64 on Linux +* WT-3064 Minor tree cleanups: .gitignore, NEWS misspelling +* WT-3066 Minor code cleanups +* WT-3068 Copy artifacts of test runs in wtperf_run script +* WT-3068 Have Jenkins include specific files for copy rather than exclude +* WT-3069 Fix LevelDB APIs build failures +* WT-3071 Fixed sign-conversion compiler errors in Java and Python SWIG code +* WT-3075 Document and enforce that WiredTiger now depends on Python 2.7 +* WT-3078 Test reconfiguration hang in the statlog server +* WT-3080 Python test suite: add elapsed time for tests +* WT-3082 Python test suite: shorten default run to avoid timeouts +* WT-3084 Fix Coverity resource leak complaint +* WT-3091 Add stats to test_perf001 test, so we can investigate what happened when it failed + WiredTiger release 2.9.0, 2016-09-06 ------------------------------------ diff --git a/src/third_party/wiredtiger/README b/src/third_party/wiredtiger/README index 55e9058826a..f7edae2835d 100644 --- a/src/third_party/wiredtiger/README +++ b/src/third_party/wiredtiger/README @@ -1,6 +1,6 @@ -WiredTiger 2.9.1: (December 7, 2016) +WiredTiger 2.9.2: (December 23, 2016) -This is version 2.9.1 of WiredTiger. +This is version 2.9.2 of WiredTiger. WiredTiger release packages and documentation can be found at: @@ -8,7 +8,7 @@ WiredTiger release packages and documentation can be found at: The documentation for this specific release can be found at: - http://source.wiredtiger.com/2.9.1/index.html + http://source.wiredtiger.com/2.9.2/index.html The WiredTiger source code can be found at: diff --git a/src/third_party/wiredtiger/RELEASE_INFO b/src/third_party/wiredtiger/RELEASE_INFO index 502b17188ce..b7145aa2cb3 100644 --- a/src/third_party/wiredtiger/RELEASE_INFO +++ b/src/third_party/wiredtiger/RELEASE_INFO @@ -1,6 +1,6 @@ WIREDTIGER_VERSION_MAJOR=2 WIREDTIGER_VERSION_MINOR=9 -WIREDTIGER_VERSION_PATCH=1 +WIREDTIGER_VERSION_PATCH=2 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH" WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"` diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct index df7a66238e8..e9e72630b11 100644 --- a/src/third_party/wiredtiger/SConstruct +++ b/src/third_party/wiredtiger/SConstruct @@ -313,6 +313,7 @@ wtbin = env.Program("wt", [ "src/utilities/util_rename.c", "src/utilities/util_salvage.c", "src/utilities/util_stat.c", + "src/utilities/util_truncate.c", "src/utilities/util_upgrade.c", "src/utilities/util_verbose.c", "src/utilities/util_verify.c", diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c index 5b14a4cdf68..9eea99eeec4 100644 --- a/src/third_party/wiredtiger/bench/wtperf/config.c +++ b/src/third_party/wiredtiger/bench/wtperf/config.c @@ -215,6 +215,7 @@ config_threads(WTPERF *wtperf, const char *config, size_t len) return (EINVAL); } workp = &wtperf->workload[wtperf->workload_cnt++]; + workp->table_index = INT32_MAX; while ((ret = scan->next(scan, &k, &v)) == 0) { if (STRING_MATCH("count", k.str, k.len)) { @@ -233,12 +234,28 @@ config_threads(WTPERF *wtperf, const char *config, size_t len) goto err; continue; } + if (STRING_MATCH("pause", k.str, k.len)) { + if ((workp->pause = v.val) < 0) + goto err; + continue; + } if (STRING_MATCH("read", k.str, k.len) || STRING_MATCH("reads", k.str, k.len)) { if ((workp->read = v.val) < 0) goto err; continue; } + if (STRING_MATCH("read_range", k.str, k.len)) { + if ((workp->read_range = v.val) < 0) + goto err; + continue; + } + if (STRING_MATCH("table", k.str, k.len)) { + if (v.val <= 0) + goto err; + workp->table_index = (int32_t)v.val - 1; + continue; + } if (STRING_MATCH("throttle", k.str, k.len)) { workp->throttle = (uint64_t)v.val; continue; @@ -622,17 +639,9 @@ config_opt_str(WTPERF *wtperf, const char *optstr) return (ret); } - /* - * Append the current line to our copy of the config. The config is - * stored in the order it is processed, so added options will be after - * any parsed from the original config. We allocate len + 1 to allow for - * a null byte to be added. - */ - config_line = dcalloc(sizeof(CONFIG_QUEUE_ENTRY), 1); - config_line->string = dstrdup(optstr); - TAILQ_INSERT_TAIL(&opts->config_head, config_line, q); - while (ret == 0) { + size_t pos; + if ((ret = scan->next(scan, &k, &v)) != 0) { /* Any parse error has already been reported. */ if (ret == WT_NOTFOUND) @@ -640,6 +649,46 @@ config_opt_str(WTPERF *wtperf, const char *optstr) break; } ret = config_opt(wtperf, &k, &v); + + /* + * Append the key-value pair to our copy of the config. + * The config is stored in the order it is processed, so added + * options will be after any parsed from the original config. + */ + config_line = dcalloc(sizeof(CONFIG_QUEUE_ENTRY), 1); + /* + * If key or value is a string, consider extra space for the + * quotes. Add 2 to the required space for '=' and the ending + * null character in "key=value". + */ + config_line->string = dcalloc( + k.len + (k.type == WT_CONFIG_ITEM_STRING ? 2 : 0) + + v.len + (v.type == WT_CONFIG_ITEM_STRING ? 2 : 0) + 2, 1); + pos = 0; + if (k.type == WT_CONFIG_ITEM_STRING) { + config_line->string[pos] = '"'; + pos++; + } + strncpy(config_line->string + pos, k.str, k.len); + pos += k.len; + if (k.type == WT_CONFIG_ITEM_STRING) { + config_line->string[pos] = '"'; + pos++; + } + config_line->string[pos] = '='; + pos++; + if (v.type == WT_CONFIG_ITEM_STRING) { + config_line->string[pos] = '"'; + pos++; + } + strncpy(config_line->string + pos, v.str, v.len); + pos += v.len; + if (v.type == WT_CONFIG_ITEM_STRING) { + config_line->string[pos] = '"'; + pos++; + } + config_line->string[pos] = '\0'; + TAILQ_INSERT_TAIL(&opts->config_head, config_line, q); } if ((t_ret = scan->close(scan)) != 0) { lprintf(wtperf, ret, 0, "Error in config_scan_end"); @@ -728,16 +777,33 @@ config_sanity(WTPERF *wtperf) opts->value_sz_min = opts->value_sz; } - if (opts->readonly && wtperf->workload != NULL) + if (wtperf->workload != NULL) for (i = 0, workp = wtperf->workload; - i < wtperf->workload_cnt; ++i, ++workp) - if (workp->insert != 0 || workp->update != 0 || - workp->truncate != 0) { + i < wtperf->workload_cnt; ++i, ++workp) { + if (opts->readonly && + (workp->insert != 0 || workp->update != 0 || + workp->truncate != 0)) { fprintf(stderr, "Invalid workload: insert, update or " "truncate specified with readonly\n"); return (EINVAL); } + if (workp->insert != 0 && + workp->table_index != INT32_MAX) { + fprintf(stderr, + "Invalid workload: Cannot insert into " + "specific table only\n"); + return (EINVAL); + } + if (workp->table_index != INT32_MAX && + workp->table_index >= (int32_t)opts->table_count) { + fprintf(stderr, + "Workload table index %" PRId32 + " is larger than table count %" PRId32, + workp->table_index, opts->table_count); + return (EINVAL); + } + } return (0); } @@ -754,8 +820,11 @@ config_consolidate(CONFIG_OPTS *opts) /* * This loop iterates over the config queue and for each entry checks if - * a later queue entry has the same key. If there's a match, the current - * queue entry is removed and we continue. + * a later queue entry has the same key. If there's a match, and key is + * "conn_config" or "table_config", the later queue entry is replaced + * with a concatenated entry of the two queue entries, the current queue + * entry is removed. For any other key, if there is a match, the current + * queue entry is removed. */ conf_line = TAILQ_FIRST(&opts->config_head); while (conf_line != NULL) { @@ -771,6 +840,34 @@ config_consolidate(CONFIG_OPTS *opts) if (strncmp(conf_line->string, test_line->string, (size_t)((string_key - conf_line->string) + 1)) == 0) { + if ((strncmp("conn_config=", conf_line->string, + (size_t)((string_key - conf_line->string) + + 1)) == 0) || + (strncmp("table_config=", conf_line->string, + (size_t)((string_key - conf_line->string) + + 1)) == 0)) { + char *concat_str, *val_pointer; + + /* + * To concatenate the two config + * strings, copy the first string to a + * new one, replace the ending '"' with + * a ',' and then concatenate the second + * string's value after its starting '"' + */ + val_pointer = + strchr(test_line->string, '=') + 2; + concat_str = + dmalloc(strlen(conf_line->string) + + strlen(val_pointer) + 1); + strcpy(concat_str, conf_line->string); + concat_str[strlen(concat_str) - 1] = + ','; + strcat(concat_str, val_pointer); + free(test_line->string); + test_line->string = concat_str; + } + TAILQ_REMOVE(&opts->config_head, conf_line, q); free(conf_line->string); free(conf_line); diff --git a/src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c b/src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c index 13fa55e86f5..bb44cfbde59 100644 --- a/src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c +++ b/src/third_party/wiredtiger/bench/wtperf/idle_table_cycle.c @@ -120,6 +120,7 @@ cycle_idle_tables(void *arg) return (NULL); start = stop; +#if 1 /* * Drop the table. Keep retrying on EBUSY failure - it is an * expected return when checkpoints are happening. @@ -136,6 +137,7 @@ cycle_idle_tables(void *arg) } if (check_timing(wtperf, "drop", start, &stop) != 0) return (NULL); +#endif } return (NULL); diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-50r50u.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-50r50u.wtperf index 536127f0dd8..4d2a70f1107 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-50r50u.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-50r50u.wtperf @@ -5,7 +5,7 @@ # # Set cache to half of memory of AWS perf instance. Enable logging and # checkpoints. Collect wiredtiger stats for ftdc. -conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" +conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=8)" create=false compression="snappy" sess_config="isolation=snapshot" diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf index d6218c44af0..6645df835df 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf @@ -5,7 +5,7 @@ # # Set cache to half of memory of AWS perf instance. Enable logging and # checkpoints. Collect wiredtiger stats for ftdc. -conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" +conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=8)" create=false compression="snappy" # close_conn as false allows this test to close/finish faster, but if running diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-populate.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-populate.wtperf index f9aed094aa1..ab7b17ca683 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-populate.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-populate.wtperf @@ -9,7 +9,7 @@ # # This generates about 80 Gb of uncompressed data. But it should compress # well and be small on disk. -conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" +conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=8)" compact=true compression="snappy" sess_config="isolation=snapshot" diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-rdonly.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-rdonly.wtperf index 2c9540ff589..e8958d20e2c 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-rdonly.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-rdonly.wtperf @@ -5,7 +5,7 @@ # # Set cache to half of memory of AWS perf instance. Enable logging and # checkpoints. Collect wiredtiger stats for ftdc. -conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" +conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=8)" create=false compression="snappy" sess_config="isolation=snapshot" diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/checkpoint-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/checkpoint-stress.wtperf index bbd3a3ba5ed..5daa276e622 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/checkpoint-stress.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/checkpoint-stress.wtperf @@ -1,6 +1,6 @@ # A stress configuration to create long running checkpoints while doing a lot # of updates. -conn_config="cache_size=16GB,eviction=(threads_max=4),log=(enabled=false)" +conn_config="cache_size=16GB,eviction=(threads_max=8),log=(enabled=false)" table_config="leaf_page_max=32k,internal_page_max=16k,allocation_size=4k,split_pct=90,type=file" # Enough data to fill the cache. 150 million 1k records results in two ~11GB # tables diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-1.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-1.wtperf index 24da4dd7902..741101d083f 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-1.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-1.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M" +conn_config="cache_size=50M,eviction=(threads_max=1)" table_config="type=file" icount=10000000 report_interval=5 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-readonly.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-readonly.wtperf index 25599fadd8d..972bc371f2d 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-readonly.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-readonly.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M,eviction=(threads_max=4),mmap=false" +conn_config="cache_size=50M,eviction=(threads_max=8),mmap=false" table_config="type=file" icount=10000000 report_interval=5 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress-multi.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress-multi.wtperf index a5a29f66fa0..5a2cad6d78e 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress-multi.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress-multi.wtperf @@ -1,4 +1,4 @@ -conn_config="cache_size=1G,eviction=(threads_max=4),session_max=2000" +conn_config="cache_size=1G,eviction=(threads_max=8),session_max=2000" table_config="type=file" table_count=100 close_conn=false diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress.wtperf index 740fb88c050..96e3f01b325 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M,eviction=(threads_max=4)" +conn_config="cache_size=50M,eviction=(threads_max=8)" table_config="type=file" icount=10000000 report_interval=5 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree.wtperf index e7d967e5c63..3810e6a8294 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M,eviction=(threads_max=4)" +conn_config="cache_size=50M,eviction=(threads_max=8)" table_config="type=file" icount=10000000 report_interval=5 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm-1.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm-1.wtperf index ad885d98eb7..641a85dc889 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm-1.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm-1.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict lsm configuration -conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6)" +conn_config="cache_size=50M,eviction=(threads_max=1),lsm_manager=(worker_thread_max=6)" table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB" compact=true icount=10000000 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm-readonly.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm-readonly.wtperf index 661b8e21924..470dca695dd 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm-readonly.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm-readonly.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict lsm configuration -conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=4)" +conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=8)" table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB" compact=true icount=10000000 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm.wtperf index b872d429046..a0f2a78d013 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict lsm configuration -conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=4)" +conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=8)" table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB" compact=true icount=10000000 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/log.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/log.wtperf index 6cf50dfb5a5..4379ba22373 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/log.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/log.wtperf @@ -16,7 +16,7 @@ # - Config + "-C "checkpoint=(wait=0)": no checkpoints # - Config + "-C "log=(enabled,prealloc=false,file_max=1M)": no pre-allocation # -conn_config="cache_size=5G,log=(enabled=true),checkpoint=(log_size=500M),eviction=(threads_max=4)" +conn_config="cache_size=5G,log=(enabled=true),checkpoint=(log_size=500M),eviction=(threads_max=8)" table_config="type=file" icount=1000000 report_interval=5 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-secondary-apply.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-secondary-apply.wtperf index f9e41184f95..58bd1a76b97 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-secondary-apply.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-secondary-apply.wtperf @@ -1,5 +1,5 @@ # Simulate the MongoDB oplog apply threads on a secondary. -conn_config="cache_size=10GB,session_max=1000,eviction=(threads_min=4,threads_max=4),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60),statistics=(fast),statistics_log=(json,wait=1)" +conn_config="cache_size=10GB,session_max=1000,eviction=(threads_min=4,threads_max=8),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60),statistics=(fast),statistics_log=(json,wait=1)" table_config="allocation_size=4k,memory_page_max=5MB,prefix_compression=false,split_pct=75,leaf_page_max=32k,internal_page_max=16k,type=file" # Spread the workload out over several tables. table_count=4 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-read-heavy-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-read-heavy-stress.wtperf index d7b27f8fda4..f07e6c80b39 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-read-heavy-stress.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-read-heavy-stress.wtperf @@ -2,7 +2,7 @@ # up by dividing the workload across a lot of threads. This needs to be # tuned to the particular machine so the workload is close to capacity in the # steady state, but not overwhelming. -conn_config="cache_size=20GB,session_max=1000,eviction=(threads_min=4,threads_max=4),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60),statistics=(fast),statistics_log=(json,wait=1)" +conn_config="cache_size=20GB,session_max=1000,eviction=(threads_min=4,threads_max=8),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60),statistics=(fast),statistics_log=(json,wait=1)" table_config="allocation_size=4k,memory_page_max=10MB,prefix_compression=false,split_pct=90,leaf_page_max=32k,internal_page_max=16k,type=file" # Divide original icount by database_count. table_count=8 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-stress.wtperf index b10b08f6035..bee1f431043 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-stress.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-stress.wtperf @@ -1,7 +1,7 @@ # wtperf options file: multi-database configuration attempting to # trigger slow operations by overloading CPU and disk. # References Jira WT-2131 -conn_config="cache_size=2GB,eviction=(threads_min=2,threads_max=2),log=(enabled=false),direct_io=(data,checkpoint),buffer_alignment=4096,checkpoint_sync=true,checkpoint=(wait=60)" +conn_config="cache_size=2GB,eviction=(threads_min=2,threads_max=8),log=(enabled=false),direct_io=(data,checkpoint),buffer_alignment=4096,checkpoint_sync=true,checkpoint=(wait=60)" table_config="allocation_size=4k,prefix_compression=false,split_pct=75,leaf_page_max=4k,internal_page_max=16k,leaf_item_max=1433,internal_item_max=3100,type=file" # Divide original icount by database_count. database_count=5 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-zipfian-populate.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-zipfian-populate.wtperf index ddd9c055eac..1fdba049779 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-zipfian-populate.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-zipfian-populate.wtperf @@ -1,5 +1,5 @@ # Create a set of tables with uneven distribution of data -conn_config="cache_size=1G,eviction=(threads_max=4),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" +conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" table_config="type=file" table_count=100 icount=0 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-zipfian-workload.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-zipfian-workload.wtperf index 380350c88c8..dfb3306a7a5 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-zipfian-workload.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-zipfian-workload.wtperf @@ -1,5 +1,5 @@ # Read from a set of tables with uneven distribution of data -conn_config="cache_size=1G,eviction=(threads_max=4),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" +conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" table_config="type=file" table_count=100 icount=0 diff --git a/src/third_party/wiredtiger/bench/wtperf/stress/btree-split-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/stress/btree-split-stress.wtperf index deb8c70d12f..eb6ca1cfddc 100644 --- a/src/third_party/wiredtiger/bench/wtperf/stress/btree-split-stress.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/stress/btree-split-stress.wtperf @@ -1,4 +1,4 @@ -conn_config="cache_size=2GB,statistics=[fast,clear],statistics_log=(wait=10),eviction=(threads_max=4,threads_min=4)" +conn_config="cache_size=2GB,statistics=[fast,clear],statistics_log=(wait=10),eviction=(threads_max=8,threads_min=4)" table_config="type=file,leaf_page_max=8k,internal_page_max=8k,memory_page_max=2MB,split_deepen_min_child=250" icount=200000 report_interval=5 @@ -6,5 +6,4 @@ run_time=300 reopen_connection=false populate_threads=2 value_sz=256 -read_range=100 -threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1)) +threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1,read_range=100)) diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index 8c7f0053388..7f5e5ad3373 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -432,19 +432,17 @@ err: wtperf->error = wtperf->stop = true; * search do them. Ensuring the keys we see are always in order. */ static int -do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) +do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor, int64_t read_range) { - CONFIG_OPTS *opts; - size_t range; uint64_t next_val, prev_val; + int64_t range; char *range_key_buf; char buf[512]; int ret; - opts = wtperf->opts; ret = 0; - if (opts->read_range == 0) + if (read_range == 0) return (0); memset(&buf[0], 0, 512 * sizeof(char)); @@ -454,7 +452,7 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) testutil_check(cursor->get_key(cursor, &range_key_buf)); extract_key(range_key_buf, &next_val); - for (range = 0; range < opts->read_range; ++range) { + for (range = 0; range < read_range; ++range) { prev_val = next_val; ret = cursor->next(cursor); /* We are done if we reach the end. */ @@ -475,12 +473,56 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) return (0); } +/* pre_load_data -- + * Pull everything into cache before starting the workload phase. + */ +static int +pre_load_data(WTPERF *wtperf) +{ + CONFIG_OPTS *opts; + WT_CONNECTION *conn; + WT_CURSOR *cursor; + WT_SESSION *session; + char *key; + int ret; + size_t i; + + opts = wtperf->opts; + conn = wtperf->conn; + + if ((ret = conn->open_session( + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session"); + goto err; + } + for (i = 0; i < opts->table_count; i++) { + if ((ret = session->open_cursor(session, + wtperf->uris[i], NULL, NULL, &cursor)) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: %s", + wtperf->uris[i]); + goto err; + } + while (cursor->next(cursor) == 0) + if ((ret = cursor->get_key(cursor, &key)) != 0) + goto err; + if ((ret = cursor->close(cursor)) != 0) + goto err; + } + if ((ret = session->close(session, NULL)) != 0) + goto err; + if (ret != 0) +err: lprintf(wtperf, ret, 0, "Pre-workload traverse error"); + return (ret); +} + static void * worker(void *arg) { struct timespec start, stop; CONFIG_OPTS *opts; TRACK *trk; + WORKLOAD *workload; WTPERF *wtperf; WTPERF_THREAD *thread; WT_CONNECTION *conn; @@ -495,13 +537,14 @@ worker(void *arg) char buf[512]; thread = (WTPERF_THREAD *)arg; + workload = thread->workload; wtperf = thread->wtperf; opts = wtperf->opts; conn = wtperf->conn; cursors = NULL; - log_table_cursor = NULL; /* -Wconditional-initialized */ + cursor = log_table_cursor = NULL; /* -Wconditional-initialized */ ops = 0; - ops_per_txn = thread->workload->ops_per_txn; + ops_per_txn = workload->ops_per_txn; session = NULL; trk = NULL; @@ -510,7 +553,6 @@ worker(void *arg) lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session"); goto err; } - cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); for (i = 0; i < opts->table_count_idle; i++) { snprintf(buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i); if ((ret = session->open_cursor( @@ -525,14 +567,34 @@ worker(void *arg) goto err; } } - for (i = 0; i < opts->table_count; i++) { + if (workload->table_index != INT32_MAX) { if ((ret = session->open_cursor(session, - wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) { + wtperf->uris[workload->table_index], + NULL, NULL, &cursor)) != 0) { lprintf(wtperf, ret, 0, "worker: WT_SESSION.open_cursor: %s", - wtperf->uris[i]); + wtperf->uris[workload->table_index]); goto err; } + if ((ret = session->open_cursor(session, + wtperf->uris[workload->table_index], + NULL, "next_random=true", &thread->rand_cursor)) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: random %s", + wtperf->uris[workload->table_index]); + goto err; + } + } else { + cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); + for (i = 0; i < opts->table_count; i++) { + if ((ret = session->open_cursor(session, + wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: %s", + wtperf->uris[i]); + goto err; + } + } } if (opts->log_like_table && (ret = session->open_cursor(session, wtperf->log_table_uri, NULL, NULL, &log_table_cursor)) != 0) { @@ -543,19 +605,19 @@ worker(void *arg) } /* Setup the timer for throttling. */ - if (thread->workload->throttle != 0) + if (workload->throttle != 0) setup_throttle(thread); /* Setup for truncate */ - if (thread->workload->truncate != 0) + if (workload->truncate != 0) if ((ret = setup_truncate(wtperf, thread, session)) != 0) goto err; key_buf = thread->key_buf; value_buf = thread->value_buf; - op = thread->workload->ops; - op_end = op + sizeof(thread->workload->ops); + op = workload->ops; + op_end = op + sizeof(workload->ops); if ((ops_per_txn != 0 || opts->log_like_table) && (ret = session->begin_transaction(session, NULL)) != 0) { @@ -564,6 +626,8 @@ worker(void *arg) } while (!wtperf->stop) { + if (workload->pause != 0) + (void)sleep((unsigned int)workload->pause); /* * Generate the next key and setup operation specific * statistics tracking objects. @@ -603,10 +667,12 @@ worker(void *arg) generate_key(opts, key_buf, next_val); - /* - * Spread the data out around the multiple databases. - */ - cursor = cursors[map_key_to_table(wtperf->opts, next_val)]; + if (workload->table_index == INT32_MAX) + /* + * Spread the data out around the multiple databases. + */ + cursor = cursors[ + map_key_to_table(wtperf->opts, next_val)]; /* * Skip the first time we do an operation, when trk->ops @@ -642,7 +708,8 @@ worker(void *arg) * for several operations, confirming that the * next key is in the correct order. */ - ret = do_range_reads(wtperf, cursor); + ret = do_range_reads(wtperf, + cursor, workload->read_range); } if (ret == 0 || ret == WT_NOTFOUND) @@ -689,7 +756,7 @@ worker(void *arg) */ strncpy(value_buf, value, opts->value_sz_max - 1); - if (thread->workload->update_delta != 0) + if (workload->update_delta != 0) update_value_delta(thread); if (value_buf[0] == 'a') value_buf[0] = 'b'; @@ -806,7 +873,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { /* Schedule the next operation */ if (++op == op_end) - op = thread->workload->ops; + op = workload->ops; /* * Decrement throttle ops and check if we should sleep @@ -843,7 +910,7 @@ run_mix_schedule_op(WORKLOAD *workp, int op, int64_t op_cnt) uint8_t *p, *end; /* Jump around the array to roughly spread out the operations. */ - jump = 100 / op_cnt; + jump = (int)(100 / op_cnt); /* * Find a read operation and replace it with another operation. This @@ -884,17 +951,6 @@ run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp) opts = wtperf->opts; - /* Confirm reads, inserts, truncates and updates cannot all be zero. */ - if (workp->insert == 0 && workp->read == 0 && - workp->truncate == 0 && workp->update == 0) { - lprintf(wtperf, EINVAL, 0, "no operations scheduled"); - return (EINVAL); - } - - /* - * Handle truncate first - it's a special case that can't be used in - * a mixed workload. - */ if (workp->truncate != 0) { if (workp->insert != 0 || workp->read != 0 || workp->update != 0) { @@ -906,6 +962,12 @@ run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp) return (0); } + /* Confirm reads, inserts and updates cannot all be zero. */ + if (workp->insert == 0 && workp->read == 0 && workp->update == 0) { + lprintf(wtperf, EINVAL, 0, "no operations scheduled"); + return (EINVAL); + } + /* * Check for a simple case where the thread is only doing insert or * update operations (because the default operation for a @@ -2244,6 +2306,8 @@ start_run(WTPERF *wtperf) opts->checkpoint_threads, checkpoint_worker) != 0) goto err; } + if (opts->pre_load_data && (ret = pre_load_data(wtperf)) != 0) + goto err; /* Execute the workload. */ if ((ret = execute_workload(wtperf)) != 0) goto err; @@ -2361,11 +2425,11 @@ main(int argc, char *argv[]) { CONFIG_OPTS *opts; WTPERF *wtperf, _wtperf; - size_t req_len, sreq_len; + size_t pos, req_len, sreq_len; bool monitor_set; int ch, ret; const char *cmdflags = "C:h:m:O:o:T:"; - const char *config_opts; + const char *append_comma, *config_opts; char *cc_buf, *path, *sess_cfg, *tc_buf, *user_cconfig, *user_tconfig; /* The first WTPERF structure (from which all others are derived). */ @@ -2502,53 +2566,101 @@ main(int argc, char *argv[]) __wt_stream_set_line_buffer(stdout); /* Concatenate non-default configuration strings. */ - if (opts->verbose > 1 || user_cconfig != NULL || - opts->session_count_idle > 0 || wtperf->compress_ext != NULL || - wtperf->async_config != NULL) { - req_len = strlen(debug_cconfig) + 20; - if (user_cconfig != NULL) - req_len += strlen(user_cconfig); - if (wtperf->async_config != NULL) - req_len += strlen(wtperf->async_config); - if (wtperf->compress_ext != NULL) - req_len += strlen(wtperf->compress_ext); + if ((opts->verbose > 1 && strlen(debug_cconfig) != 0) || + user_cconfig != NULL || opts->session_count_idle > 0 || + wtperf->compress_ext != NULL || wtperf->async_config != NULL) { + req_len = 20; + req_len += wtperf->async_config != NULL ? + strlen(wtperf->async_config) : 0; + req_len += wtperf->compress_ext != NULL ? + strlen(wtperf->compress_ext) : 0; if (opts->session_count_idle > 0) { - sreq_len = strlen(",session_max=") + 6; + sreq_len = strlen("session_max=") + 6; req_len += sreq_len; sess_cfg = dmalloc(sreq_len); snprintf(sess_cfg, sreq_len, - ",session_max=%" PRIu32, + "session_max=%" PRIu32, opts->session_count_idle + wtperf->workers_cnt + opts->populate_threads + 10); } + req_len += user_cconfig != NULL ? strlen(user_cconfig) : 0; + req_len += debug_cconfig != NULL ? strlen(debug_cconfig) : 0; cc_buf = dmalloc(req_len); - snprintf(cc_buf, req_len, "%s,%s,%s,%s,%s", - wtperf->async_config ? wtperf->async_config : "", - wtperf->compress_ext ? wtperf->compress_ext : "", - opts->verbose > 1 ? debug_cconfig : "", - sess_cfg != NULL ? sess_cfg : "", - user_cconfig != NULL ? user_cconfig : ""); - if (strlen(cc_buf) && (ret = + + pos = 0; + append_comma = ""; + if (wtperf->async_config != NULL && + strlen(wtperf->async_config) != 0) { + pos += (size_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma, wtperf->async_config); + append_comma = ","; + } + if (wtperf->compress_ext != NULL && + strlen(wtperf->compress_ext) != 0) { + pos += (size_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma, wtperf->compress_ext); + append_comma = ","; + } + if (sess_cfg != NULL && strlen(sess_cfg) != 0) { + pos += (size_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma, sess_cfg); + append_comma = ","; + } + if (user_cconfig != NULL && strlen(user_cconfig) != 0) { + pos += (size_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma, user_cconfig); + append_comma = ","; + } + if (opts->verbose > 1 && strlen(debug_cconfig) != 0) + pos += (size_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma, debug_cconfig); + + if (strlen(cc_buf) != 0 && (ret = config_opt_name_value(wtperf, "conn_config", cc_buf)) != 0) goto err; } - if (opts->verbose > 1 || opts->index || + if ((opts->verbose > 1 && strlen(debug_tconfig) != 0) || opts->index || user_tconfig != NULL || wtperf->compress_table != NULL) { - req_len = strlen(debug_tconfig) + 20; - if (user_tconfig != NULL) - req_len += strlen(user_tconfig); - if (wtperf->compress_table != NULL) - req_len += strlen(wtperf->compress_table); - if (opts->index) - req_len += strlen(INDEX_COL_NAMES); + req_len = 20; + req_len += wtperf->compress_table != NULL ? + strlen(wtperf->compress_table) : 0; + req_len += opts->index ? strlen(INDEX_COL_NAMES) : 0; + req_len += user_tconfig != NULL ? strlen(user_tconfig) : 0; + req_len += debug_tconfig != NULL ? strlen(debug_tconfig) : 0; tc_buf = dmalloc(req_len); - snprintf(tc_buf, req_len, "%s,%s,%s,%s", - opts->index ? INDEX_COL_NAMES : "", - wtperf->compress_table != NULL ? - wtperf->compress_table : "", - opts->verbose > 1 ? debug_tconfig : "", - user_tconfig ? user_tconfig : ""); - if (strlen(tc_buf) && (ret = + + pos = 0; + append_comma = ""; + if (wtperf->compress_table != NULL && + strlen(wtperf->compress_table) != 0) { + pos += (size_t)snprintf( + tc_buf + pos, req_len - pos, "%s%s", + append_comma, wtperf->compress_table); + append_comma = ","; + } + if (opts->index) { + pos += (size_t)snprintf( + tc_buf + pos, req_len - pos, "%s%s", + append_comma, INDEX_COL_NAMES); + append_comma = ","; + } + if (user_tconfig != NULL && strlen(user_tconfig) != 0) { + pos += (size_t)snprintf( + tc_buf + pos, req_len - pos, "%s%s", + append_comma, user_tconfig); + append_comma = ","; + } + if (opts->verbose > 1 && strlen(debug_tconfig) != 0) + pos += (size_t)snprintf( + tc_buf + pos, req_len - pos, "%s%s", + append_comma, debug_tconfig); + + if (strlen(tc_buf) != 0 && (ret = config_opt_name_value(wtperf, "table_config", tc_buf)) != 0) goto err; } @@ -2779,14 +2891,43 @@ static uint64_t wtperf_rand(WTPERF_THREAD *thread) { CONFIG_OPTS *opts; + WT_CURSOR *rnd_cursor; WTPERF *wtperf; double S1, S2, U; uint64_t rval; + int ret; + char *key_buf; wtperf = thread->wtperf; opts = wtperf->opts; /* + * If we have a random cursor set up then use it. + */ + if ((rnd_cursor = thread->rand_cursor) != NULL) { + if ((ret = rnd_cursor->next(rnd_cursor)) != 0) { + lprintf(wtperf, ret, 0, "worker: rand next failed"); + /* 0 is outside the expected range. */ + return (0); + } + if ((ret = rnd_cursor->get_key(rnd_cursor, &key_buf)) != 0) { + lprintf(wtperf, ret, 0, + "worker: rand next key retrieval"); + return (0); + } + /* + * Resetting the cursor is not fatal. We still return the + * value we retrieved above. We do it so that we don't + * leave a cursor positioned. + */ + if ((ret = rnd_cursor->reset(rnd_cursor)) != 0) + lprintf(wtperf, ret, 0, + "worker: rand cursor reset failed"); + extract_key(key_buf, &rval); + return (rval); + } + + /* * Use WiredTiger's random number routine: it's lock-free and fairly * good. */ diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h index 81d74e134f6..3efb8ab700e 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h @@ -66,6 +66,9 @@ typedef struct { uint64_t throttle; /* Maximum operations/second */ /* Number of operations per transaction. Zero for autocommit */ int64_t ops_per_txn; + int64_t pause; /* Time between scans */ + int64_t read_range; /* Range of reads */ + int32_t table_index; /* Table to focus ops on */ int64_t truncate; /* Truncate ratio */ uint64_t truncate_pct; /* Truncate Percent */ uint64_t truncate_count; /* Truncate Count */ @@ -225,6 +228,7 @@ typedef struct { struct __wtperf_thread { /* Per-thread structure */ WTPERF *wtperf; /* Enclosing configuration */ + WT_CURSOR *rand_cursor; /* Random key cursor */ WT_RAND_STATE rnd; /* Random number generation state */ diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i index 680eb53a90e..63cef4c28fb 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i @@ -145,12 +145,13 @@ DEF_OPT_AS_UINT32(populate_ops_per_txn, 0, "phase, zero for auto-commit") DEF_OPT_AS_UINT32(populate_threads, 1, "number of populate threads, 1 for bulk load") +DEF_OPT_AS_BOOL(pre_load_data, 0, + "Scan all data prior to starting the workload phase to warm the cache") DEF_OPT_AS_UINT32(random_range, 0, "if non zero choose a value from within this range as the key for " "insert operations") DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value") DEF_OPT_AS_BOOL(range_partition, 0, "partition data by range (vs hash)") -DEF_OPT_AS_UINT32(read_range, 0, "scan a range of keys after each search") DEF_OPT_AS_BOOL(readonly, 0, "reopen the connection between populate and workload phases in readonly " "mode. Requires reopen_connection turned on (default). Requires that " @@ -192,9 +193,10 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' " "'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' " "which would create 2 threads doing nothing but reads and 8 threads " "each doing 50% inserts and 25% reads and updates. Allowed configuration " - "values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', " - "'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are " - "also behavior modifiers, supported modifiers are 'ops_per_txn'") + "values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', " + "'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. " + "There are also behavior modifiers, supported modifiers are " + "'ops_per_txn'") DEF_OPT_AS_CONFIG_STRING(transaction_config, "", "WT_SESSION.begin_transaction configuration string, applied during the " "populate phase when populate_ops_per_txn is nonzero") diff --git a/src/third_party/wiredtiger/build_posix/Make.base b/src/third_party/wiredtiger/build_posix/Make.base index 9354eb4b183..e5228fac885 100644 --- a/src/third_party/wiredtiger/build_posix/Make.base +++ b/src/third_party/wiredtiger/build_posix/Make.base @@ -36,6 +36,7 @@ wt_SOURCES =\ src/utilities/util_rename.c \ src/utilities/util_salvage.c \ src/utilities/util_stat.c \ + src/utilities/util_truncate.c \ src/utilities/util_upgrade.c \ src/utilities/util_verbose.c \ src/utilities/util_verify.c \ diff --git a/src/third_party/wiredtiger/build_posix/Make.subdirs b/src/third_party/wiredtiger/build_posix/Make.subdirs index 01f23dcbbc1..4ecec37ca6c 100644 --- a/src/third_party/wiredtiger/build_posix/Make.subdirs +++ b/src/third_party/wiredtiger/build_posix/Make.subdirs @@ -17,6 +17,7 @@ ext/encryptors/nop ext/encryptors/rotn ext/extractors/csv ext/test/kvs_bdb HAVE_BERKELEY_DB +ext/test/fail_fs . api/leveldb LEVELDB examples/c diff --git a/src/third_party/wiredtiger/build_posix/aclocal/options.m4 b/src/third_party/wiredtiger/build_posix/aclocal/options.m4 index 7043430a6d6..bc4b31dfee3 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/options.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/options.m4 @@ -57,7 +57,7 @@ AH_TEMPLATE( HAVE_CRC32_HARDWARE, [Define to 1 to configure CRC32 hardware support.]) AC_MSG_CHECKING(if --enable-crc32-hardware option specified) AC_ARG_ENABLE(crc32-hardware, - AC_HELP_STRING([--enable-crc32-hardware], + AS_HELP_STRING([--enable-crc32-hardware], [Enable CRC32 hardware support.]), r=$enableval, r=yes) case "$r" in no) wt_cv_enable_crc32_hardware=no;; diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 index ecb45b5e73e..c677ce41192 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 @@ -2,8 +2,8 @@ dnl build by dist/s_version VERSION_MAJOR=2 VERSION_MINOR=9 -VERSION_PATCH=1 -VERSION_STRING='"WiredTiger 2.9.1: (December 7, 2016)"' +VERSION_PATCH=2 +VERSION_STRING='"WiredTiger 2.9.2: (December 23, 2016)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version.m4 index a75ba93e405..29782a22f82 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/version.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/version.m4 @@ -1,2 +1,2 @@ dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version -2.9.1 +2.9.2 diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in index b7c39b5da8b..0fef587b4b8 100644 --- a/src/third_party/wiredtiger/build_posix/configure.ac.in +++ b/src/third_party/wiredtiger/build_posix/configure.ac.in @@ -91,6 +91,9 @@ fi # Linux requires _GNU_SOURCE to be defined AS_CASE([$host_os], [linux*], [AM_CFLAGS="$AM_CFLAGS -D_GNU_SOURCE"]) +# Configure options. +AM_OPTIONS + # If enable-strict is configured, turn on as much error checking as we can for # this compiler. Intended for developers, and only works for gcc/clang, but it # fills a need. @@ -109,9 +112,6 @@ if test "$wt_cv_enable_strict" = "yes"; then AM_CFLAGS="$AM_CFLAGS $wt_cv_strict_warnings" fi -# Configure options. -AM_OPTIONS - # Java and Python APIs if test "$wt_cv_enable_java" = "yes" -o "$wt_cv_enable_python" = "yes"; then # Only a warning, we need to build release packages without SWIG. @@ -133,7 +133,7 @@ if test "$wt_cv_enable_java" = "yes"; then fi if test "$wt_cv_enable_python" = "yes"; then - AM_PATH_PYTHON([2.7]) + AM_PATH_PYTHON([2.6]) if test -n "$with_python_prefix" ; then PYTHON_INSTALL_ARG="-d $with_python_prefix" fi diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 98f9b5a230a..1d669fa7fe0 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -406,7 +406,7 @@ connection_runtime_config = [ Config('eviction', '', r''' eviction configuration options''', type='category', subconfig=[ - Config('threads_max', '1', r''' + Config('threads_max', '8', r''' maximum number of threads WiredTiger will start to help evict pages from cache. The number of threads started will vary depending on the current eviction load. Each eviction worker @@ -524,6 +524,7 @@ connection_runtime_config = [ 'checkpoint', 'compact', 'evict', + 'evict_stuck', 'evictserver', 'fileops', 'handleops', @@ -537,6 +538,7 @@ connection_runtime_config = [ 'rebalance', 'reconcile', 'recovery', + 'recovery_progress', 'salvage', 'shared_cache', 'split', @@ -716,7 +718,7 @@ wiredtiger_open_common =\ ]), Config('extensions', '', r''' list of shared library extensions to load (using dlopen). - Any values specified to an library extension are passed to + Any values specified to a library extension are passed to WT_CONNECTION::load_extension as the \c config parameter (for example, <code>extensions=(/path/ext.so={entry=my_entry})</code>)''', diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 13d67ef961b..3886035eaa9 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -30,6 +30,7 @@ src/btree/bt_io.c src/btree/bt_misc.c src/btree/bt_ovfl.c src/btree/bt_page.c +src/btree/bt_random.c src/btree/bt_read.c src/btree/bt_rebalance.c src/btree/bt_ret.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 320bd8f6fb9..b20a7181532 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -64,6 +64,7 @@ flags = { 'VERB_COMPACT', 'VERB_EVICT', 'VERB_EVICTSERVER', + 'VERB_EVICT_STUCK', 'VERB_FILEOPS', 'VERB_HANDLEOPS', 'VERB_LOG', @@ -76,6 +77,7 @@ flags = { 'VERB_REBALANCE', 'VERB_RECONCILE', 'VERB_RECOVERY', + 'VERB_RECOVERY_PROGRESS', 'VERB_SALVAGE', 'VERB_SHARED_CACHE', 'VERB_SPLIT', @@ -115,12 +117,14 @@ flags = { 'SESSION_CAN_WAIT', 'SESSION_INTERNAL', 'SESSION_LOCKED_CHECKPOINT', - 'SESSION_LOCKED_HANDLE_LIST', + 'SESSION_LOCKED_HANDLE_LIST_READ', + 'SESSION_LOCKED_HANDLE_LIST_WRITE', 'SESSION_LOCKED_METADATA', 'SESSION_LOCKED_PASS', 'SESSION_LOCKED_SCHEMA', 'SESSION_LOCKED_SLOT', - 'SESSION_LOCKED_TABLE', + 'SESSION_LOCKED_TABLE_READ', + 'SESSION_LOCKED_TABLE_WRITE', 'SESSION_LOCKED_TURTLE', 'SESSION_LOGGING_INMEM', 'SESSION_LOOKASIDE_CURSOR', diff --git a/src/third_party/wiredtiger/dist/package/wiredtiger.spec b/src/third_party/wiredtiger/dist/package/wiredtiger.spec index ca88f76b06b..aacdf327c98 100644 --- a/src/third_party/wiredtiger/dist/package/wiredtiger.spec +++ b/src/third_party/wiredtiger/dist/package/wiredtiger.spec @@ -1,5 +1,5 @@ Name: wiredtiger -Version: 2.9.1 +Version: 2.9.2 Release: 1%{?dist} Summary: WiredTiger data storage engine diff --git a/src/third_party/wiredtiger/dist/s_all b/src/third_party/wiredtiger/dist/s_all index 4c9d4eccebb..be33657e640 100755 --- a/src/third_party/wiredtiger/dist/s_all +++ b/src/third_party/wiredtiger/dist/s_all @@ -57,7 +57,7 @@ errchk() # Some tests shouldn't return an error, we exclude them here. case "$1" in *s_export|*s_tags) - break;; + ;; *) errfound=1;; esac @@ -97,10 +97,10 @@ COMMANDS=" 2>&1 ./s_string > ${t_pfx}s_string 2>&1 ./s_tags > ${t_pfx}tags 2>&1 ./s_typedef -c > ${t_pfx}s_typedef_c -2>&1 ./s_void > ${t_pfx}s_void" +2>&1 ./s_void > ${t_pfx}s_void 2>&1 ./s_whitespace > ${t_pfx}s_whitespace 2>&1 ./s_win > ${t_pfx}s_win -2>&1 python style.py > ${t_pfx}py_style +2>&1 python style.py > ${t_pfx}py_style" # Parallelize if possible. xp="" @@ -111,14 +111,13 @@ fi echo "$COMMANDS" | xargs $xp -I{} /bin/sh -c {} for f in `find . -name ${t_pfx}\*`; do - if ! `test -s $f`; then - continue + if `test -s $f`; then + LOCAL_NAME=`basename $f` + # Find original command and trim redirect garbage + FAILED_CMD=`echo "$COMMANDS" | grep $LOCAL_NAME | \ + sed -e 's/ >.*//' -e 's/.* //'` + errchk "$FAILED_CMD" $f fi - LOCAL_NAME=`basename $f` - # Find original command and trim redirect garbage - FAILED_CMD=`echo "$COMMANDS" | grep $LOCAL_NAME | \ - sed -e 's/ >.*//' -e 's/.* //'` - errchk "$FAILED_CMD" $f done echo 'dist/s_all run finished' diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 53a3df87615..8911d888077 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -39,6 +39,8 @@ WT_PADDING_CHECK WT_READ_BARRIER WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT +WT_SESSION_LOCKED_TABLE_READ +WT_SESSION_LOCKED_TABLE_WRITE WT_SESSION_LOCKED_TURTLE WT_SIZE_CHECK WT_STATS_FIELD_TO_OFFSET diff --git a/src/third_party/wiredtiger/dist/s_docs b/src/third_party/wiredtiger/dist/s_docs index f4332257193..6ebffb947ec 100755 --- a/src/third_party/wiredtiger/dist/s_docs +++ b/src/third_party/wiredtiger/dist/s_docs @@ -96,7 +96,8 @@ spellchk() type aspell > /dev/null 2>&1 || return (cd ../src/docs && - cat *.dox | aspell --lang=en --personal=./spell.ok list) | + cat *.dox | + aspell --encoding=iso-8859-1 --lang=en --personal=./spell.ok list) | sort -u > $t test -s $t && { echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" diff --git a/src/third_party/wiredtiger/dist/s_funcs.list b/src/third_party/wiredtiger/dist/s_funcs.list index 01835390997..b73767cad13 100644 --- a/src/third_party/wiredtiger/dist/s_funcs.list +++ b/src/third_party/wiredtiger/dist/s_funcs.list @@ -13,7 +13,6 @@ __wt_bloom_get __wt_bulk_insert_fix __wt_bulk_insert_row __wt_bulk_insert_var -__wt_cache_dump __wt_config_getone __wt_cursor_get_raw_value __wt_debug_addr diff --git a/src/third_party/wiredtiger/dist/s_stat b/src/third_party/wiredtiger/dist/s_stat index 5d5937e1833..6aeeca6faa6 100755 --- a/src/third_party/wiredtiger/dist/s_stat +++ b/src/third_party/wiredtiger/dist/s_stat @@ -25,9 +25,6 @@ cat << UNUSED_STAT_FIELDS lock_checkpoint_count lock_checkpoint_wait_application lock_checkpoint_wait_internal -lock_handle_list_count -lock_handle_list_wait_application -lock_handle_list_wait_internal lock_metadata_count lock_metadata_wait_application lock_metadata_wait_internal diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index f2429237f21..e033f77327f 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -21,6 +21,7 @@ Alakuijala Alexandrescu's Alloc Async +AsyncOp Athanassoulis Athlon BBBBB @@ -276,6 +277,7 @@ PRNG PTHREAD PTR PackInputStream +PackOutputStream Pandis Phong PlatformSDK @@ -339,6 +341,7 @@ Split's Stoica StoreLoad StoreStore +Syscall TAILQ TCMalloc TESTUTIL @@ -767,6 +770,7 @@ idx ifdef ifdef's iiSii +iiiS iiii iiu ikey @@ -1135,6 +1139,7 @@ subgetraw subgets subinit sublicense +subtest subtree sunique superset @@ -1179,6 +1184,7 @@ txt typedef uB uS +ui uint uintmax unbare @@ -1214,6 +1220,7 @@ upg uri uri's uris +usec usecs usedp userbad @@ -1244,6 +1251,9 @@ vunpack vw vxr waitpid +waker +wakeup +wakeups walk's warmup wb diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void index 025f6d4c7eb..90425d5a718 100755 --- a/src/third_party/wiredtiger/dist/s_void +++ b/src/third_party/wiredtiger/dist/s_void @@ -78,10 +78,20 @@ func_ok() -e '/int demo_file_sync$/d' \ -e '/int demo_fs_directory_list_free$/d' \ -e '/int demo_fs_exist$/d' \ + -e '/int fail_file_lock$/d' \ + -e '/int fail_file_sync$/d' \ + -e '/int fail_fs_directory_list_free$/d' \ + -e '/int fail_fs_exist$/d' \ + -e '/int fail_fs_simulate_fail$/d' \ + -e '/int fail_fs_terminate$/d' \ -e '/int handle_message$/d' \ -e '/int handle_progress$/d' \ -e '/int helium_cursor_reset$/d' \ -e '/int helium_session_verify$/d' \ + -e '/int index_compare_primary$/d' \ + -e '/int index_compare_S$/d' \ + -e '/int index_compare_u$/d' \ + -e '/int index_extractor_u$/d' \ -e '/int log_print_err$/d' \ -e '/int lz4_error$/d' \ -e '/int lz4_pre_size$/d' \ diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 022810d5c49..a4d92345f88 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -193,6 +193,7 @@ connection_stats = [ CacheStat('cache_bytes_other', 'bytes not belonging to page images in the cache', 'no_clear,no_scale,size'), CacheStat('cache_bytes_read', 'bytes read into cache', 'size'), CacheStat('cache_bytes_write', 'bytes written from cache', 'size'), + CacheStat('cache_eviction_active_workers', 'eviction worker thread active', 'no_clear'), CacheStat('cache_eviction_aggressive_set', 'eviction currently operating in aggressive mode', 'no_clear,no_scale'), CacheStat('cache_eviction_app', 'pages evicted by application threads'), CacheStat('cache_eviction_app_dirty', 'modified pages evicted by application threads'), @@ -222,12 +223,15 @@ connection_stats = [ CacheStat('cache_eviction_slow', 'eviction server unable to reach eviction goal'), CacheStat('cache_eviction_split_internal', 'internal pages split during eviction'), CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'), + CacheStat('cache_eviction_stable_state_workers', 'eviction worker thread stable number', 'no_clear'), CacheStat('cache_eviction_state', 'eviction state', 'no_clear,no_scale'), CacheStat('cache_eviction_walk', 'pages walked for eviction'), CacheStat('cache_eviction_walks_abandoned', 'eviction walks abandoned'), CacheStat('cache_eviction_walks_active', 'files with active eviction walks', 'no_clear,no_scale'), CacheStat('cache_eviction_walks_started', 'files with new eviction walks started'), CacheStat('cache_eviction_worker_evicting', 'eviction worker thread evicting pages'), + CacheStat('cache_eviction_worker_created', 'eviction worker thread created'), + CacheStat('cache_eviction_worker_removed', 'eviction worker thread removed'), CacheStat('cache_hazard_checks', 'hazard pointer check calls'), CacheStat('cache_hazard_max', 'hazard pointer maximum array length', 'max_aggregate,no_scale'), CacheStat('cache_hazard_walks', 'hazard pointer check entries walked'), @@ -284,9 +288,7 @@ connection_stats = [ LockStat('lock_checkpoint_count', 'checkpoint lock acquisitions'), LockStat('lock_checkpoint_wait_application', 'checkpoint lock application thread wait time (usecs)'), LockStat('lock_checkpoint_wait_internal', 'checkpoint lock internal thread wait time (usecs)'), - LockStat('lock_handle_list_count', 'handle-list lock acquisitions'), - LockStat('lock_handle_list_wait_application', 'handle-list lock application thread wait time (usecs)'), - LockStat('lock_handle_list_wait_internal', 'handle-list lock internal thread wait time (usecs)'), + LockStat('lock_handle_list_wait_eviction', 'handle-list lock eviction thread wait time (usecs)'), LockStat('lock_metadata_count', 'metadata lock acquisitions'), LockStat('lock_metadata_wait_application', 'metadata lock application thread wait time (usecs)'), LockStat('lock_metadata_wait_internal', 'metadata lock internal thread wait time (usecs)'), @@ -477,6 +479,7 @@ dsrc_stats = [ ########################################## # Cache and eviction statistics ########################################## + CacheStat('cache_bytes_dirty', 'tracked dirty bytes in the cache', 'no_clear,no_scale,size'), CacheStat('cache_bytes_inuse', 'bytes currently in the cache', 'no_clear,no_scale,size'), CacheStat('cache_bytes_read', 'bytes read into cache', 'size'), CacheStat('cache_bytes_write', 'bytes written from cache', 'size'), diff --git a/src/third_party/wiredtiger/examples/c/ex_file_system.c b/src/third_party/wiredtiger/examples/c/ex_file_system.c index 56869171558..e807ac54d3b 100644 --- a/src/third_party/wiredtiger/examples/c/ex_file_system.c +++ b/src/third_party/wiredtiger/examples/c/ex_file_system.c @@ -399,6 +399,7 @@ demo_fs_directory_list(WT_FILE_SYSTEM *file_system, uint32_t allocated, count; int ret = 0; char *name, **entries; + void *p; (void)session; /* Unused */ @@ -424,14 +425,16 @@ demo_fs_directory_list(WT_FILE_SYSTEM *file_system, * matter if the list is a bit longer than necessary. */ if (count >= allocated) { - entries = realloc( - entries, (allocated + 10) * sizeof(char *)); - if (entries == NULL) { + p = realloc( + entries, (allocated + 10) * sizeof(*entries)); + if (p == NULL) { ret = ENOMEM; goto err; } - memset(entries + allocated * sizeof(char *), - 0, 10 * sizeof(char *)); + + entries = p; + memset(entries + allocated * sizeof(*entries), + 0, 10 * sizeof(*entries)); allocated += 10; } entries[count++] = strdup(name); diff --git a/src/third_party/wiredtiger/ext/test/fail_fs/Makefile.am b/src/third_party/wiredtiger/ext/test/fail_fs/Makefile.am new file mode 100644 index 00000000000..f31f5395cd1 --- /dev/null +++ b/src/third_party/wiredtiger/ext/test/fail_fs/Makefile.am @@ -0,0 +1,9 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +noinst_LTLIBRARIES = libwiredtiger_fail_fs.la +libwiredtiger_fail_fs_la_SOURCES = fail_fs.c + +# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well +# as installation, it will only build static libraries. As far as I can tell, +# the "approved" libtool way to turn them back on is by adding -rpath. +libwiredtiger_fail_fs_la_LDFLAGS = -avoid-version -module -rpath /nowhere diff --git a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c new file mode 100644 index 00000000000..d0d8a14c8c2 --- /dev/null +++ b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c @@ -0,0 +1,847 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <string.h> +#include <limits.h> +#include <errno.h> +#include <stdlib.h> +#include <assert.h> +#include <pthread.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <execinfo.h> + +#include <wiredtiger_ext.h> +#include "queue.h" + +#define FAIL_FS_GIGABYTE (1024 * 1024 * 1024) + +#define FAIL_FS_ENV_ENABLE "WT_FAIL_FS_ENABLE" +#define FAIL_FS_ENV_WRITE_ALLOW "WT_FAIL_FS_WRITE_ALLOW" +#define FAIL_FS_ENV_READ_ALLOW "WT_FAIL_FS_READ_ALLOW" + +/* + * A "fail file system", that is, a file system extension that fails when we + * want it to. This is only used in test frameworks, this fact allows us to + * simplify some error paths. This code is not portable to Windows, as it has + * direct knowledge of file descriptors, environment variables and stack + * traces. + * + * When the filesystem extension is configured, parameters can set how many + * reads or writes can be allowed before failure. If this is not fine-grained + * enough, an 'environment' configuration parameter can be specified. If that + * is used, then on every file system read or write, environment variables are + * checked that control when reading or writing should fail. + */ +typedef struct { + WT_FILE_SYSTEM iface; + /* + * WiredTiger performs schema and I/O operations in parallel, all file + * system and file handle access must be thread-safe. This extension + * uses a single, global file system lock. + */ + pthread_rwlock_t lock; /* Lock */ + bool fail_enabled; + bool use_environment; + bool verbose; + int64_t read_ops; + int64_t write_ops; + int64_t allow_reads; + int64_t allow_writes; + /* Queue of file handles */ + TAILQ_HEAD(fail_file_handle_qh, fail_file_handle) fileq; + WT_EXTENSION_API *wtext; /* Extension functions */ +} FAIL_FILE_SYSTEM; + +typedef struct fail_file_handle { + WT_FILE_HANDLE iface; + + /* + * Track the system file descriptor for each file. + */ + FAIL_FILE_SYSTEM *fail_fs; /* Enclosing file system */ + TAILQ_ENTRY(fail_file_handle) q; /* Queue of handles */ + int fd; /* System file descriptor */ +} FAIL_FILE_HANDLE; + +static int fail_file_close(WT_FILE_HANDLE *, WT_SESSION *); +static void fail_file_handle_remove(WT_SESSION *, FAIL_FILE_HANDLE *); +static int fail_file_lock(WT_FILE_HANDLE *, WT_SESSION *, bool); +static int fail_file_read( + WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, void *); +static int fail_file_size(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *); +static int fail_file_sync(WT_FILE_HANDLE *, WT_SESSION *); +static int fail_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t); +static int fail_file_write( + WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, const void *); +static bool fail_fs_arg( + const char *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, int64_t *); +static int fail_fs_directory_list(WT_FILE_SYSTEM *, WT_SESSION *, + const char *, const char *, char ***, uint32_t *); +static int fail_fs_directory_list_free( + WT_FILE_SYSTEM *, WT_SESSION *, char **, uint32_t); +static void fail_fs_env(const char *, int64_t *); +static int fail_fs_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *); +static int fail_fs_open(WT_FILE_SYSTEM *, WT_SESSION *, + const char *, WT_FS_OPEN_FILE_TYPE, uint32_t, WT_FILE_HANDLE **); +static int fail_fs_remove( + WT_FILE_SYSTEM *, WT_SESSION *, const char *, uint32_t); +static int fail_fs_rename( + WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *, uint32_t); +static int fail_fs_simulate_fail( + FAIL_FILE_HANDLE *, WT_SESSION *, int64_t, const char *); +static int fail_fs_size( + WT_FILE_SYSTEM *, WT_SESSION *, const char *, wt_off_t *); +static int fail_fs_terminate(WT_FILE_SYSTEM *, WT_SESSION *); + +/* + * We use pthread functions for portable locking. + * Assert on errors for simplicity. + */ +static void +fail_fs_allocate_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_init(lockp, NULL) == 0); +} + +static void +fail_fs_destroy_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_destroy(lockp) == 0); +} + +static void +fail_fs_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_wrlock(lockp) == 0); +} + +static void +fail_fs_unlock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_unlock(lockp) == 0); +} + +/* + * fail_file_close -- + * ANSI C close. + */ +static int +fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + int ret; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; + + /* + * We don't actually open an fd when opening directories for flushing, + * so ignore that case here. + */ + if (fail_fh->fd < 0) + return (0); + ret = close(fail_fh->fd); + fail_fh->fd = -1; + fail_fs_lock(&fail_fs->lock); + fail_file_handle_remove(session, fail_fh); + fail_fs_unlock(&fail_fs->lock); + return (ret); +} + +/* + * fail_file_handle_remove -- + * Destroy an in-memory file handle. Should only happen on remove or + * shutdown. The file system lock must be held during this call. + */ +static void +fail_file_handle_remove(WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh) +{ + FAIL_FILE_SYSTEM *fail_fs; + + (void)session; /* Unused */ + fail_fs = fail_fh->fail_fs; + + TAILQ_REMOVE(&fail_fs->fileq, fail_fh, q); + + free(fail_fh->iface.name); + free(fail_fh); +} + +/* + * fail_file_lock -- + * Lock/unlock a file. + */ +static int +fail_file_lock(WT_FILE_HANDLE *file_handle, WT_SESSION *session, bool lock) +{ + /* Locks are always granted. */ + (void)file_handle; /* Unused */ + (void)session; /* Unused */ + (void)lock; /* Unused */ + + return (0); +} + +/* + * fail_file_read -- + * POSIX pread. + */ +static int +fail_file_read(WT_FILE_HANDLE *file_handle, + WT_SESSION *session, wt_off_t offset, size_t len, void *buf) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + int64_t envint, read_ops; + int ret; + size_t chunk; + ssize_t nr; + uint8_t *addr; + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; + wtext = fail_fs->wtext; + read_ops = 0; + ret = 0; + + fail_fs_lock(&fail_fs->lock); + + if (fail_fs->use_environment) { + fail_fs_env(FAIL_FS_ENV_ENABLE, &envint); + if (envint != 0) { + if (!fail_fs->fail_enabled) { + fail_fs->fail_enabled = true; + fail_fs_env(FAIL_FS_ENV_READ_ALLOW, + &fail_fs->allow_reads); + fail_fs->read_ops = 0; + } + read_ops = ++fail_fs->read_ops; + } else + fail_fs->fail_enabled = false; + } else + read_ops = ++fail_fs->read_ops; + + fail_fs_unlock(&fail_fs->lock); + + if (fail_fs->fail_enabled && fail_fs->allow_reads != 0 && + read_ops % fail_fs->allow_reads == 0) + return (fail_fs_simulate_fail( + fail_fh, session, read_ops, "read")); + + /* Break reads larger than 1GB into 1GB chunks. */ + for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { + chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; + if ((nr = pread(fail_fh->fd, addr, chunk, offset)) <= 0) { + (void)wtext->err_printf(wtext, session, + "%s: handle-read: failed to read %" PRIuMAX + " bytes at offset %" PRIuMAX ": %s", + fail_fh->iface.name, + (uintmax_t)len, (uintmax_t)offset, + wtext->strerror(wtext, NULL, errno)); + ret = (nr == 0 ? WT_ERROR : errno); + break; + } + } + return (ret); +} + +/* + * fail_file_size -- + * Get the size of a file in bytes, by file handle. + */ +static int +fail_file_size( + WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t *sizep) +{ + FAIL_FILE_HANDLE *fail_fh; + struct stat statbuf; + int ret; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + ret = 0; + + if ((ret = fstat(fail_fh->fd, &statbuf)) != 0) + return (ret); + *sizep = statbuf.st_size; + return (0); +} + +/* + * fail_file_sync -- + * Ensure the content of the file is stable. This is a no-op in our + * file system. + */ +static int +fail_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *session) +{ + (void)file_handle; /* Unused */ + (void)session; /* Unused */ + + return (0); +} + +/* + * fail_file_truncate -- + * POSIX ftruncate. + */ +static int +fail_file_truncate( + WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t offset) +{ + FAIL_FILE_HANDLE *fail_fh; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + return (ftruncate(fail_fh->fd, offset)); +} + +/* + * fail_file_write -- + * POSIX pwrite. + */ +static int +fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, + wt_off_t offset, size_t len, const void *buf) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + int64_t envint, write_ops; + int ret; + size_t chunk; + ssize_t nr; + const uint8_t *addr; + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; + wtext = fail_fs->wtext; + write_ops = 0; + ret = 0; + + fail_fs_lock(&fail_fs->lock); + + if (fail_fs->use_environment) { + fail_fs_env(FAIL_FS_ENV_ENABLE, &envint); + if (envint != 0) { + if (!fail_fs->fail_enabled) { + fail_fs->fail_enabled = true; + fail_fs_env(FAIL_FS_ENV_WRITE_ALLOW, + &fail_fs->allow_writes); + fail_fs->write_ops = 0; + } + write_ops = ++fail_fs->write_ops; + } else + fail_fs->fail_enabled = false; + } else + write_ops = ++fail_fs->write_ops; + + fail_fs_unlock(&fail_fs->lock); + + if (fail_fs->fail_enabled && fail_fs->allow_writes != 0 && + write_ops % fail_fs->allow_writes == 0) + return (fail_fs_simulate_fail( + fail_fh, session, write_ops, "write")); + + /* Break writes larger than 1GB into 1GB chunks. */ + for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { + chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; + if ((nr = pwrite(fail_fh->fd, addr, chunk, offset)) <= 0) { + (void)wtext->err_printf(wtext, session, + "%s: handle-write: failed to write %" PRIuMAX + " bytes at offset %" PRIuMAX ": %s", + fail_fh->iface.name, + (uintmax_t)len, (uintmax_t)offset, + wtext->strerror(wtext, NULL, errno)); + ret = (nr == 0 ? WT_ERROR : errno); + break; + } + } + return (ret); +} + +/* + * fail_fs_arg -- + * If the key matches, return the value interpreted as an integer. + */ +static bool +fail_fs_arg(const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, + int64_t *argp) +{ + if (strncmp(match, key->str, key->len) == 0 && + match[key->len] == '\0' && + (value->type == WT_CONFIG_ITEM_BOOL || + value->type == WT_CONFIG_ITEM_NUM)) { + *argp = value->val; + return (true); + } + return (false); +} + +/* + * fail_fs_directory_list -- + * Return a list of files in a given sub-directory. + */ +static int +fail_fs_directory_list(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *directory, + const char *prefix, char ***dirlistp, uint32_t *countp) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + size_t len, prefix_len; + uint32_t allocated, count; + int ret; + char *name, **entries; + void *p; + + (void)session; /* Unused */ + + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + ret = 0; + *dirlistp = NULL; + *countp = 0; + + entries = NULL; + allocated = count = 0; + len = strlen(directory); + prefix_len = prefix == NULL ? 0 : strlen(prefix); + + fail_fs_lock(&fail_fs->lock); + TAILQ_FOREACH(fail_fh, &fail_fs->fileq, q) { + name = fail_fh->iface.name; + if (strncmp(name, directory, len) != 0 || + (prefix != NULL && strncmp(name, prefix, prefix_len) != 0)) + continue; + + /* + * Increase the list size in groups of 10, it doesn't + * matter if the list is a bit longer than necessary. + */ + if (count >= allocated) { + p = realloc( + entries, (allocated + 10) * sizeof(*entries)); + if (p == NULL) { + ret = ENOMEM; + goto err; + } + entries = p; + memset(entries + allocated * sizeof(*entries), + 0, 10 * sizeof(*entries)); + allocated += 10; + } + entries[count++] = strdup(name); + } + + *dirlistp = entries; + *countp = count; + +err: fail_fs_unlock(&fail_fs->lock); + if (ret == 0) + return (0); + + if (entries != NULL) { + while (count > 0) + free(entries[--count]); + free(entries); + } + + return (ret); +} + +/* + * fail_fs_directory_list_free -- + * Free memory allocated by fail_fs_directory_list. + */ +static int +fail_fs_directory_list_free(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, char **dirlist, uint32_t count) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + if (dirlist != NULL) { + while (count > 0) + free(dirlist[--count]); + free(dirlist); + } + return (0); +} + +/* + * fail_fs_env -- + * If the name is in the environment, return its integral value. + */ +static void +fail_fs_env(const char *name, int64_t *valp) +{ + int64_t result; + char *s, *value; + + result = 0; + if ((value = getenv(name)) != NULL) { + s = value; + if (strcmp(value, "true") == 0) + result = 1; + else if (strcmp(value, "false") != 0) { + result = strtoll(value, &s, 10); + if (*s != '\0') + result = 0; + } + } + *valp = result; +} + +/* + * fail_fs_exist -- + * Return if the file exists. + */ +static int +fail_fs_exist(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, bool *existp) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + *existp = (access(name, F_OK) == 0); + return (0); +} + +/* + * fail_fs_open -- + * fopen for the fail file system. + */ +static int +fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, + const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags, + WT_FILE_HANDLE **file_handlep) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + WT_FILE_HANDLE *file_handle; + int fd, open_flags, ret; + + (void)session; /* Unused */ + + *file_handlep = NULL; + + fail_fh = NULL; + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + fd = -1; + ret = 0; + + if (fail_fs->verbose) { + wtext = fail_fs->wtext; + (void)wtext->msg_printf(wtext, session, "fail_fs: open: %s", + name); + } + + fail_fs_lock(&fail_fs->lock); + + open_flags = 0; + if ((flags & WT_FS_OPEN_CREATE) != 0) + open_flags |= O_CREAT; + if ((flags & WT_FS_OPEN_EXCLUSIVE) != 0) + open_flags |= O_EXCL; + if ((flags & WT_FS_OPEN_READONLY) != 0) + open_flags |= O_RDONLY; + else + open_flags |= O_RDWR; + + /* + * Opening a file handle on a directory is only to support filesystems + * that require a directory sync for durability. This is a no-op + * for this file system. + */ + if (file_type == WT_FS_OPEN_FILE_TYPE_DIRECTORY) + fd = -1; + else if ((fd = open(name, open_flags, 0666)) < 0) { + ret = errno; + goto err; + } + + /* We create a handle structure for each open. */ + if ((fail_fh = calloc(1, sizeof(FAIL_FILE_HANDLE))) == NULL) { + ret = ENOMEM; + goto err; + } + + /* Initialize private information. */ + fail_fh->fail_fs = fail_fs; + fail_fh->fd = fd; + + /* Initialize public information. */ + file_handle = (WT_FILE_HANDLE *)fail_fh; + if ((file_handle->name = strdup(name)) == NULL) { + ret = ENOMEM; + goto err; + } + + /* Setup the function call table. */ + file_handle->close = fail_file_close; + file_handle->fh_advise = NULL; + file_handle->fh_extend = NULL; + file_handle->fh_extend_nolock = NULL; + file_handle->fh_lock = fail_file_lock; + file_handle->fh_map = NULL; + file_handle->fh_map_discard = NULL; + file_handle->fh_map_preload = NULL; + file_handle->fh_unmap = NULL; + file_handle->fh_read = fail_file_read; + file_handle->fh_size = fail_file_size; + file_handle->fh_sync = fail_file_sync; + file_handle->fh_sync_nowait = NULL; + file_handle->fh_truncate = fail_file_truncate; + file_handle->fh_write = fail_file_write; + + TAILQ_INSERT_HEAD(&fail_fs->fileq, fail_fh, q); + + *file_handlep = file_handle; + + if (0) { +err: if (fd != -1) + (void)close(fd); + free(fail_fh); + } + + fail_fs_unlock(&fail_fs->lock); + return (ret); +} + +/* + * fail_fs_remove -- + * POSIX remove. + */ +static int +fail_fs_remove(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, uint32_t flags) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + (void)flags; /* Unused */ + + return (unlink(name)); +} + +/* + * fail_fs_rename -- + * POSIX rename. + */ +static int +fail_fs_rename(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *from, const char *to, uint32_t flags) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + (void)flags; /* Unused */ + + return (rename(from, to)); +} + +/* + * fail_fs_simulate_fail -- + * Simulate a failure from this file system by reporting it + * and returning a non-zero return code. + */ +static int +fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, + int64_t nops, const char *opkind) +{ + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; +#ifdef __FreeBSD__ + size_t btret, i; +#else + int btret, i; +#endif + void *bt[100]; + char **btstr; + + fail_fs = fail_fh->fail_fs; + if (fail_fs->verbose) { + wtext = fail_fs->wtext; + (void)wtext->msg_printf(wtext, session, + "fail_fs: %s: simulated failure after %" PRId64 + " %s operations", fail_fh->iface.name, nops, opkind); +#ifdef __FreeBSD__ + btret = backtrace(bt, sizeof(bt) / sizeof(bt[0])); +#else + btret = backtrace(bt, (int)(sizeof(bt) / sizeof(bt[0]))); +#endif + if ((btstr = backtrace_symbols(bt, btret)) != NULL) { + for (i = 0; i < btret; i++) + (void)wtext->msg_printf(wtext, session, " %s", + btstr[i]); + free(btstr); + } + } + return (EIO); +} + +/* + * fail_fs_size -- + * Get the size of a file in bytes, by file name. + */ +static int +fail_fs_size(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, wt_off_t *sizep) +{ + struct stat statbuf; + int ret; + + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + ret = 0; + if ((ret = stat(name, &statbuf)) != 0) + return (ret); + *sizep = statbuf.st_size; + return (0); +} + +/* + * fail_fs_terminate -- + * Discard any resources on termination + */ +static int +fail_fs_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *session) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + + while ((fail_fh = TAILQ_FIRST(&fail_fs->fileq)) != NULL) + fail_file_handle_remove(session, fail_fh); + + fail_fs_destroy_lock(&fail_fs->lock); + free(fail_fs); + + return (0); +} + +/* + * wiredtiger_extension_init -- + * WiredTiger fail filesystem extension. + */ +int +wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) +{ + FAIL_FILE_SYSTEM *fail_fs; + WT_CONFIG_ITEM k, v; + WT_CONFIG_PARSER *config_parser; + WT_EXTENSION_API *wtext; + WT_FILE_SYSTEM *file_system; + int64_t argval; + int ret; + + ret = 0; + wtext = conn->get_extension_api(conn); + if ((fail_fs = calloc(1, sizeof(FAIL_FILE_SYSTEM))) == NULL) { + (void)wtext->err_printf(wtext, NULL, + "fail_file_system extension_init: %s", + wtext->strerror(wtext, NULL, ENOMEM)); + return (ENOMEM); + } + fail_fs->wtext = wtext; + file_system = (WT_FILE_SYSTEM *)fail_fs; + + /* Get any configuration values. */ + if ((ret = wtext->config_parser_open_arg( + wtext, NULL, config, &config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_parser_open: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) { + if (fail_fs_arg("environment", &k, &v, &argval)) { + fail_fs->use_environment = (argval != 0); + continue; + } else if (fail_fs_arg("verbose", &k, &v, &argval)) { + fail_fs->verbose = (argval != 0); + continue; + } else if (fail_fs_arg("allow_writes", &k, &v, + &fail_fs->allow_writes)) + continue; + else if (fail_fs_arg("allow_reads", &k, &v, + &fail_fs->allow_reads)) + continue; + + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: unexpected configuration " + "information: %.*s=%.*s: %s", + (int)k.len, k.str, (int)v.len, v.str, + wtext->strerror(wtext, NULL, ret)); + goto err; + } + if (ret != WT_NOTFOUND) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + if ((ret = config_parser->close(config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.close: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + if (fail_fs->allow_writes != 0 || fail_fs->allow_reads != 0) + fail_fs->fail_enabled = true; + + fail_fs_allocate_lock(&fail_fs->lock); + /* Initialize the in-memory jump table. */ + file_system->fs_directory_list = fail_fs_directory_list; + file_system->fs_directory_list_free = fail_fs_directory_list_free; + file_system->fs_exist = fail_fs_exist; + file_system->fs_open_file = fail_fs_open; + file_system->fs_remove = fail_fs_remove; + file_system->fs_rename = fail_fs_rename; + file_system->fs_size = fail_fs_size; + file_system->terminate = fail_fs_terminate; + if ((ret = conn->set_file_system(conn, file_system, NULL)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONNECTION.set_file_system: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + return (0); + +err: free(fail_fs); + return (ret); +} diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 3b81e36d999..fe6ec8bb905 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "8d2324943364286056ae399043f70b8a937de312", + "commit": "d6659de8d742b9562d08c1ba5138be881f8e24fa", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-3.4" } diff --git a/src/third_party/wiredtiger/lang/python/Makefile.am b/src/third_party/wiredtiger/lang/python/Makefile.am index 03c65a57028..b32d0321194 100644 --- a/src/third_party/wiredtiger/lang/python/Makefile.am +++ b/src/third_party/wiredtiger/lang/python/Makefile.am @@ -17,7 +17,8 @@ install-exec-local: (cd $(PYSRC) && \ $(PYTHON) setup.py build_py -d $(abs_builddir)/build && \ $(PYTHON) setup.py build_ext -f -b $(abs_builddir)/build $(PYDIRS) && \ - $(PYTHON) setup.py install_lib -b $(abs_builddir)/build --skip-build $(PYTHON_INSTALL_ARG)) + $(PYTHON) setup.py install_lib -b $(abs_builddir)/build --skip-build $(PYTHON_INSTALL_ARG) && \ + rm -rf $(abs_builddir)/build) # We build in different places for an install vs running from the tree: # clean up both. Don't rely on "setup.py clean" -- everything that should diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c index 54bcb7cd26c..026a008188c 100644 --- a/src/third_party/wiredtiger/src/async/async_api.c +++ b/src/third_party/wiredtiger/src/async/async_api.c @@ -240,8 +240,7 @@ __async_start(WT_SESSION_IMPL *session) async = conn->async; TAILQ_INIT(&async->formatqh); WT_RET(__wt_spin_init(session, &async->ops_lock, "ops")); - WT_RET(__wt_cond_alloc( - session, "async flush", false, &async->flush_cond)); + WT_RET(__wt_cond_alloc(session, "async flush", &async->flush_cond)); WT_RET(__wt_async_op_init(session)); /* @@ -541,7 +540,7 @@ retry: async->flush_op.state = WT_ASYNCOP_READY; WT_RET(__wt_async_op_enqueue(session, &async->flush_op)); while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE) - __wt_cond_wait(session, async->flush_cond, 100000); + __wt_cond_wait(session, async->flush_cond, 100000, NULL); /* * Flush is done. Clear the flags. */ diff --git a/src/third_party/wiredtiger/src/async/async_worker.c b/src/third_party/wiredtiger/src/async/async_worker.c index b1bc3902f7c..11f59ed14f1 100644 --- a/src/third_party/wiredtiger/src/async/async_worker.c +++ b/src/third_party/wiredtiger/src/async/async_worker.c @@ -107,7 +107,7 @@ __async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen) { while (async->flush_state == WT_ASYNC_FLUSHING && async->flush_gen == my_gen) - __wt_cond_wait(session, async->flush_cond, 10000); + __wt_cond_wait(session, async->flush_cond, 10000, NULL); } /* diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c index d08aba45920..ea7859d6a38 100644 --- a/src/third_party/wiredtiger/src/block/block_write.c +++ b/src/third_party/wiredtiger/src/block/block_write.c @@ -43,10 +43,10 @@ __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len) * more targeted solution at some point. */ if (!conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (!conn->hot_backup) ret = __wt_ftruncate(session, block->fh, len); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 4d3976f9647..ba5fceae7c7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -338,7 +338,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { } /* Check for the end of the page. */ - if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1) + if (cbt->row_iteration_slot >= page->entries * 2 + 1) return (WT_NOTFOUND); ++cbt->row_iteration_slot; @@ -356,7 +356,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; - rip = &page->pg_row_d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 2dd443ffac1..602c01b60eb 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -458,13 +458,13 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) WT_RET(__wt_row_leaf_keys(session, page)); - if (page->pg_row_entries == 0) + if (page->entries == 0) cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); else cbt->ins_head = - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + WT_ROW_INSERT_SLOT(page, page->entries - 1); cbt->ins = WT_SKIP_LAST(cbt->ins_head); - cbt->row_iteration_slot = page->pg_row_entries * 2 + 1; + cbt->row_iteration_slot = page->entries * 2 + 1; cbt->rip_saved = NULL; goto new_insert; } @@ -515,7 +515,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; - rip = &page->pg_row_d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 650289f2cd8..5fde2237538 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -76,11 +76,11 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) } /* - * __cursor_valid -- + * __wt_cursor_valid -- * Return if the cursor references an valid key/value pair. */ -static inline bool -__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) +bool +__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) { WT_BTREE *btree; WT_CELL *cell; @@ -163,7 +163,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * column-store pages don't have slots, but map one-to-one to * keys, check for retrieval past the end of the page. */ - if (cbt->recno >= cbt->ref->ref_recno + page->pg_fix_entries) + if (cbt->recno >= cbt->ref->ref_recno + page->entries) return (false); /* @@ -173,9 +173,9 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) break; case BTREE_COL_VAR: /* The search function doesn't check for empty pages. */ - if (page->pg_var_entries == 0) + if (page->entries == 0) return (false); - WT_ASSERT(session, cbt->slot < page->pg_var_entries); + WT_ASSERT(session, cbt->slot < page->entries); /* * Column-store updates are stored as "insert" objects. If @@ -191,16 +191,16 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * backing store; check the cell for a record already deleted * when read. */ - cip = &page->pg_var_d[cbt->slot]; + cip = &page->pg_var[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || __wt_cell_type(cell) == WT_CELL_DEL) return (false); break; case BTREE_ROW: /* The search function doesn't check for empty pages. */ - if (page->pg_row_entries == 0) + if (page->entries == 0) return (false); - WT_ASSERT(session, cbt->slot < page->pg_row_entries); + WT_ASSERT(session, cbt->slot < page->entries); /* * See above: for row-store, no insert object can have the same @@ -330,7 +330,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) : __cursor_col_search(session, cbt, cbt->ref)); - valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); + valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); @@ -338,7 +338,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, false) : __cursor_col_search(session, cbt, NULL)); - valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); + valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); } if (valid) @@ -418,16 +418,15 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * might be legitimately positioned after the last page slot). * Ignore those cases, it makes things too complicated. */ - if (cbt->slot != 0 && - cbt->slot != cbt->ref->page->pg_row_entries - 1) - valid = __cursor_valid(cbt, &upd); + if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1) + valid = __wt_cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - valid = __cursor_valid(cbt, &upd); + valid = __wt_cursor_valid(cbt, &upd); } /* @@ -463,7 +462,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - if (__cursor_valid(cbt, &upd)) { + if (__wt_cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) @@ -538,7 +537,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); * Fail in that case, the record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) || + ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) || (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) WT_ERR(WT_DUPLICATE_KEY); @@ -553,7 +552,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); * key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - cbt->compare == 0 && __cursor_valid(cbt, NULL)) + cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) WT_ERR(WT_DUPLICATE_KEY); ret = __cursor_row_modify(session, cbt, false); @@ -683,12 +682,12 @@ retry: WT_RET(__cursor_func_init(cbt, true)); /* * If we find a matching record, check whether an update would * conflict. Do this before checking if the update is visible - * in __cursor_valid, or we can miss conflict. + * in __wt_cursor_valid, or we can miss conflict. */ WT_ERR(__curfile_update_check(cbt)); /* Remove the record if it exists. */ - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) { + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) { if (!__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); /* @@ -712,7 +711,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); /* Check whether an update would conflict. */ WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); ret = __cursor_row_modify(session, cbt, true); @@ -787,7 +786,8 @@ retry: WT_RET(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) && + if ((cbt->compare != 0 || + !__wt_cursor_valid(cbt, NULL)) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } @@ -801,7 +801,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); } ret = __cursor_row_modify(session, cbt, false); @@ -831,111 +831,6 @@ err: if (ret == WT_RESTART) { } /* - * __wt_btcur_next_random -- - * Move to a random record in the tree. There are two algorithms, one - * where we select a record at random from the whole tree on each - * retrieval and one where we first select a record at random from the - * whole tree, and then subsequently sample forward from that location. - * The sampling approach allows us to select reasonably uniform random - * points from unbalanced trees. - */ -int -__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_SESSION_IMPL *session; - WT_UPDATE *upd; - wt_off_t size; - uint64_t skip; - - session = (WT_SESSION_IMPL *)cbt->iface.session; - btree = cbt->btree; - - /* - * Only supports row-store: applications can trivially select a random - * value from a column-store, if there were any reason to do so. - */ - if (btree->type != BTREE_ROW) - WT_RET_MSG(session, ENOTSUP, - "WT_CURSOR.next_random only supported by row-store tables"); - - WT_STAT_CONN_INCR(session, cursor_next); - WT_STAT_DATA_INCR(session, cursor_next); - - /* - * If retrieving random values without sampling, or we don't have a - * page reference, pick a roughly random leaf page in the tree. - */ - if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { - /* - * Skip past the sample size of the leaf pages in the tree - * between each random key return to compensate for unbalanced - * trees. - * - * Use the underlying file size divided by its block allocation - * size as our guess of leaf pages in the file (this can be - * entirely wrong, as it depends on how many pages are in this - * particular checkpoint, how large the leaf and internal pages - * really are, and other factors). Then, divide that value by - * the configured sample size and increment the final result to - * make sure tiny files don't leave us with a skip value of 0. - * - * !!! - * Ideally, the number would be prime to avoid restart issues. - */ - if (cbt->next_random_sample_size != 0) { - WT_ERR(btree->bm->size(btree->bm, session, &size)); - cbt->next_random_leaf_skip = (uint64_t) - ((size / btree->allocsize) / - cbt->next_random_sample_size) + 1; - } - - /* - * Choose a leaf page from the tree. - */ - WT_ERR(__cursor_func_init(cbt, true)); - WT_WITH_PAGE_INDEX( - session, ret = __wt_row_random_descent(session, cbt)); - WT_ERR(ret); - } else { - /* - * Read through the tree, skipping leaf pages. Be cautious about - * the skip count: if the last leaf page skipped was also the - * last leaf page in the tree, it may be set to zero on return - * with the end-of-walk condition. - * - * Pages read for data sampling aren't "useful"; don't update - * the read generation of pages already in memory, and if a page - * is read, set its generation to a low value so it is evicted - * quickly. - */ - for (skip = - cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | - WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); - } - - /* - * Select a random entry from the leaf page. If it's not valid, move to - * the next entry, if that doesn't work, move to the previous entry. - */ - WT_ERR(__wt_row_random_leaf(session, cbt)); - if (__cursor_valid(cbt, &upd)) - WT_ERR(__wt_kv_return(session, cbt, upd)); - else { - if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) - ret = __wt_btcur_prev(cbt, false); - WT_ERR(ret); - } - return (0); - -err: WT_TRET(__cursor_reset(cbt)); - return (ret); -} - -/* * __wt_btcur_compare -- * Return a comparison between two cursors. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index d507cc0e396..d664da2ebd3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -652,7 +652,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) page = ref->page; mod = page->modify; - WT_RET(ds->f(ds, "%p", (void *)page)); + WT_RET(ds->f(ds, "%p", (void *)ref)); switch (page->type) { case WT_PAGE_COL_INT: @@ -662,25 +662,28 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) break; case WT_PAGE_COL_FIX: WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno)); - entries = page->pg_fix_entries; + entries = page->entries; break; case WT_PAGE_COL_VAR: WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno)); - entries = page->pg_var_entries; + entries = page->entries; break; case WT_PAGE_ROW_INT: WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; break; case WT_PAGE_ROW_LEAF: - entries = page->pg_row_entries; + entries = page->entries; break; WT_ILLEGAL_VALUE(session); } WT_RET(ds->f(ds, ": %s\n", __wt_page_type_string(page->type))); - WT_RET(ds->f(ds, - "\t" "disk %p, entries %" PRIu32, (void *)page->dsk, entries)); + WT_RET(ds->f(ds, "\t" "disk %p", (void *)page->dsk)); + if (page->dsk != NULL) + WT_RET(ds->f( + ds, ", dsk_mem_size %" PRIu32, page->dsk->mem_size)); + WT_RET(ds->f(ds, ", entries %" PRIu32, entries)); WT_RET(ds->f(ds, ", %s", __wt_page_is_modified(page) ? "dirty" : "clean")); WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked( @@ -696,8 +699,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", evict-lru")); if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) WT_RET(ds->f(ds, ", overflow-keys")); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) - WT_RET(ds->f(ds, ", split-block")); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) WT_RET(ds->f(ds, ", split-insert")); if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE)) diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 00e41475de9..b55ad291c5e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -318,13 +318,12 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * hard case is if a page splits: the update structures might be moved * to different pages, and we still have to find them all for an abort. */ - if (page_del != NULL) WT_RET(__wt_calloc_def( - session, page->pg_row_entries + 1, &page_del->update_list)); + session, page->entries + 1, &page_del->update_list)); /* Allocate the per-page update array. */ - WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array)); + WT_ERR(__wt_calloc_def(session, page->entries, &upd_array)); page->modify->mod_row_update = upd_array; /* @@ -332,7 +331,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * structures, fill in the per-page update array with references to * deleted items. */ - for (i = 0, size = 0; i < page->pg_row_entries; ++i) { + for (i = 0, size = 0; i < page->entries; ++i) { WT_ERR(__wt_calloc_one(session, &upd)); WT_UPDATE_DELETED_SET(upd); diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index c2733d6567b..d2beb84fee9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -206,8 +206,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) if (mod->mod_col_update != NULL) __free_skip_array(session, mod->mod_col_update, page->type == - WT_PAGE_COL_FIX ? 1 : page->pg_var_entries, - update_ignore); + WT_PAGE_COL_FIX ? 1 : page->entries, update_ignore); break; case WT_PAGE_ROW_LEAF: /* @@ -219,12 +218,12 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) */ if (mod->mod_row_insert != NULL) __free_skip_array(session, mod->mod_row_insert, - page->pg_row_entries + 1, update_ignore); + page->entries + 1, update_ignore); /* Free the update array. */ if (mod->mod_row_update != NULL) __free_update(session, mod->mod_row_update, - page->pg_row_entries, update_ignore); + page->entries, update_ignore); break; } @@ -332,7 +331,7 @@ static void __free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) { /* Free the RLE lookup array. */ - __wt_free(session, page->pg_var_repeats); + __wt_free(session, page->u.col_var.repeats); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 47c7972dd57..6ed70788759 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -359,8 +359,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } /* Initialize locks. */ - WT_RET(__wt_rwlock_alloc( - session, &btree->ovfl_lock, "btree overflow lock")); + __wt_rwlock_init(session, &btree->ovfl_lock); WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index 29ea561db3a..ae0da62af57 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -67,11 +67,11 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, * Acquire the overflow lock, and retest the on-page cell's value inside * the lock. */ - __wt_readlock(session, S2BT(session)->ovfl_lock); + __wt_readlock(session, &S2BT(session)->ovfl_lock); ret = __wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM ? __wt_ovfl_txnc_search(page, unpack->data, unpack->size, store) : __ovfl_read(session, unpack->data, unpack->size, store); - __wt_readunlock(session, S2BT(session)->ovfl_lock); + __wt_readunlock(session, &S2BT(session)->ovfl_lock); return (ret); } @@ -249,7 +249,7 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) * Acquire the overflow lock to avoid racing with a thread reading the * backing overflow blocks. */ - __wt_writelock(session, btree->ovfl_lock); + __wt_writelock(session, &btree->ovfl_lock); switch (unpack->raw) { case WT_CELL_KEY_OVFL: @@ -263,7 +263,7 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_ILLEGAL_VALUE(session); } - __wt_writeunlock(session, btree->ovfl_lock); + __wt_writeunlock(session, &btree->ovfl_lock); /* Free the backing disk blocks. */ return (bm->free(bm, session, unpack->data, unpack->size)); diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 7bac7079fe8..f20f6398e37 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -67,7 +67,7 @@ __wt_page_alloc(WT_SESSION_IMPL *session, switch (type) { case WT_PAGE_COL_FIX: - page->pg_fix_entries = alloc_entries; + page->entries = alloc_entries; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: @@ -102,12 +102,12 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { } break; case WT_PAGE_COL_VAR: - page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); - page->pg_var_entries = alloc_entries; + page->pg_var = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); + page->entries = alloc_entries; break; case WT_PAGE_ROW_LEAF: - page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); - page->pg_row_entries = alloc_entries; + page->pg_row = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); + page->entries = alloc_entries; break; WT_ILLEGAL_VALUE(session); } @@ -333,9 +333,10 @@ __inmem_col_var( WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; const WT_PAGE_HEADER *dsk; + size_t size; uint64_t rle; - size_t bytes_allocated; uint32_t i, indx, n, repeat_off; + void *p; btree = S2BT(session); dsk = page->dsk; @@ -343,7 +344,6 @@ __inmem_col_var( repeats = NULL; repeat_off = 0; unpack = &_unpack; - bytes_allocated = 0; /* * Walk the page, building references: the page contains unsorted value @@ -351,7 +351,7 @@ __inmem_col_var( * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL). */ indx = 0; - cip = page->pg_var_d; + cip = page->pg_var; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, cell)); @@ -367,12 +367,14 @@ __inmem_col_var( if (rle > 1) { if (repeats == NULL) { __inmem_col_var_repeats(session, page, &n); - WT_RET(__wt_realloc_def(session, - &bytes_allocated, n + 1, &repeats)); + size = sizeof(WT_COL_VAR_REPEAT) + + (n + 1) * sizeof(WT_COL_RLE); + WT_RET(__wt_calloc(session, 1, size, &p)); + *sizep += size; - page->pg_var_repeats = repeats; + page->u.col_var.repeats = p; page->pg_var_nrepeats = n; - *sizep += bytes_allocated; + repeats = page->pg_var_repeats; } repeats[repeat_off].indx = indx; repeats[repeat_off].recno = recno; @@ -569,7 +571,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) unpack = &_unpack; /* Walk the page, building indices. */ - rip = page->pg_row_d; + rip = page->pg_row; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); switch (unpack->type) { diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c new file mode 100644 index 00000000000..4c7ff861d26 --- /dev/null +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -0,0 +1,437 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_row_random_leaf -- + * Return a random key from a row-store leaf page. + */ +int +__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_INSERT *ins, **start, **stop; + WT_INSERT_HEAD *ins_head; + WT_PAGE *page; + uint64_t samples; + uint32_t choice, entries, i; + int level; + + page = cbt->ref->page; + start = stop = NULL; /* [-Wconditional-uninitialized] */ + entries = 0; /* [-Wconditional-uninitialized] */ + + __cursor_pos_clear(cbt); + + /* If the page has disk-based entries, select from them. */ + if (page->entries != 0) { + cbt->compare = 0; + cbt->slot = __wt_random(&session->rnd) % page->entries; + + /* + * The real row-store search function builds the key, so we + * have to as well. + */ + return (__wt_row_leaf_key(session, + page, page->pg_row + cbt->slot, cbt->tmp, false)); + } + + /* + * If the tree is new (and not empty), it might have a large insert + * list. + * + * Walk down the list until we find a level with at least 50 entries, + * that's where we'll start rolling random numbers. The value 50 is + * used to ignore levels with only a few entries, that is, levels which + * are potentially badly skewed. + */ + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + start = &ins_head->head[level]; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + + if (entries > 50) + break; + } + + /* + * If it's a tiny list and we went all the way to level 0, correct the + * level; entries is correctly set. + */ + if (level < 0) + level = 0; + + /* + * Step down the skip list levels, selecting a random chunk of the name + * space at each level. + */ + for (samples = entries; level > 0; samples += entries) { + /* + * There are (entries) or (entries + 1) chunks of the name space + * considered at each level. They are: between start and the 1st + * element, between the 1st and 2nd elements, and so on to the + * last chunk which is the name space after the stop element on + * the current level. This last chunk of name space may or may + * not be there: as we descend the levels of the skip list, this + * chunk may appear, depending if the next level down has + * entries logically after the stop point in the current level. + * We can't ignore those entries: because of the algorithm used + * to determine the depth of a skiplist, there may be a large + * number of entries "revealed" by descending a level. + * + * If the next level down has more items after the current stop + * point, there are (entries + 1) chunks to consider, else there + * are (entries) chunks. + */ + if (*(stop - 1) == NULL) + choice = __wt_random(&session->rnd) % entries; + else + choice = __wt_random(&session->rnd) % (entries + 1); + + if (choice == entries) { + /* + * We selected the name space after the stop element on + * this level. Set the start point to the current stop + * point, descend a level and move the stop element to + * the end of the list, that is, the end of the newly + * discovered name space, counting entries as we go. + */ + start = stop; + --start; + --level; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + } else { + /* + * We selected another name space on the level. Move the + * start pointer the selected number of entries forward + * to the start of the selected chunk (if the selected + * number is 0, start won't move). Set the stop pointer + * to the next element in the list and drop both start + * and stop down a level. + */ + for (i = 0; i < choice; ++i) + start = &(*start)->next[level]; + stop = &(*start)->next[level]; + + --start; + --stop; + --level; + + /* Count the entries in the selected name space. */ + for (entries = 0, + ins = *start; ins != *stop; ins = ins->next[level]) + ++entries; + } + } + + /* + * When we reach the bottom level, entries will already be set. Select + * a random entry from the name space and return it. + * + * It should be impossible for the entries count to be 0 at this point, + * but check for it out of paranoia and to quiet static testing tools. + */ + if (entries > 0) + entries = __wt_random(&session->rnd) % entries; + for (ins = *start; entries > 0; --entries) + ins = ins->next[0]; + + cbt->ins = ins; + cbt->ins_head = ins_head; + cbt->compare = 0; + + /* + * Random lookups in newly created collections can be slow if a page + * consists of a large skiplist. Schedule the page for eviction if we + * encounter a large skiplist. This worthwhile because applications + * that take a sample often take many samples, so the overhead of + * traversing the skip list each time accumulates to real time. + */ + if (samples > 5000) + __wt_page_evict_soon(session, cbt->ref); + + return (0); +} + +/* + * __wt_random_descent -- + * Find a random page in a tree for either sampling or eviction. + */ +int +__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *current, *descent; + uint32_t flags, i, entries, retry; + + btree = S2BT(session); + current = NULL; + retry = 100; + + /* Eviction should not be tapped to do eviction. */ + if (eviction) + flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | + WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK; + else + flags = WT_READ_RESTART_OK; + + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, flags)); + } + + /* Search the internal pages of the tree. */ + current = &btree->root; + for (;;) { + page = current->page; + /* + * When walking a tree for eviction, an exclusive operation may + * be in progress leaving the root page is not valid. Just give + * up in that case. + */ + if (page == NULL) { + WT_ASSERT(session, eviction); + break; + } + + if (!WT_PAGE_IS_INTERNAL(page)) + break; + + WT_INTL_INDEX_GET(session, page, pindex); + entries = pindex->entries; + + /* Eviction just wants any random child. */ + if (eviction) { + descent = pindex->index[ + __wt_random(&session->rnd) % entries]; + goto descend; + } + + /* + * There may be empty pages in the tree, and they're useless to + * us. If we don't find a non-empty page in "entries" random + * guesses, take the first non-empty page in the tree. If the + * search page contains nothing other than empty pages, restart + * from the root some number of times before giving up. + * + * Random sampling is looking for a key/value pair on a random + * leaf page, and so will accept any page that contains a valid + * key/value pair, so on-disk is fine, but deleted is not. + */ + descent = NULL; + for (i = 0; i < entries; ++i) { + descent = + pindex->index[__wt_random(&session->rnd) % entries]; + if (descent->state == WT_REF_MEM || + descent->state == WT_REF_DISK) + break; + } + if (i == entries) + for (i = 0; i < entries; ++i) { + descent = pindex->index[i]; + if (descent->state == WT_REF_MEM || + descent->state == WT_REF_DISK) + break; + } + if (i == entries || descent == NULL) { + if (--retry > 0) + goto restart; + + WT_RET(__wt_page_release(session, current, flags)); + return (WT_NOTFOUND); + } + + /* + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search at the root. + * + * On other error, simply return, the swap call ensures we're + * holding nothing on failure. + */ +descend: if ((ret = + __wt_page_swap(session, current, descent, flags)) == 0) { + current = descent; + continue; + } + if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART)) + break; + if (ret == WT_RESTART) + goto restart; + return (ret); + } + + /* + * There is no point starting with the root page: the walk will exit + * immediately. In that case we aren't holding a hazard pointer so + * there is nothing to release. + */ + if (!eviction || !__wt_ref_is_root(current)) + *refp = current; + return (0); +} + +/* + * __wt_btcur_next_random -- + * Move to a random record in the tree. There are two algorithms, one + * where we select a record at random from the whole tree on each + * retrieval and one where we first select a record at random from the + * whole tree, and then subsequently sample forward from that location. + * The sampling approach allows us to select reasonably uniform random + * points from unbalanced trees. + */ +int +__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + wt_off_t size; + uint64_t n, skip; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + btree = cbt->btree; + + /* + * Only supports row-store: applications can trivially select a random + * value from a column-store, if there were any reason to do so. + */ + if (btree->type != BTREE_ROW) + WT_RET_MSG(session, ENOTSUP, + "WT_CURSOR.next_random only supported by row-store tables"); + + WT_STAT_CONN_INCR(session, cursor_next); + WT_STAT_DATA_INCR(session, cursor_next); + +#ifdef HAVE_DIAGNOSTIC + /* + * Under some conditions we end up using the underlying cursor.next to + * walk through the object. Since there are multiple calls, we can hit + * the cursor-order checks, turn them off. + */ + __wt_cursor_key_order_reset(cbt); +#endif + + /* + * If we don't have a current position in the tree, or if retrieving + * random values without sampling, pick a roughly random leaf page in + * the tree and return an entry from it. + */ + if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { + WT_ERR(__cursor_func_init(cbt, true)); + WT_WITH_PAGE_INDEX(session, + ret = __wt_random_descent(session, &cbt->ref, false)); + if (ret == 0) + goto random_page_entry; + + /* + * Random descent may return not-found: the tree might be empty + * or have so many deleted items we didn't find any valid pages. + * We can't return WT_NOTFOUND to the application unless a tree + * is really empty, fallback to skipping through tree pages. + */ + WT_ERR_NOTFOUND_OK(ret); + } + + /* + * Cursor through the tree, skipping past the sample size of the leaf + * pages in the tree between each random key return to compensate for + * unbalanced trees. + * + * If the random descent attempt failed, we don't have a configured + * sample size, use 100 for no particular reason. + */ + if (cbt->next_random_sample_size == 0) + cbt->next_random_sample_size = 100; + + /* + * If the random descent attempt failed, or it's our first skip attempt, + * we haven't yet set the pages to skip, do it now. + * + * Use the underlying file size divided by its block allocation size as + * our guess of leaf pages in the file (this can be entirely wrong, as + * it depends on how many pages are in this particular checkpoint, how + * large the leaf and internal pages really are, and other factors). + * Then, divide that value by the configured sample size and increment + * the final result to make sure tiny files don't leave us with a skip + * value of 0. + * + * !!! + * Ideally, the number would be prime to avoid restart issues. + */ + if (cbt->next_random_leaf_skip == 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); + cbt->next_random_leaf_skip = (uint64_t) + ((size / btree->allocsize) / + cbt->next_random_sample_size) + 1; + } + + /* + * Be paranoid about loop termination: first, if the last leaf page + * skipped was also the last leaf page in the tree, skip may be set to + * zero on return along with the NULL WT_REF end-of-walk condition. + * Second, if a tree has no valid pages at all (the condition after + * initial creation), we might make no progress at all, or finally, if + * a tree has only deleted pages, we'll make progress, but never get a + * useful WT_REF. And, of course, the tree can switch from one of these + * states to another without warning. Decrement skip regardless of what + * is happening in the search, guarantee we eventually quit. + * + * Pages read for data sampling aren't "useful"; don't update the read + * generation of pages already in memory, and if a page is read, set + * its generation to a low value so it is evicted quickly. + */ + for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { + n = skip; + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + if (n == skip) { + if (skip == 0) + break; + --skip; + } + } + + /* + * We can't return WT_NOTFOUND to the application unless a tree is + * really empty, fallback to a random entry from the first page in the + * tree that has anything at all. + */ + if (cbt->ref == NULL) + WT_ERR(__wt_btcur_next(cbt, false)); + +random_page_entry: + /* + * Select a random entry from the leaf page. If it's not valid, move to + * the next entry, if that doesn't work, move to the previous entry. + */ + WT_ERR(__wt_row_random_leaf(session, cbt)); + if (__wt_cursor_valid(cbt, &upd)) + WT_ERR(__wt_kv_return(session, cbt, upd)); + else { + if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) + ret = __wt_btcur_prev(cbt, false); + WT_ERR(ret); + } + return (0); + +err: WT_TRET(__cursor_reset(cbt)); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index 29380459b94..24b4f7bb33d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -265,7 +265,7 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, */ WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len)); WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, 0, &page)); - ret = __wt_row_leaf_key_copy(session, page, &page->pg_row_d[0], key); + ret = __wt_row_leaf_key_copy(session, page, &page->pg_row[0], key); __wt_page_out(session, &page); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 8ef2db67e7b..6409a1a180c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -64,10 +64,10 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) } /* Take the value from the original page cell. */ - cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]); + cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); break; case WT_PAGE_ROW_LEAF: - rip = &page->pg_row_d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; /* * If the cursor references a WT_INSERT item, take its key. diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index fde4d4fb9de..fea979cac6e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -603,9 +603,9 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, */ WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, 0, &page)); WT_ERR(__wt_row_leaf_key_copy(session, - page, &page->pg_row_d[0], &trk->row_start)); - WT_ERR(__wt_row_leaf_key_copy(session, page, - &page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop)); + page, &page->pg_row[0], &trk->row_start)); + WT_ERR(__wt_row_leaf_key_copy(session, + page, &page->pg_row[page->entries - 1], &trk->row_stop)); __wt_verbose(session, WT_VERB_SALVAGE, "%s start key %s", @@ -1235,7 +1235,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_PAGE *page; WT_SALVAGE_COOKIE *cookie, _cookie; uint64_t recno, skip, take; - uint32_t *entriesp, save_entries; + uint32_t save_entries; cookie = &_cookie; WT_CLEAR(*cookie); @@ -1244,11 +1244,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_RET(__wt_page_in(session, ref, 0)); page = ref->page; - entriesp = page->type == WT_PAGE_COL_VAR ? - &page->pg_var_entries : &page->pg_fix_entries; - - save_col_var = page->pg_var_d; - save_entries = *entriesp; + save_col_var = page->pg_var; + save_entries = page->entries; /* * Calculate the number of K/V entries we are going to skip, and @@ -1303,8 +1300,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ - page->pg_var_d = save_col_var; - *entriesp = save_entries; + page->pg_var = save_col_var; + page->entries = save_entries; ret = __wt_page_release(session, ref, 0); if (ret == 0) @@ -1973,14 +1970,14 @@ __slvg_row_build_leaf( /* We should have selected some entries, but not the entire page. */ WT_ASSERT(session, skip_start + skip_stop > 0 && - skip_start + skip_stop < page->pg_row_entries); + skip_start + skip_stop < page->entries); /* * Take a copy of this page's first key to define the start of * its range. The key may require processing, otherwise, it's * a copy from the page. */ - rip = page->pg_row_d + skip_start; + rip = page->pg_row + skip_start; WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); WT_ERR(__wt_row_ikey_incr( session, ref->home, 0, key->data, key->size, ref)); @@ -1988,14 +1985,14 @@ __slvg_row_build_leaf( /* Set the referenced flag on overflow pages we're using. */ if (trk->trk_ovfl_cnt != 0) WT_ERR(__slvg_row_ovfl(session, - trk, page, skip_start, page->pg_row_entries - skip_stop)); + trk, page, skip_start, page->entries - skip_stop)); /* * Change the page to reflect the correct record count: there is no * need to copy anything on the page itself, the entries value limits * the number of page items. */ - page->pg_row_entries -= skip_stop; + page->entries -= skip_stop; cookie->skip = skip_start; /* @@ -2014,7 +2011,7 @@ __slvg_row_build_leaf( WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ - page->pg_row_entries += skip_stop; + page->entries += skip_stop; /* * Discard our hazard pointer and evict the page, updating the @@ -2081,7 +2078,7 @@ __slvg_row_ovfl(WT_SESSION_IMPL *session, * We're merging a row-store page, and we took some number of records, * figure out which (if any) overflow records we used. */ - for (rip = page->pg_row_d + start; start < stop; ++start, ++rip) { + for (rip = page->pg_row + start; start < stop; ++start, ++rip) { copy = WT_ROW_KEY_COPY(rip); (void)__wt_row_leaf_key_info( page, copy, NULL, &cell, NULL, NULL); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index fe49f937719..45550ff627f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -54,6 +54,16 @@ __split_oldest_gen(WT_SESSION_IMPL *session) } /* + * __wt_split_obsolete -- + * Check if it is safe to free / evict based on split generation. + */ +bool +__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) +{ + return (split_gen < __split_oldest_gen(session)); +} + +/* * __split_stash_add -- * Add a new entry into the session's split stash list. */ @@ -187,7 +197,7 @@ __split_safe_free(WT_SESSION_IMPL *session, #ifdef HAVE_DIAGNOSTIC /* * __split_verify_intl_key_order -- - * Verify the key order on an internal page after a split, diagnostic only. + * Verify the key order on an internal page after a split. */ static void __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) @@ -239,6 +249,46 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) break; } } + +/* + * __split_verify_root -- + * Verify a root page involved in a split. + */ +static int +__split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_DECL_RET; + WT_REF *ref; + + /* The split is complete and live, verify all of the pages involved. */ + __split_verify_intl_key_order(session, page); + + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* + * An eviction thread might be attempting to evict the page + * (the WT_REF may be WT_REF_LOCKED), or it may be a disk based + * page (the WT_REF may be WT_REF_READING), or it may be in + * some other state. Acquire a hazard pointer for any + * in-memory pages so we know the state of the page. + * + * Ignore pages not in-memory (deleted, on-disk, being read), + * there's no in-memory structure to check. + */ + if ((ret = __wt_page_in(session, + ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) + continue; + WT_ERR(ret); + + __split_verify_intl_key_order(session, ref->page); + + WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT)); + } WT_INTL_FOREACH_END; + + return (0); + +err: /* Something really bad just happened. */ + WT_PANIC_RET(session, ret, "fatal error during page split"); +} #endif /* @@ -390,12 +440,12 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* - * __split_ref_step1 -- + * __split_ref_prepare -- * Prepare a set of WT_REFs for a move. */ static void -__split_ref_step1( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) +__split_ref_prepare(WT_SESSION_IMPL *session, + WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first) { WT_PAGE *child; WT_REF *child_ref, *ref; @@ -418,30 +468,25 @@ __split_ref_step1( child = ref->page; /* - * Block eviction and splits in newly created pages. + * Block eviction in newly created pages. * * Once the split is live, newly created internal pages might be * evicted and their WT_REF structures freed. If that happened * before all threads exit the index of the page that previously * "owned" the WT_REF, a thread might see a freed WT_REF. To - * ensure that doesn't happen, the newly created page's modify - * structure has a field with a transaction ID that's checked - * before any internal page is evicted. Unfortunately, we don't - * know the correct value until we update the original page's - * index (we need a transaction ID from after that update), but - * the act of updating the original page's index is what allows - * the eviction to happen. + * ensure that doesn't happen, the newly created page contains + * the current split generation and can't be evicted until + * all readers have left the old generation. * - * Split blocking was because historic versions of the split - * code didn't update the WT_REF.home field until after the - * split was live, so the WT_REF.home fields being updated could - * split again before the update, there's a race between splits - * as to which would update them first. The current code updates - * the WT_REF.home fields before going live (in this function), - * this shouldn't be an issue, but for now splits remain turned - * off. + * Historic, we also blocked splits in newly created pages + * because we didn't update the WT_REF.home field until after + * the split was live, so the WT_REF.home fields being updated + * could split again before the update, there's a race between + * splits as to which would update them first. The current code + * updates the WT_REF.home fields before going live (in this + * function), this isn't an issue. */ - F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + child->pg_intl_split_gen = split_gen; /* * We use a page flag to prevent the child from splitting from @@ -465,64 +510,6 @@ __split_ref_step1( } /* - * __split_ref_step2 -- - * Allow the newly created children to be evicted or split. - */ -static int -__split_ref_step2( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) -{ - WT_DECL_RET; - WT_PAGE *child; - WT_REF *ref; - uint32_t i; - - /* - * The split has gone live, enable eviction and splits on the newly - * created internal pages. - */ - WT_WRITE_BARRIER(); - - for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { - ref = pindex->index[i]; - - /* - * We don't hold hazard pointers on created pages, they cannot - * be evicted because the page-modify transaction value set as - * they were created prevents eviction. (See above, we reset - * that value as part of fixing up the page.) But, an eviction - * thread might be attempting to evict the page (the WT_REF may - * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF - * may be WT_REF_READING), or it may be in some other state. - * Acquire a hazard pointer for any in-memory pages so we know - * the state of the page. Ignore pages not in-memory (deleted, - * on-disk, being read), there's no in-memory structure to fix. - */ - if ((ret = __wt_page_in(session, - ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) - continue; - WT_ERR(ret); - - child = ref->page; - - /* The child can now be evicted or split. */ - F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, child)); -#endif - - WT_ERR(__wt_hazard_clear(session, ref)); - } - - return (0); - -err: /* Something really bad just happened. */ - WT_PANIC_RET(session, ret, "fatal error resolving a split"); -} - -/* * __split_root -- * Split the root page in-memory, deepening the tree. */ @@ -653,8 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the root page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + root->pg_intl_split_gen = split_gen; + /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, false); + __split_ref_prepare(session, alloc_index, split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -662,20 +653,17 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); WT_INTL_INDEX_SET(root, alloc_index); + alloc_index = NULL; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, root)); + ret = __split_verify_root(session, root)); + WT_ERR(ret); #endif - /* Finalize the WT_REFs we moved. */ - WT_ERR(__split_ref_step2(session, alloc_index, false)); - /* The split is complete and correct, ignore benign errors. */ + /* The split is complete and verified, ignore benign errors. */ complete = WT_ERR_IGNORE; - /* We've installed the allocated page-index, ensure error handling. */ - alloc_index = NULL; - /* * We can't free the previous root's index, there may be threads using * it. Add to the session's discard list, to be freed once we know no @@ -686,7 +674,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * fails, we don't roll back that change, because threads may already * be using the new index. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); root_decr += size; @@ -838,6 +825,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the parent page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + parent->pg_intl_split_gen = split_gen; + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -846,11 +837,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); -#endif - /* * If discarding the page's original WT_REF field, reset it to split. * Threads cursoring through the tree were blocked because that WT_REF @@ -869,16 +855,25 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, __wt_free(session, ref->page_del); } + /* + * Set the discarded WT_REF state to split, ensuring we don't + * race with any discard of the WT_REF deleted fields. + */ WT_PUBLISH(ref->state, WT_REF_SPLIT); + + /* + * Push out the change: not required for correctness, but stops + * threads spinning on incorrect page references. + */ + WT_FULL_BARRIER(); } - /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. - */ - WT_FULL_BARRIER(); +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); +#endif - /* The split is complete and correct, ignore benign errors. */ + /* The split is complete and verified, ignore benign errors. */ complete = WT_ERR_IGNORE; /* @@ -908,7 +903,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * * Acquire a new split generation. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { next_ref = pindex->index[deleted_refs[i]]; WT_ASSERT(session, next_ref->state == WT_REF_SPLIT); @@ -1160,14 +1154,21 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + page->pg_intl_split_gen = split_gen; + /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, true); + __split_ref_prepare(session, alloc_index, split_gen, true); /* Split into the parent. */ WT_ERR(__split_parent(session, page_ref, alloc_index->index, alloc_index->entries, parent_incr, false, false)); - /* Confirm the page's index hasn't moved, then update it. */ + /* + * Confirm the page's index hasn't moved, then update it, which makes + * the split visible to threads descending the tree. + */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); WT_INTL_INDEX_SET(page, replace_index); @@ -1178,19 +1179,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __split_verify_intl_key_order(session, page)); #endif - /* Finalize the WT_REFs we moved. */ - WT_ERR(__split_ref_step2(session, alloc_index, true)); - - /* The split is complete and correct, ignore benign errors. */ + /* The split is complete and verified, ignore benign errors. */ complete = WT_ERR_IGNORE; /* - * Push out the changes: not required for correctness, but no reason - * to wait. - */ - WT_FULL_BARRIER(); - - /* * We don't care about the page-index we allocated, all we needed was * the array of WT_REF structures, which has now been split into the * parent page. @@ -1207,7 +1199,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * back that change, because threads may already be using the new parent * page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); page_decr += size; @@ -1284,10 +1275,6 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, for (;;) { parent = ref->home; - /* Skip pages that aren't ready to split. */ - if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK)) - return (EBUSY); - if (trylock) WT_RET(__wt_try_writelock(session, &parent->page_lock)); else @@ -1770,9 +1757,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) /* Find the last item on the page. */ if (type == WT_PAGE_ROW_LEAF) - ins_head = page->pg_row_entries == 0 ? + ins_head = page->entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + WT_ROW_INSERT_SLOT(page, page->entries - 1); else ins_head = WT_COL_APPEND(page); moved_ins = WT_SKIP_LAST(ins_head); @@ -1822,7 +1809,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) key->size = WT_INSERT_KEY_SIZE(ins); } else WT_ERR(__wt_row_leaf_key( - session, page, &page->pg_row_d[0], key, true)); + session, page, &page->pg_row[0], key, true)); WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); parent_incr += sizeof(WT_IKEY) + key->size; __wt_scr_free(session, &key); @@ -2086,8 +2073,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref); WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard)); if ((ret = __split_insert(session, ref)) != 0) { @@ -2178,8 +2164,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref); WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { @@ -2207,8 +2192,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref); WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); ret = __split_parent(session, ref, NULL, 0, 0, false, true); @@ -2229,8 +2213,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) page = ref->page; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref); /* * This isn't a split: a reconciliation failed because we couldn't write diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index 06428b87f6e..0da0e0807bd 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -40,6 +40,8 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); + WT_STAT_SET(session, stats, cache_bytes_dirty, + __wt_btree_dirty_inuse(session)); WT_STAT_SET(session, stats, cache_bytes_inuse, __wt_btree_bytes_inuse(session)); @@ -104,8 +106,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) switch (page->type) { case WT_PAGE_COL_FIX: WT_STAT_INCR(session, stats, btree_column_fix); - WT_STAT_INCRV( - session, stats, btree_entries, page->pg_fix_entries); + WT_STAT_INCRV(session, stats, btree_entries, page->entries); break; case WT_PAGE_COL_INT: WT_STAT_INCR(session, stats, btree_column_internal); diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 340f9bb6f0e..05990918215 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -386,7 +386,7 @@ recno_chk: if (recno != vs->record_total + 1) } switch (page->type) { case WT_PAGE_COL_FIX: - vs->record_total += page->pg_fix_entries; + vs->record_total += page->entries; break; case WT_PAGE_COL_VAR: recno = 0; @@ -614,7 +614,7 @@ __verify_row_leaf_key_order( * If a tree is empty (just created), it won't have keys; if there * are no keys, we're done. */ - if (page->pg_row_entries == 0) + if (page->entries == 0) return (0); /* @@ -624,7 +624,7 @@ __verify_row_leaf_key_order( */ if (vs->max_addr->size != 0) { WT_RET(__wt_row_leaf_key_copy( - session, page, page->pg_row_d, vs->tmp1)); + session, page, page->pg_row, vs->tmp1)); /* * Compare the key against the largest key we've seen so far. @@ -653,7 +653,7 @@ __verify_row_leaf_key_order( /* Update the largest key we've seen to the last key on this page. */ WT_RET(__wt_row_leaf_key_copy(session, page, - page->pg_row_d + (page->pg_row_entries - 1), vs->max_key)); + page->pg_row + (page->entries - 1), vs->max_key)); (void)__wt_page_addr_string(session, ref, vs->max_addr); return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 049700952ee..ddaa2e5f70b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -340,9 +340,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * - * We may be passed a pointer to btree->evict_page that we are clearing - * here. We check when discarding pages that we're not discarding that - * page, so this clear must be done before the page is released. + * Clear the returned value, it makes future error handling easier. */ couple = couple_orig = ref = *refp; *refp = NULL; diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index a7920da5267..9ccb9728189 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -115,9 +115,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, page, mod->mod_col_update, ins_headp, 1); ins_headp = &mod->mod_col_update[0]; } else { - WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_col_update, ins_headp, - page->pg_var_entries); + WT_PAGE_ALLOC_AND_SWAP(session, page, + mod->mod_col_update, ins_headp, page->entries); ins_headp = &mod->mod_col_update[cbt->slot]; } diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index 64ee9e94f4c..c72d66f8796 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -240,8 +240,8 @@ leaf_only: cbt->compare = 1; return (0); } - if (recno >= current->ref_recno + page->pg_fix_entries) { - cbt->recno = current->ref_recno + page->pg_fix_entries; + if (recno >= current->ref_recno + page->entries) { + cbt->recno = current->ref_recno + page->entries; goto past_end; } else { cbt->recno = recno; @@ -257,8 +257,7 @@ leaf_only: } if ((cip = __col_var_search(current, recno, NULL)) == NULL) { cbt->recno = __col_var_last_recno(current); - cbt->slot = page->pg_var_entries == 0 ? - 0 : page->pg_var_entries - 1; + cbt->slot = page->entries == 0 ? 0 : page->entries - 1; goto past_end; } else { cbt->recno = recno; diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c index 99ee34a6c5d..032fdf7d897 100644 --- a/src/third_party/wiredtiger/src/btree/row_key.c +++ b/src/third_party/wiredtiger/src/btree/row_key.c @@ -26,7 +26,7 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) btree = S2BT(session); - if (page->pg_row_entries == 0) { /* Just checking... */ + if (page->entries == 0) { /* Just checking... */ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS); return (0); } @@ -51,15 +51,15 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) */ WT_RET(__wt_scr_alloc(session, 0, &key)); WT_RET(__wt_scr_alloc(session, - (uint32_t)__bitstr_size(page->pg_row_entries), &tmp)); + (uint32_t)__bitstr_size(page->entries), &tmp)); memset(tmp->mem, 0, tmp->memsize); if ((gap = btree->key_gap) == 0) gap = 1; - __inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap); + __inmem_row_leaf_slots(tmp->mem, 0, page->entries, gap); /* Instantiate the keys. */ - for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i) + for (rip = page->pg_row, i = 0; i < page->entries; ++rip, ++i) if (__bit_test(tmp->mem, i)) WT_ERR(__wt_row_leaf_key_work( session, page, rip, key, true)); @@ -282,7 +282,7 @@ switch_and_jump: /* Switching to a forward roll. */ * the tracking cache. */ if (slot_offset == 0) { - __wt_readlock(session, btree->ovfl_lock); + __wt_readlock(session, &btree->ovfl_lock); copy = WT_ROW_KEY_COPY(rip); if (!__wt_row_leaf_key_info(page, copy, NULL, &cell, &keyb->data, &keyb->size)) { @@ -290,7 +290,7 @@ switch_and_jump: /* Switching to a forward roll. */ ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb); } - __wt_readunlock(session, btree->ovfl_lock); + __wt_readunlock(session, &btree->ovfl_lock); WT_ERR(ret); break; } diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index a1c214e5b8b..b1a81ca3d9f 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -85,9 +85,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, if (cbt->compare == 0) { if (cbt->ins == NULL) { /* Allocate an update array as necessary. */ - WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_row_update, - upd_entry, page->pg_row_entries); + WT_PAGE_ALLOC_AND_SWAP(session, page, + mod->mod_row_update, upd_entry, page->entries); /* Set the WT_UPDATE array reference. */ upd_entry = &mod->mod_row_update[cbt->slot]; @@ -147,10 +146,10 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * slot. That's hard, so we set a flag. */ WT_PAGE_ALLOC_AND_SWAP(session, page, - mod->mod_row_insert, ins_headp, page->pg_row_entries + 1); + mod->mod_row_insert, ins_headp, page->entries + 1); ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ? - page->pg_row_entries: cbt->slot; + page->entries: cbt->slot; ins_headp = &mod->mod_row_insert[ins_slot]; /* Allocate the WT_INSERT_HEAD structure as necessary. */ diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index d4e82c458d4..9c3d467340e 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -486,14 +486,14 @@ leaf_only: if (insert && descend_right) { cbt->append_tree = 1; - if (page->pg_row_entries == 0) { - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); + if (page->entries == 0) { + cbt->slot = WT_ROW_SLOT(page, page->pg_row); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->slot = WT_ROW_SLOT(page, - page->pg_row_d + (page->pg_row_entries - 1)); + page->pg_row + (page->entries - 1)); ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } @@ -511,11 +511,11 @@ leaf_only: * doing the tests and error handling inside the loop costs about 5%. */ base = 0; - limit = page->pg_row_entries; + limit = page->entries; if (collator == NULL && srch_key->size <= WT_COMPARE_SHORT_MAXLEN) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -529,7 +529,7 @@ leaf_only: else if (collator == NULL) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -547,7 +547,7 @@ leaf_only: else for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -591,13 +591,13 @@ leaf_match: cbt->compare = 0; */ if (base == 0) { cbt->compare = 1; - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); + cbt->slot = WT_ROW_SLOT(page, page->pg_row); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->compare = -1; - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1)); + cbt->slot = WT_ROW_SLOT(page, page->pg_row + (base - 1)); ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } @@ -623,215 +623,3 @@ leaf_match: cbt->compare = 0; err: WT_TRET(__wt_page_release(session, current, 0)); return (ret); } - -/* - * __wt_row_random_leaf -- - * Return a random key from a row-store leaf page. - */ -int -__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) -{ - WT_INSERT *ins, **start, **stop; - WT_INSERT_HEAD *ins_head; - WT_PAGE *page; - uint64_t samples; - uint32_t choice, entries, i; - int level; - - page = cbt->ref->page; - start = stop = NULL; /* [-Wconditional-uninitialized] */ - entries = 0; /* [-Wconditional-uninitialized] */ - - __cursor_pos_clear(cbt); - - /* If the page has disk-based entries, select from them. */ - if (page->pg_row_entries != 0) { - cbt->compare = 0; - cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries; - - /* - * The real row-store search function builds the key, so we - * have to as well. - */ - return (__wt_row_leaf_key(session, - page, page->pg_row_d + cbt->slot, cbt->tmp, false)); - } - - /* - * If the tree is new (and not empty), it might have a large insert - * list. - * - * Walk down the list until we find a level with at least 50 entries, - * that's where we'll start rolling random numbers. The value 50 is - * used to ignore levels with only a few entries, that is, levels which - * are potentially badly skewed. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); - for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { - start = &ins_head->head[level]; - for (entries = 0, stop = start; - *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - - if (entries > 50) - break; - } - - /* - * If it's a tiny list and we went all the way to level 0, correct the - * level; entries is correctly set. - */ - if (level < 0) - level = 0; - - /* - * Step down the skip list levels, selecting a random chunk of the name - * space at each level. - */ - for (samples = entries; level > 0; samples += entries) { - /* - * There are (entries) or (entries + 1) chunks of the name space - * considered at each level. They are: between start and the 1st - * element, between the 1st and 2nd elements, and so on to the - * last chunk which is the name space after the stop element on - * the current level. This last chunk of name space may or may - * not be there: as we descend the levels of the skip list, this - * chunk may appear, depending if the next level down has - * entries logically after the stop point in the current level. - * We can't ignore those entries: because of the algorithm used - * to determine the depth of a skiplist, there may be a large - * number of entries "revealed" by descending a level. - * - * If the next level down has more items after the current stop - * point, there are (entries + 1) chunks to consider, else there - * are (entries) chunks. - */ - if (*(stop - 1) == NULL) - choice = __wt_random(&session->rnd) % entries; - else - choice = __wt_random(&session->rnd) % (entries + 1); - - if (choice == entries) { - /* - * We selected the name space after the stop element on - * this level. Set the start point to the current stop - * point, descend a level and move the stop element to - * the end of the list, that is, the end of the newly - * discovered name space, counting entries as we go. - */ - start = stop; - --start; - --level; - for (entries = 0, stop = start; - *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - } else { - /* - * We selected another name space on the level. Move the - * start pointer the selected number of entries forward - * to the start of the selected chunk (if the selected - * number is 0, start won't move). Set the stop pointer - * to the next element in the list and drop both start - * and stop down a level. - */ - for (i = 0; i < choice; ++i) - start = &(*start)->next[level]; - stop = &(*start)->next[level]; - - --start; - --stop; - --level; - - /* Count the entries in the selected name space. */ - for (entries = 0, - ins = *start; ins != *stop; ins = ins->next[level]) - ++entries; - } - } - - /* - * When we reach the bottom level, entries will already be set. Select - * a random entry from the name space and return it. - * - * It should be impossible for the entries count to be 0 at this point, - * but check for it out of paranoia and to quiet static testing tools. - */ - if (entries > 0) - entries = __wt_random(&session->rnd) % entries; - for (ins = *start; entries > 0; --entries) - ins = ins->next[0]; - - cbt->ins = ins; - cbt->ins_head = ins_head; - cbt->compare = 0; - - /* - * Random lookups in newly created collections can be slow if a page - * consists of a large skiplist. Schedule the page for eviction if we - * encounter a large skiplist. This worthwhile because applications - * that take a sample often take many samples, so the overhead of - * traversing the skip list each time accumulates to real time. - */ - if (samples > 5000) - __wt_page_evict_soon(session, cbt->ref); - - return (0); -} - -/* - * __wt_row_random_descent -- - * Find a random leaf page in a row-store tree. - */ -int -__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; - WT_REF *current, *descent; - - btree = S2BT(session); - current = NULL; - - if (0) { -restart: /* - * Discard the currently held page and restart the search from - * the root. - */ - WT_RET(__wt_page_release(session, current, 0)); - } - - /* Search the internal pages of the tree. */ - current = &btree->root; - for (;;) { - page = current->page; - if (page->type != WT_PAGE_ROW_INT) - break; - - WT_INTL_INDEX_GET(session, page, pindex); - descent = pindex->index[ - __wt_random(&session->rnd) % pindex->entries]; - - /* - * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search at the root. - * - * On other error, simply return, the swap call ensures we're - * holding nothing on failure. - */ - if ((ret = __wt_page_swap( - session, current, descent, WT_READ_RESTART_OK)) == 0) { - current = descent; - continue; - } - if (ret == WT_RESTART) - goto restart; - return (ret); - } - - cbt->ref = current; - return (0); -} diff --git a/src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c b/src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c index ddfa2bdaeb8..a9be9ced1c6 100644 --- a/src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c +++ b/src/third_party/wiredtiger/src/checksum/power8/crc32_wrapper.c @@ -1,4 +1,6 @@ #if defined(__powerpc64__) +#include "wt_internal.h" + #define CRC_TABLE #include "crc32_constants.h" @@ -68,8 +70,6 @@ out: } #endif -#include "wt_internal.h" - /* * __wt_checksum_hw -- * WiredTiger: return a checksum for a chunk of memory. diff --git a/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c b/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c index f77d6768d42..28b46594220 100644 --- a/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c +++ b/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c @@ -6,8 +6,20 @@ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> * */ +#include "wt_internal.h" + #include <sys/types.h> #include <endian.h> + +#if defined(HAVE_CRC32_HARDWARE) + +#include <sys/auxv.h> + +/* RHEL 7 has kernel support, but does not define this constant in the lib c headers. */ +#ifndef HWCAP_S390_VX +#define HWCAP_S390_VX 2048 +#endif + #include "crc32-s390x.h" #include "slicing-consts.h" @@ -69,8 +81,6 @@ unsigned int __wt_crc32c_le(unsigned int crc, const unsigned char *buf, size_t l /* Main CRC-32 functions */ DEFINE_CRC32_VX(__wt_crc32c_le_vx, __wt_crc32c_le_vgfm_16, __wt_crc32c_le) -#include "wt_internal.h" - /* * __wt_checksum_hw -- * WiredTiger: return a checksum for a chunk of memory. @@ -81,6 +91,8 @@ __wt_checksum_hw(const void *chunk, size_t len) return (~__wt_crc32c_le_vx(0xffffffff, chunk, len)); } +#endif + /* * __wt_checksum_init -- * WiredTiger: detect CRC hardware and set the checksum function. @@ -89,8 +101,14 @@ void __wt_checksum_init(void) { #if defined(HAVE_CRC32_HARDWARE) - __wt_process.checksum = __wt_checksum_hw; -#else + unsigned long caps = getauxval(AT_HWCAP); + + if (caps & HWCAP_S390_VX) + __wt_process.checksum = __wt_checksum_hw; + else + __wt_process.checksum = __wt_checksum_sw; + +#else /* !HAVE_CRC32_HARDWARE */ __wt_process.checksum = __wt_checksum_sw; #endif } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index e4fd7937a40..b11a8d63fdb 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -147,11 +147,12 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { confchk_WT_CONNECTION_reconfigure_statistics_log_subconfigs, 5 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -749,11 +750,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -835,11 +837,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -916,11 +919,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -997,11 +1001,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -1050,7 +1055,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { { "WT_CONNECTION.reconfigure", "async=(enabled=false,ops_max=1024,threads=2),cache_overhead=8," "cache_size=100MB,checkpoint=(log_size=0,wait=0),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",file_manager=(close_handle_minimum=250,close_idle_time=30," @@ -1261,7 +1266,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=1," + "name=,secretkey=),error_prefix=,eviction=(threads_max=8," "threads_min=1),eviction_checkpoint_target=5," "eviction_dirty_target=5,eviction_dirty_trigger=20," "eviction_target=80,eviction_trigger=95,exclusive=false," @@ -1285,7 +1290,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=1," + "name=,secretkey=),error_prefix=,eviction=(threads_max=8," "threads_min=1),eviction_checkpoint_target=5," "eviction_dirty_target=5,eviction_dirty_trigger=20," "eviction_target=80,eviction_trigger=95,exclusive=false," @@ -1309,7 +1314,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,direct_io=," "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" @@ -1330,7 +1335,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,direct_io=," "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 474b8bbad8a..124250a7a7d 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1798,6 +1798,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "checkpoint", WT_VERB_CHECKPOINT }, { "compact", WT_VERB_COMPACT }, { "evict", WT_VERB_EVICT }, + { "evict_stuck", WT_VERB_EVICT_STUCK }, { "evictserver", WT_VERB_EVICTSERVER }, { "fileops", WT_VERB_FILEOPS }, { "handleops", WT_VERB_HANDLEOPS }, @@ -1811,6 +1812,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "rebalance", WT_VERB_REBALANCE }, { "reconcile", WT_VERB_RECONCILE }, { "recovery", WT_VERB_RECOVERY }, + { "recovery_progress", WT_VERB_RECOVERY_PROGRESS }, { "salvage", WT_VERB_SALVAGE }, { "shared_cache", WT_VERB_SHARED_CACHE }, { "split", WT_VERB_SPLIT }, @@ -1986,6 +1988,16 @@ __conn_set_file_system( CONNECTION_API_CALL(conn, session, set_file_system, config, cfg); WT_UNUSED(cfg); + /* + * You can only configure a file system once, and attempting to do it + * again probably means the extension argument didn't have early-load + * set and we've already configured the default file system. + */ + if (conn->file_system != NULL) + WT_ERR_MSG(session, EPERM, + "filesystem already configured; custom filesystems should " + "enable \"early_load\" configuration"); + conn->file_system = file_system; err: API_END_RET(session, ret); @@ -2174,6 +2186,15 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, if (cval.val) F_SET(conn, WT_CONN_READONLY); + /* Configure error messages so we get them right early. */ + WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval)); + if (cval.len != 0) + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &conn->error_prefix)); + + /* Set the database home so extensions have access to it. */ + WT_ERR(__conn_home(session, home, cfg)); + /* * Load early extensions before doing further initialization (one early * extension is to configure a file system). @@ -2197,6 +2218,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR( __conn_chk_file_system(session, F_ISSET(conn, WT_CONN_READONLY))); + /* Make sure no other thread of control already owns this database. */ + WT_ERR(__conn_single(session, cfg)); + /* * Capture the config_base setting file for later use. Again, if the * application doesn't want us to read the base configuration file, @@ -2206,18 +2230,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval)); config_base_set = cval.val != 0; - /* Configure error messages so we get them right early. */ - WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval)); - if (cval.len != 0) - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &conn->error_prefix)); - - /* Get the database home. */ - WT_ERR(__conn_home(session, home, cfg)); - - /* Make sure no other thread of control already owns this database. */ - WT_ERR(__conn_single(session, cfg)); - /* * Build the real configuration stack, in the following order (where * later entries override earlier entries): diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index fe5f94ea03d..28dd06332e0 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -143,7 +143,8 @@ __wt_cache_config(WT_SESSION_IMPL *session, bool reconfigure, const char *cfg[]) if (reconfigure) WT_RET(__wt_thread_group_resize( session, &conn->evict_threads, - conn->evict_threads_min, conn->evict_threads_max, + conn->evict_threads_min, + conn->evict_threads_max, WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL)); return (0); @@ -186,8 +187,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger"); - WT_RET(__wt_cond_auto_alloc(session, "cache eviction server", - false, 10000, WT_MILLION, &cache->evict_cond)); + WT_RET(__wt_cond_auto_alloc(session, + "cache eviction server", 10000, WT_MILLION, &cache->evict_cond)); WT_RET(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass")); WT_RET(__wt_spin_init(session, &cache->evict_queue_lock, "cache eviction queue")); @@ -311,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) cache->bytes_dirty_intl + cache->bytes_dirty_leaf, cache->pages_dirty_intl + cache->pages_dirty_leaf); - WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond)); + WT_TRET(__wt_cond_destroy(session, &cache->evict_cond)); __wt_spin_destroy(session, &cache->evict_pass_lock); __wt_spin_destroy(session, &cache->evict_queue_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index 79c2fc23da5..ed078991581 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -32,7 +32,7 @@ */ #define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3 #define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6 -#define WT_CACHE_POOL_READ_MULTIPLIER 1 +#define WT_CACHE_POOL_READ_MULTIPLIER 1 static void __cache_pool_adjust( WT_SESSION_IMPL *, uint64_t, uint64_t, bool, bool *); @@ -104,8 +104,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) TAILQ_INIT(&cp->cache_pool_qh); WT_ERR(__wt_spin_init( session, &cp->cache_pool_lock, "cache shared pool")); - WT_ERR(__wt_cond_alloc(session, - "cache pool server", false, &cp->cache_pool_cond)); + WT_ERR(__wt_cond_alloc( + session, "cache pool server", &cp->cache_pool_cond)); __wt_process.cache_pool = cp; __wt_verbose(session, @@ -418,8 +418,9 @@ static void __cache_pool_balance(WT_SESSION_IMPL *session, bool forward) { WT_CACHE_POOL *cp; - bool adjusted; uint64_t bump_threshold, highest; + int i; + bool adjusted; cp = __wt_process.cache_pool; adjusted = false; @@ -438,11 +439,17 @@ __cache_pool_balance(WT_SESSION_IMPL *session, bool forward) /* * Actively attempt to: - * - Reduce the amount allocated, if we are over the budget + * - Reduce the amount allocated, if we are over the budget. * - Increase the amount used if there is capacity and any pressure. + * Don't keep trying indefinitely, if we aren't succeeding in reducing + * the cache in use re-assessing the participants' states is necessary. + * We are also holding a lock across this process, which can slow + * participant shutdown if we spend a long time balancing. */ - while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && - F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) { + for (i = 0; + i < 2 * WT_CACHE_POOL_BUMP_THRESHOLD && + F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && + F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN); i++) { __cache_pool_adjust( session, highest, bump_threshold, forward, &adjusted); /* @@ -565,7 +572,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, WT_CONNECTION_IMPL *entry; uint64_t adjustment, highest_percentile, pressure, reserved, smallest; u_int pct_full; - bool busy, pool_full, grow; + bool busy, decrease_ok, grow, pool_full; *adjustedp = false; cp = __wt_process.cache_pool; @@ -612,6 +619,34 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, continue; /* + * The bump threshold decreases as we try longer to balance + * the pool. Adjust how aggressively we free space from + * participants depending on how long we have been trying. + */ + decrease_ok = false; + /* + * Any participant is a candidate if we have been trying + * for long enough. + */ + if (bump_threshold == 0) + decrease_ok = true; + /* + * Participants that aren't doing application eviction and + * are showing a reasonable amount of usage are excluded + * even if we have been trying for a while. + */ + else if (bump_threshold < WT_CACHE_POOL_BUMP_THRESHOLD / 3 && + (!busy && highest > 1)) + decrease_ok = true; + /* + * Any participant that is proportionally less busy is a + * candidate from the first attempt. + */ + else if (highest > 1 && + pressure < WT_CACHE_POOL_REDUCE_THRESHOLD) + decrease_ok = true; + + /* * If the entry is currently allocated less than the reserved * size, increase its allocation. This should only happen if: * - it's the first time we've seen this member, or @@ -624,17 +659,12 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, * Conditions for reducing the amount of resources for an * entry: * - the pool is full, - * - application threads are not busy doing eviction already, * - this entry has more than the minimum amount of space in * use, - * - the read pressure in this entry is below the threshold, - * other entries need more cache, the entry has more than - * the minimum space and there is no available space in the - * pool. + * - it was determined that this slot is a good candidate */ - } else if (pool_full && !busy && - entry->cache_size > reserved && - pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) { + } else if (pool_full && + entry->cache_size > reserved && decrease_ok) { grow = false; /* * Don't drop the size down too much - or it can @@ -733,7 +763,7 @@ __wt_cache_pool_server(void *arg) F_ISSET(cache, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) __wt_cond_wait( - session, cp->cache_pool_cond, WT_MILLION); + session, cp->cache_pool_cond, WT_MILLION, NULL); /* * Re-check pool run flag - since we want to avoid getting the diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c index faeef4e71a2..7797ed4421c 100644 --- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c +++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c @@ -63,6 +63,16 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp) } /* + * __ckpt_server_run_chk -- + * Check to decide if the checkpoint server should continue running. + */ +static bool +__ckpt_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT)); +} + +/* * __ckpt_server -- * The checkpoint server thread. */ @@ -78,14 +88,18 @@ __ckpt_server(void *arg) conn = S2C(session); wt_session = (WT_SESSION *)session; - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) { + for (;;) { /* * Wait... * NOTE: If the user only configured logsize, then usecs * will be 0 and this wait won't return until signalled. */ - __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs); + __wt_cond_wait(session, + conn->ckpt_cond, conn->ckpt_usecs, __ckpt_server_run_chk); + + /* Check if we're quitting or being reconfigured. */ + if (!__ckpt_server_run_chk(session)) + break; /* * Checkpoint the database if the connection is marked dirty. @@ -113,7 +127,8 @@ __ckpt_server(void *arg) * it so we don't do another checkpoint * immediately. */ - __wt_cond_wait(session, conn->ckpt_cond, 1); + __wt_cond_wait( + session, conn->ckpt_cond, 1, NULL); } } else WT_STAT_CONN_INCR(session, txn_checkpoint_skipped); @@ -152,8 +167,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) "checkpoint-server", true, session_flags, &conn->ckpt_session)); session = conn->ckpt_session; - WT_RET(__wt_cond_alloc( - session, "checkpoint server", false, &conn->ckpt_cond)); + WT_RET(__wt_cond_alloc(session, "checkpoint server", &conn->ckpt_cond)); /* * Start the thread. diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index e9e3925c57e..866b8633f71 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -25,24 +25,22 @@ __conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) } /* - * __conn_dhandle_alloc -- + * __wt_conn_dhandle_alloc -- * Allocate a new data handle and return it linked into the connection's * list. */ -static int -__conn_dhandle_alloc(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, WT_DATA_HANDLE **dhandlep) +int +__wt_conn_dhandle_alloc( + WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; uint64_t bucket; - *dhandlep = NULL; - WT_RET(__wt_calloc_one(session, &dhandle)); - WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle")); + __wt_rwlock_init(session, &dhandle->rwlock); dhandle->name_hash = __wt_hash_city64(uri, strlen(uri)); WT_ERR(__wt_strdup(session, uri, &dhandle->name)); WT_ERR(__wt_strdup(session, checkpoint, &dhandle->checkpoint)); @@ -75,7 +73,7 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session, bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_DHANDLE_INSERT(S2C(session), dhandle, bucket); - *dhandlep = dhandle; + session->dhandle = dhandle; return (0); err: __conn_dhandle_destroy(session, dhandle); @@ -122,10 +120,7 @@ __wt_conn_dhandle_find( } } - WT_RET(__conn_dhandle_alloc(session, uri, checkpoint, &dhandle)); - - session->dhandle = dhandle; - return (0); + return (WT_NOTFOUND); } /* @@ -419,12 +414,11 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; + WT_DECL_RET; uint64_t bucket; conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - /* * If we're given a URI, then we walk only the hash list for that * name. If we don't have a URI we walk the entire dhandle list. @@ -432,29 +426,42 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, if (uri != NULL) { bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { + + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, + &conn->dhhash[bucket], hashq)); + if (dhandle == NULL) + return (0); + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || strcmp(uri, dhandle->name) != 0) continue; - WT_RET(__conn_btree_apply_internal( - session, dhandle, file_func, name_func, cfg)); + WT_ERR(__conn_btree_apply_internal(session, + dhandle, file_func, name_func, cfg)); } } else { - TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + return (0); + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || !WT_PREFIX_MATCH(dhandle->name, "file:") || WT_IS_METADATA(dhandle)) continue; - WT_RET(__conn_btree_apply_internal( - session, dhandle, file_func, name_func, cfg)); + WT_ERR(__conn_btree_apply_internal(session, + dhandle, file_func, name_func, cfg)); } } - return (0); +err: WT_DHANDLE_RELEASE(dhandle); + return (ret); } /* @@ -473,7 +480,8 @@ __wt_conn_dhandle_close_all( conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); WT_ASSERT(session, session->dhandle == NULL); bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; @@ -534,7 +542,8 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final) dhandle = session->dhandle; bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); WT_ASSERT(session, dhandle != conn->cache->evict_file_next); /* Check if the handle was reacquired by a session while we waited. */ @@ -583,7 +592,7 @@ __wt_conn_dhandle_discard_single( } /* Try to remove the handle, protected by the data handle lock. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __conn_dhandle_remove(session, final)); if (set_pass_intr) (void)__wt_atomic_subv32(&S2C(session)->cache->pass_intr, 1); diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index 02182daa7dc..287e9ca7b99 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -53,19 +53,18 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) /* Spinlocks. */ WT_RET(__wt_spin_init(session, &conn->api_lock, "api")); WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint); - WT_SPIN_INIT_TRACKED(session, &conn->dhandle_lock, handle_list); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema); - WT_SPIN_INIT_TRACKED(session, &conn->table_lock, table); WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); /* Read-write locks */ - WT_RET(__wt_rwlock_alloc( - session, &conn->hot_backup_lock, "hot backup")); + __wt_rwlock_init(session, &conn->dhandle_lock); + __wt_rwlock_init(session, &conn->hot_backup_lock); + __wt_rwlock_init(session, &conn->table_lock); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); for (i = 0; i < WT_PAGE_LOCKS; ++i) @@ -80,7 +79,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init( session, &conn->lsm_manager.switch_lock, "LSM switch queue lock")); WT_RET(__wt_cond_alloc( - session, "LSM worker cond", false, &conn->lsm_manager.work_cond)); + session, "LSM worker cond", &conn->lsm_manager.work_cond)); /* * Generation numbers. @@ -110,16 +109,15 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) * __wt_connection_destroy -- * Destroy the connection's underlying WT_CONNECTION_IMPL structure. */ -int +void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) { - WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; /* Check there's something to destroy. */ if (conn == NULL) - return (0); + return; session = conn->default_session; @@ -136,7 +134,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->api_lock); __wt_spin_destroy(session, &conn->block_lock); __wt_spin_destroy(session, &conn->checkpoint_lock); - __wt_spin_destroy(session, &conn->dhandle_lock); + __wt_rwlock_destroy(session, &conn->dhandle_lock); __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); __wt_rwlock_destroy(session, &conn->hot_backup_lock); @@ -144,17 +142,12 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); - __wt_spin_destroy(session, &conn->table_lock); + __wt_rwlock_destroy(session, &conn->table_lock); __wt_spin_destroy(session, &conn->turtle_lock); for (i = 0; i < WT_PAGE_LOCKS; ++i) __wt_spin_destroy(session, &conn->page_lock[i]); __wt_free(session, conn->page_lock); - /* Destroy the file-system configuration. */ - if (conn->file_system != NULL && conn->file_system->terminate != NULL) - WT_TRET(conn->file_system->terminate( - conn->file_system, (WT_SESSION *)session)); - /* Free allocated memory. */ __wt_free(session, conn->cfg); __wt_free(session, conn->home); @@ -163,5 +156,4 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_stat_connection_discard(session, conn); __wt_free(NULL, conn); - return (ret); } diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 8198b3a1a02..c6dd795389d 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -174,7 +174,7 @@ __logmgr_config( WT_RET(__logmgr_sync_cfg(session, cfg)); if (conn->log_cond != NULL) - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); return (0); } @@ -237,7 +237,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) * We can only archive files if a hot backup is not in progress or * if we are the backup. */ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); locked = true; if (!conn->hot_backup || backup_file != 0) { for (i = 0; i < logcount; i++) { @@ -248,7 +248,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) session, WT_LOG_FILENAME, lognum)); } } - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); locked = false; /* @@ -260,7 +260,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) if (0) err: __wt_err(session, ret, "log archive server error"); if (locked) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); return (ret); } @@ -341,7 +341,7 @@ __wt_log_truncate_files( conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_RUN) && + if (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running"); @@ -355,9 +355,9 @@ __wt_log_truncate_files( __wt_verbose(session, WT_VERB_LOG, "log_truncate_files: Archive once up to %" PRIu32, backup_file); - __wt_writelock(session, log->log_archive_lock); + __wt_writelock(session, &log->log_archive_lock); ret = __log_archive_once(session, backup_file); - __wt_writeunlock(session, log->log_archive_lock); + __wt_writeunlock(session, &log->log_archive_lock); return (ret); } @@ -433,7 +433,7 @@ __log_file_server(void *arg) */ if (!conn->hot_backup) { __wt_readlock( - session, conn->hot_backup_lock); + session, &conn->hot_backup_lock); if (!conn->hot_backup) WT_ERR_ERROR_OK( __wt_ftruncate(session, @@ -441,7 +441,7 @@ __log_file_server(void *arg) close_end_lsn.l.offset), ENOTSUP); __wt_readunlock( - session, conn->hot_backup_lock); + session, &conn->hot_backup_lock); } WT_SET_LSN(&close_end_lsn, close_end_lsn.l.file + 1, 0); @@ -505,8 +505,7 @@ __log_file_server(void *arg) locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { - __wt_cond_auto_signal( - session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); /* * We do not want to wait potentially a second * to process this. Yield to give the wrlsn @@ -517,8 +516,9 @@ __log_file_server(void *arg) continue; } } + /* Wait until the next event. */ - __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10); + __wt_cond_wait(session, conn->log_file_cond, 100000, NULL); } if (0) { @@ -730,12 +730,8 @@ __log_wrlsn_server(void *arg) if (yield++ < WT_THOUSAND) __wt_yield(); else - /* - * Send in false because if we did any work we would - * not be on this path. - */ __wt_cond_auto_wait( - session, conn->log_wrlsn_cond, did_work); + session, conn->log_wrlsn_cond, did_work, NULL); } /* * On close we need to do this one more time because there could @@ -814,10 +810,11 @@ __log_server(void *arg) * agreed not to rename or remove any files in * the database directory. */ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (!conn->hot_backup) ret = __log_prealloc_once(session); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock( + session, &conn->hot_backup_lock); WT_ERR(ret); } @@ -826,10 +823,10 @@ __log_server(void *arg) */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { if (__wt_try_writelock( - session, log->log_archive_lock) == 0) { + session, &log->log_archive_lock) == 0) { ret = __log_archive_once(session, 0); __wt_writeunlock( - session, log->log_archive_lock); + session, &log->log_archive_lock); WT_ERR(ret); } else __wt_verbose(session, WT_VERB_LOG, @@ -839,10 +836,9 @@ __log_server(void *arg) } /* Wait until the next event. */ - __wt_epoch(session, &start); - __wt_cond_auto_wait_signal(session, - conn->log_cond, did_work, &signalled); + __wt_cond_auto_wait_signal( + session, conn->log_cond, did_work, NULL, &signalled); __wt_epoch(session, &now); timediff = WT_TIMEDIFF_MS(now, start); } @@ -884,8 +880,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync")); WT_RET(__wt_spin_init(session, &log->log_writelsn_lock, "log write LSN")); - WT_RET(__wt_rwlock_alloc(session, - &log->log_archive_lock, "log archive lock")); + __wt_rwlock_init(session, &log->log_archive_lock); if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG)) log->allocsize = (uint32_t) WT_MAX(conn->buffer_alignment, WT_LOG_ALIGN); @@ -904,10 +899,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_INIT_LSN(&log->write_lsn); WT_INIT_LSN(&log->write_start_lsn); log->fileid = 0; - WT_RET(__wt_cond_alloc( - session, "log sync", false, &log->log_sync_cond)); - WT_RET(__wt_cond_alloc( - session, "log write", false, &log->log_write_cond)); + WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond)); + WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond)); WT_RET(__wt_log_open(session)); WT_RET(__wt_log_slot_init(session)); @@ -930,6 +923,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); + F_SET(conn, WT_CONN_LOG_SERVER_RUN); + /* * Start the log close thread. It is not configurable. * If logging is enabled, this thread runs. @@ -937,8 +932,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) session_flags = WT_SESSION_NO_DATA_HANDLES; WT_RET(__wt_open_internal_session(conn, "log-close-server", false, session_flags, &conn->log_file_session)); - WT_RET(__wt_cond_alloc(conn->log_file_session, - "log close server", false, &conn->log_file_cond)); + WT_RET(__wt_cond_alloc( + conn->log_file_session, "log close server", &conn->log_file_cond)); /* * Start the log file close thread. @@ -954,8 +949,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server", false, session_flags, &conn->log_wrlsn_session)); WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session, - "log write lsn server", false, 10000, WT_MILLION, - &conn->log_wrlsn_cond)); + "log write lsn server", 10000, WT_MILLION, &conn->log_wrlsn_cond)); WT_RET(__wt_thread_create(conn->log_wrlsn_session, &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); conn->log_wrlsn_tid_set = true; @@ -969,13 +963,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (conn->log_session != NULL) { WT_ASSERT(session, conn->log_cond != NULL); WT_ASSERT(session, conn->log_tid_set == true); - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); } else { /* The log server gets its own session. */ WT_RET(__wt_open_internal_session(conn, "log-server", false, session_flags, &conn->log_session)); WT_RET(__wt_cond_auto_alloc(conn->log_session, - "log server", false, 50000, WT_MILLION, &conn->log_cond)); + "log server", 50000, WT_MILLION, &conn->log_cond)); /* * Start the thread. @@ -1001,6 +995,8 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn = S2C(session); + F_CLR(conn, WT_CONN_LOG_SERVER_RUN); + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) { /* * We always set up the log_path so printlog can work without @@ -1011,7 +1007,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) return (0); } if (conn->log_tid_set) { - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = false; } @@ -1026,7 +1022,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = false; } @@ -1047,9 +1043,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) } /* Destroy the condition variables now that all threads are stopped */ - WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); - WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index d4ace127bb2..5b20377d437 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -25,7 +25,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Tell internal server threads to run: this must be set before opening * any sessions. */ - F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN); + F_SET(conn, WT_CONN_SERVER_RUN); /* WT_SESSION_IMPL array. */ WT_RET(__wt_calloc(session, @@ -100,8 +100,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_yield(); } - /* Clear any pending async ops. */ + /* + * Clear any pending async operations and shut down the async worker + * threads and system before closing LSM. + */ WT_TRET(__wt_async_flush(session)); + WT_TRET(__wt_async_destroy(session)); /* * Shut down server threads other than the eviction server, which is @@ -110,14 +114,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * exit before files are closed. */ F_CLR(conn, WT_CONN_SERVER_RUN); - WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); - WT_TRET(__wt_sweep_destroy(session)); F_SET(conn, WT_CONN_CLOSING); - WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, true)); + WT_TRET(__wt_sweep_destroy(session)); + + /* The eviction server is shut down last. */ WT_TRET(__wt_evict_destroy(session)); /* Shut down the lookaside table, after all eviction is complete. */ @@ -126,7 +130,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); - /* Shut down metadata tracking, required before creating tables. */ + /* Shut down metadata tracking. */ WT_TRET(__wt_meta_track_destroy(session)); /* @@ -140,7 +144,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE)) WT_TRET(__wt_txn_checkpoint_log( session, true, WT_TXN_LOG_CKPT_STOP, NULL)); - F_CLR(conn, WT_CONN_LOG_SERVER_RUN); WT_TRET(__wt_logmgr_destroy(session)); /* Free memory for collators, compressors, data sources. */ @@ -159,15 +162,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Discard transaction state. */ __wt_txn_global_destroy(session); - /* Close extensions, first calling any unload entry point. */ - while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { - TAILQ_REMOVE(&conn->dlhqh, dlh, q); - - if (dlh->terminate != NULL) - WT_TRET(dlh->terminate(wt_conn)); - WT_TRET(__wt_dlclose(session, dlh)); - } - /* Close the lock file, opening up the database to other connections. */ if (conn->lock_fh != NULL) WT_TRET(__wt_close(session, &conn->lock_fh)); @@ -199,8 +193,22 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_free(session, s->hazard); } + /* Destroy the file-system configuration. */ + if (conn->file_system != NULL && conn->file_system->terminate != NULL) + WT_TRET(conn->file_system->terminate( + conn->file_system, (WT_SESSION *)session)); + + /* Close extensions, first calling any unload entry point. */ + while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { + TAILQ_REMOVE(&conn->dlhqh, dlh, q); + + if (dlh->terminate != NULL) + WT_TRET(dlh->terminate(wt_conn)); + WT_TRET(__wt_dlclose(session, dlh)); + } + /* Destroy the handle. */ - WT_TRET(__wt_connection_destroy(conn)); + __wt_connection_destroy(conn); return (ret); } diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index 3bcdfd7ecb1..d89392b66c6 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -409,7 +409,6 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) struct timespec ts; struct tm *tm, _tm; WT_CONNECTION_IMPL *conn; - WT_DECL_RET; WT_FSTREAM *log_stream; conn = S2C(session); @@ -446,12 +445,9 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) * Lock the schema and walk the list of open handles, dumping * any that match the list of object sources. */ - if (conn->stat_sources != NULL) { - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply( + if (conn->stat_sources != NULL) + WT_RET(__wt_conn_btree_apply( session, NULL, __statlog_apply, NULL, NULL)); - WT_RET(ret); - } /* * Walk the list of open LSM trees, dumping any that match the @@ -485,8 +481,7 @@ __statlog_on_close(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) + if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) WT_RET_MSG(session, EINVAL, "Attempt to log statistics while a server is running"); @@ -498,6 +493,16 @@ err: __wt_scr_free(session, &tmp); } /* + * __statlog_server_run_chk -- + * Check to decide if the statistics log server should continue running. + */ +static bool +__statlog_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS)); +} + +/* * __statlog_server -- * The statistics server thread. */ @@ -525,10 +530,14 @@ __statlog_server(void *arg) WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128)); WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128)); - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) { + for (;;) { /* Wait until the next event. */ - __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs); + __wt_cond_wait(session, conn->stat_cond, + conn->stat_usecs, __statlog_server_run_chk); + + /* Check if we're quitting or being reconfigured. */ + if (!__statlog_server_run_chk(session)) + break; if (WT_STAT_ENABLED(session)) WT_ERR(__statlog_log_one(session, &path, &tmp)); @@ -563,7 +572,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn) session = conn->stat_session; WT_RET(__wt_cond_alloc( - session, "statistics log server", false, &conn->stat_cond)); + session, "statistics log server", &conn->stat_cond)); /* * Start the thread. diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index d1254d8afcc..8c186c63939 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -81,7 +81,7 @@ __sweep_expire_one(WT_SESSION_IMPL *session) * handle list lock so that connection-level handle searches * never need to retry. */ - WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + WT_RET(__wt_try_writelock(session, &dhandle->rwlock)); /* Only sweep clean trees where all updates are visible. */ if (btree->modified || @@ -95,7 +95,7 @@ __sweep_expire_one(WT_SESSION_IMPL *session) */ ret = __wt_conn_btree_sync_and_close(session, false, true); -err: __wt_writeunlock(session, dhandle->rwlock); +err: __wt_writeunlock(session, &dhandle->rwlock); return (ret); } @@ -188,7 +188,7 @@ __sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) WT_DECL_RET; /* Try to get exclusive access. */ - WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + WT_RET(__wt_try_writelock(session, &dhandle->rwlock)); /* * If there are no longer any references to the handle in any @@ -205,7 +205,7 @@ __sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) * don't retry the discard until it times out again. */ if (ret != 0) { -err: __wt_writeunlock(session, dhandle->rwlock); +err: __wt_writeunlock(session, &dhandle->rwlock); } return (ret); @@ -233,7 +233,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) if (!WT_DHANDLE_CAN_DISCARD(dhandle)) continue; - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __sweep_remove_one(session, dhandle)); if (ret == 0) WT_STAT_CONN_INCR(session, dh_sweep_remove); @@ -246,6 +246,16 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) } /* + * __sweep_server_run_chk -- + * Check to decide if the checkpoint server should continue running. + */ +static bool +__sweep_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP)); +} + +/* * __sweep_server -- * The handle sweep server thread. */ @@ -266,11 +276,15 @@ __sweep_server(void *arg) /* * Sweep for dead and excess handles. */ - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_SWEEP)) { + for (;;) { /* Wait until the next event. */ - __wt_cond_wait(session, - conn->sweep_cond, conn->sweep_interval * WT_MILLION); + __wt_cond_wait(session, conn->sweep_cond, + conn->sweep_interval * WT_MILLION, __sweep_server_run_chk); + + /* Check if we're quitting or being reconfigured. */ + if (!__sweep_server_run_chk(session)) + break; + __wt_seconds(session, &now); WT_STAT_CONN_INCR(session, dh_sweeps); @@ -390,7 +404,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session) session = conn->sweep_session; WT_RET(__wt_cond_alloc( - session, "handle sweep server", false, &conn->sweep_cond)); + session, "handle sweep server", &conn->sweep_cond)); WT_RET(__wt_thread_create( session, &conn->sweep_tid, __sweep_server, session)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 456aa2e0f02..61ced8d11e7 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -230,10 +230,10 @@ __backup_start( * We are holding the checkpoint and schema locks so schema operations * will not see the backup file list until it is complete and valid. */ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup = true; conn->hot_backup_list = NULL; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); /* We're the lock holder, we own cleanup. */ F_SET(cb, WT_CURBACKUP_LOCKER); @@ -297,9 +297,9 @@ err: /* Close the hot backup file. */ if (ret == 0) { WT_ASSERT(session, dest != NULL); WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest, false)); - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup_list = cb->list; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); } return (ret); @@ -319,9 +319,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) conn = S2C(session); /* Release all btree names held by the backup. */ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup_list = NULL; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); if (cb->list != NULL) { for (i = 0; cb->list[i] != NULL; ++i) __wt_free(session, cb->list[i]); @@ -332,9 +332,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) WT_TRET(__wt_backup_file_remove(session)); /* Checkpoint deletion can proceed, as can the next hot backup. */ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup = false; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); return (ret); } @@ -346,13 +346,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) static int __backup_all(WT_SESSION_IMPL *session) { - WT_DECL_RET; - /* Build a list of the file objects that need to be copied. */ - WT_WITH_HANDLE_LIST_LOCK(session, ret = - __wt_meta_apply_all(session, NULL, __backup_list_uri_append, NULL)); - - return (ret); + return (__wt_meta_apply_all( + session, NULL, __backup_list_uri_append, NULL)); } /* diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index 0ab992bc88c..6fc01c0421f 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -240,7 +240,17 @@ __curindex_search(WT_CURSOR *cursor) found_key = child->key; if (found_key.size < cursor->key.size) WT_ERR(WT_NOTFOUND); - found_key.size = cursor->key.size; + + /* + * Custom collators expect to see complete keys, pass an item containing + * all the visible fields so it unpacks correctly. + */ + if (cindex->index->collator != NULL && + !F_ISSET(cursor, WT_CURSTD_RAW_SEARCH)) + WT_ERR(__wt_struct_repack(session, child->key_format, + cindex->iface.key_format, &child->key, &found_key)); + else + found_key.size = cursor->key.size; WT_ERR(__wt_compare( session, cindex->index->collator, &cursor->key, &found_key, &cmp)); @@ -307,8 +317,18 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact) * so we flip the sign of the result to match what callers expect. */ found_key = child->key; - if (found_key.size > cursor->key.size) - found_key.size = cursor->key.size; + if (found_key.size > cursor->key.size) { + /* + * Custom collators expect to see complete keys, pass an item + * containing all the visible fields so it unpacks correctly. + */ + if (cindex->index->collator != NULL) + WT_ERR(__wt_struct_repack(session, + cindex->child->key_format, cindex->iface.key_format, + &child->key, &found_key)); + else + found_key.size = cursor->key.size; + } WT_ERR(__wt_compare( session, cindex->index->collator, &cursor->key, &found_key, exact)); @@ -520,8 +540,8 @@ __wt_curindex_open(WT_SESSION_IMPL *session, WT_ERR(__curindex_open_colgroups(session, cindex, cfg)); if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) - __wt_json_column_init( - cursor, table->key_format, &idx->colconf, &table->colconf); + __wt_json_column_init(cursor, uri, table->key_format, + &idx->colconf, &table->colconf); if (0) { err: WT_TRET(__curindex_close(cursor)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c index a0a3ffdd974..5870d14273e 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_json.c +++ b/src/third_party/wiredtiger/src/cursor/cur_json.c @@ -369,11 +369,11 @@ __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) * of column names. */ void -__wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, +__wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) { WT_CURSOR_JSON *json; - const char *p, *end, *beginkey; + const char *beginkey, *end, *lparen, *p; uint32_t keycnt, nkeys; json = (WT_CURSOR_JSON *)cursor->json_private; @@ -400,8 +400,16 @@ __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, keycnt++; p++; } - json->value_names.str = p; - json->value_names.len = WT_PTRDIFF(end, p); + if ((lparen = strchr(uri, '(')) != NULL) { + /* This cursor is a projection. */ + json->value_names.str = lparen; + json->value_names.len = strlen(lparen) - 1; + WT_ASSERT((WT_SESSION_IMPL *)cursor->session, + json->value_names.str[json->value_names.len] == ')'); + } else { + json->value_names.str = p; + json->value_names.len = WT_PTRDIFF(end, p); + } if (idxconf == NULL) { if (p > beginkey) p--; diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c index 3ee6554b3c0..e5b56aa406f 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_log.c +++ b/src/third_party/wiredtiger/src/cursor/cur_log.c @@ -305,7 +305,7 @@ __curlog_close(WT_CURSOR *cursor) WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)); if (F_ISSET(cl, WT_CURLOG_ARCHIVE_LOCK)) - __wt_readunlock(session, conn->log->log_archive_lock); + __wt_readunlock(session, &conn->log->log_archive_lock); __wt_free(session, cl->cur_lsn); __wt_free(session, cl->next_lsn); @@ -383,7 +383,7 @@ __wt_curlog_open(WT_SESSION_IMPL *session, WT_ERR(__wt_log_force_write(session, 1, NULL)); /* Log cursors block archiving. */ - __wt_readlock(session, log->log_archive_lock); + __wt_readlock(session, &log->log_archive_lock); F_SET(cl, WT_CURLOG_ARCHIVE_LOCK); if (0) { diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index 6264de89df9..99a9e373354 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -144,6 +144,7 @@ __wt_cursor_set_notsup(WT_CURSOR *cursor) */ int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) + WT_GCC_FUNC_ATTRIBUTE((cold)) { WT_SESSION_IMPL *session; @@ -632,6 +633,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) { + WT_DECL_RET; WT_ITEM key; /* @@ -661,9 +663,11 @@ __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) * cursors cannot reference application memory after cursor operations * and that requirement will save the day. */ - WT_RET(cursor->search(cursor)); + F_SET(cursor, WT_CURSTD_RAW_SEARCH); + ret = cursor->search(cursor); + F_CLR(cursor, WT_CURSTD_RAW_SEARCH); - return (0); + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index fae7667e44f..7e8cd153d2d 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -769,7 +769,7 @@ __curtable_complete(WT_SESSION_IMPL *session, WT_TABLE *table) return (0); /* If the table is incomplete, wait on the table lock and recheck. */ - WT_WITH_TABLE_LOCK(session, complete = table->cg_complete); + WT_WITH_TABLE_READ_LOCK(session, complete = table->cg_complete); if (!complete) WT_RET_MSG(session, EINVAL, "'%s' not available until all column groups are created", @@ -951,7 +951,7 @@ __wt_curtable_open(WT_SESSION_IMPL *session, if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) __wt_json_column_init( - cursor, table->key_format, NULL, &table->colconf); + cursor, uri, table->key_format, NULL, &table->colconf); /* * Open the colgroup cursors immediately: we're going to need them for diff --git a/src/third_party/wiredtiger/src/docs/Doxyfile b/src/third_party/wiredtiger/src/docs/Doxyfile index 69e9716b425..3d8c46962f1 100644 --- a/src/third_party/wiredtiger/src/docs/Doxyfile +++ b/src/third_party/wiredtiger/src/docs/Doxyfile @@ -216,11 +216,19 @@ ALIASES = "notyet{1}=Note: <b>"\1"</b> not yet supported in Wired "hrow{3}=<tr><th>\1</th><th>\2</th><th>\3</th></tr>" \ "hrow{4}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th></tr>" \ "hrow{5}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th></tr>" \ + "hrow{6}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th></tr>" \ + "hrow{7}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th></tr>" \ + "hrow{8}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th><th>\8</th></tr>" \ + "hrow{9}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th><th>\8</th><th>\9</th></tr>" \ "row{1}=<tr><td>\1</td></tr>" \ "row{2}=<tr><td>\1</td><td>\2</td></tr>" \ "row{3}=<tr><td>\1</td><td>\2</td><td>\3</td></tr>" \ "row{4}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td></tr>" \ "row{5}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td></tr>" \ + "row{6}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td></tr>" \ + "row{7}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td></tr>" \ + "row{8}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td><td>\8</td></tr>" \ + "row{9}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td><td>\8</td><td>\9</td></tr>" \ "configstart{2}=@param config\n Configuration string, see @ref config_strings. Permitted values:\n <table>@hrow{Name,Effect,Values}" \ "config{3}= @row{<tt>\1</tt>,\2,\3}" \ "configend= </table>" \ diff --git a/src/third_party/wiredtiger/src/docs/build-pydoc.sh b/src/third_party/wiredtiger/src/docs/build-pydoc.sh index aef88fd4c97..5e6e3635be5 100755 --- a/src/third_party/wiredtiger/src/docs/build-pydoc.sh +++ b/src/third_party/wiredtiger/src/docs/build-pydoc.sh @@ -3,4 +3,4 @@ TOP=$DOCS/.. . $TOP/config.sh cd python -PYTHONPATH=../../lang/python/src:$THRIFT_HOME/lib/python2.7/site-packages pydoc -w wiredtiger +PYTHONPATH=../../lang/python/src:$THRIFT_HOME/lib/python2.6/site-packages pydoc -w wiredtiger diff --git a/src/third_party/wiredtiger/src/docs/command-line.dox b/src/third_party/wiredtiger/src/docs/command-line.dox index 5726a1d19a1..df52324f8f8 100644 --- a/src/third_party/wiredtiger/src/docs/command-line.dox +++ b/src/third_party/wiredtiger/src/docs/command-line.dox @@ -370,6 +370,19 @@ Include only "fast" statistics in the output (equivalent to passing <code>statistics=(fast)</code>) to WT_SESSION::open_cursor. <hr> +@section util_truncate wt truncate +Truncate a table, removing all data. + +The \c truncate command truncates the specified \c uri. It is equivalent to a +call to WT_SESSION::truncate with no start or stop specified. + +@subsection util_truncate_synopsis Synopsis +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] truncate uri</code> + +@subsection util_truncate_options Options +The \c truncate command has no command-specific options. + +<hr> @section util_upgrade wt upgrade Upgrade a table. diff --git a/src/third_party/wiredtiger/src/docs/cursor-random.dox b/src/third_party/wiredtiger/src/docs/cursor-random.dox index a0a3212be6d..b6434e3d161 100644 --- a/src/third_party/wiredtiger/src/docs/cursor-random.dox +++ b/src/third_party/wiredtiger/src/docs/cursor-random.dox @@ -20,9 +20,4 @@ cursor configured using \c next_random_sample_size divides the object into \c next_random_sample_size pieces, and each subsequent retrieval returns a record from the next one of those pieces. -For example, setting \c next_random_sample_percent to \c 10 would cause -the cursor to sequentially return records from each tenth part of the -object. Setting \c next_random_sample_percent to \c 1000 would cause the -cursor to sequentially return records from each .1% of the object. - */ diff --git a/src/third_party/wiredtiger/src/docs/file-formats.dox b/src/third_party/wiredtiger/src/docs/file-formats.dox index d8990aca7a6..21dc4580bc2 100644 --- a/src/third_party/wiredtiger/src/docs/file-formats.dox +++ b/src/third_party/wiredtiger/src/docs/file-formats.dox @@ -110,7 +110,7 @@ considered. (See @subpage_single huffman for details.) compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. @@ -146,7 +146,7 @@ Huffman encoding can be high, and should be considered. compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. @@ -157,7 +157,7 @@ compression: block compression. compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. diff --git a/src/third_party/wiredtiger/src/docs/programming.dox b/src/third_party/wiredtiger/src/docs/programming.dox index 81e612e8ee8..aa76bef4614 100644 --- a/src/third_party/wiredtiger/src/docs/programming.dox +++ b/src/third_party/wiredtiger/src/docs/programming.dox @@ -66,14 +66,13 @@ each of which is ordered by one or more columns. - @subpage_single wtstats <p> - @subpage_single tune_memory_allocator -- @subpage_single tune_page_sizes +- @subpage_single tune_page_size_and_comp - @subpage_single tune_cache - @subpage_single tune_bulk_load - @subpage_single tune_cursor_persist - @subpage_single tune_read_only - @subpage_single tune_durability - @subpage_single tune_checksum -- @subpage_single tune_compression - @subpage_single tune_file_alloc - @subpage_single tune_system_buffer_cache - @subpage_single tune_transparent_huge_pages diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok index 2413cbc93fb..bc2e16b1122 100644 --- a/src/third_party/wiredtiger/src/docs/spell.ok +++ b/src/third_party/wiredtiger/src/docs/spell.ok @@ -50,6 +50,7 @@ LDFLAGS LIBS LLVM LOGREC +LRU LRVv LSB LSM @@ -167,6 +168,7 @@ dNLen dNOff dT dataN +database's dataitem dataset datasets diff --git a/src/third_party/wiredtiger/src/docs/testing.dox b/src/third_party/wiredtiger/src/docs/testing.dox index cf280e8f3ff..7d454d54212 100644 --- a/src/third_party/wiredtiger/src/docs/testing.dox +++ b/src/third_party/wiredtiger/src/docs/testing.dox @@ -27,7 +27,7 @@ The WiredTiger unit test suite includes tests that cover: The WiredTiger Python test suite is built using the WiredTiger Python API and the Python unittest functionality (the test suite requires at -least Python version 2.7). +least Python version 2.6). The WiredTiger test suite automatically runs as part of every commit into the WiredTiger GitHub source tree. diff --git a/src/third_party/wiredtiger/src/docs/top/main.dox b/src/third_party/wiredtiger/src/docs/top/main.dox index 01acc849d50..84487c13174 100644 --- a/src/third_party/wiredtiger/src/docs/top/main.dox +++ b/src/third_party/wiredtiger/src/docs/top/main.dox @@ -6,12 +6,12 @@ WiredTiger is an high performance, scalable, production quality, NoSQL, @section releases Releases <table> -@row{<b>WiredTiger 2.9.0</b> (current), +@row{<b>WiredTiger 2.9.1</b> (current), + <a href="releases/wiredtiger-2.9.1.tar.bz2"><b>[Release package]</b></a>, + <a href="2.9.1/index.html"><b>[Documentation]</b></a>} +@row{<b>WiredTiger 2.9.0</b> (previous), <a href="releases/wiredtiger-2.9.0.tar.bz2"><b>[Release package]</b></a>, <a href="2.9.0/index.html"><b>[Documentation]</b></a>} -@row{<b>WiredTiger 2.8.0</b> (previous), - <a href="releases/wiredtiger-2.8.0.tar.bz2"><b>[Release package]</b></a>, - <a href="2.8.0/index.html"><b>[Documentation]</b></a>} @row{<b>Development branch</b>, <a href="https://github.com/wiredtiger/wiredtiger"><b>[Source code]</b></a>, <a href="develop/index.html"><b>[Documentation]</b></a>} diff --git a/src/third_party/wiredtiger/src/docs/transactions.dox b/src/third_party/wiredtiger/src/docs/transactions.dox index bbbd2d52296..3b438eda366 100644 --- a/src/third_party/wiredtiger/src/docs/transactions.dox +++ b/src/third_party/wiredtiger/src/docs/transactions.dox @@ -141,7 +141,7 @@ as if the transaction started at the time of the WT_SESSION::snapshot call that created the snapshot. Named snapshots keep data pinned in cache as if a real transaction were -running for the time that the named transaction is active. The resources +running for the time that the named snapshot is active. The resources associated with named snapshots should be released by calling WT_SESSION::snapshot with a configuration that includes <code>"drop="</code>. See WT_SESSION::snapshot documentation for details of diff --git a/src/third_party/wiredtiger/src/docs/tune-compression.dox b/src/third_party/wiredtiger/src/docs/tune-compression.dox deleted file mode 100644 index 8db2151aa76..00000000000 --- a/src/third_party/wiredtiger/src/docs/tune-compression.dox +++ /dev/null @@ -1,62 +0,0 @@ -/*! @page tune_compression Compression - -WiredTiger includes a number of optional compression techniques. Configuring -compression generally decreases on-disk and in-memory resource requirements -and the amount of I/O, and increases CPU cost when data are read and written. - -Configuring compression may change application throughput. For example, -in applications using solid-state drives (where I/O is less expensive), -turning off compression may increase application performance by reducing -CPU costs; in applications where I/O costs are more expensive, turning on -compression may increase application performance by reducing the overall -number of I/O operations. - -An example of turning on row-store key prefix compression: - -@snippet ex_all.c Configure key prefix compression on - -An example of turning on row-store or column-store dictionary compression: - -@snippet ex_all.c Configure dictionary compression on - -@section compression_formats Block Compression Formats -WiredTiger provides two methods of compressing your data when using block -compression: the raw and noraw methods. These methods change how WiredTiger -works to fit data into the blocks that are stored on disk. - -@subsection noraw_compression Noraw Compression -Noraw compression is the traditional compression model where a fixed -amount of data is given to the compression system, then turned into a -compressed block of data. The amount of data chosen to compress is the -data needed to fill the uncompressed block. Thus when compressed, the block will -be smaller than the normal data size and the sizes written to disk will often -vary depending on how compressible the data being stored is. Algorithms -using noraw compression include zlib-noraw, lz4-noraw and snappy. - -@subsection raw_compression Raw Compression -WiredTiger's raw compression takes advantage of compressors that provide a -streaming compression API. Using the streaming API WiredTiger will try to fit -as much data as possible into one block. This means that blocks created -with raw compression should be of similar size. Using a streaming compression -method should also make for less overhead in compression, as the setup and -initial work for compressing is done fewer times compared to the amount of -data stored. Algorithms using raw compression include zlib, lz4. - -@subsection to_raw_or_noraw Choosing between Raw and Noraw Compression -When looking at which compression method to use the biggest consideration is -that raw compression will normally provide higher compression levels while -using more CPU for compression. - -An additional consideration is that raw compression may provide a performance -advantage in workloads where data is accessed sequentially. That is because -more data is generally packed into each block on disk. Conversely, noraw -compression may perform better for workloads with random access patterns -because each block will tend to be smaller and require less work to read and -decompress. - -See @ref file_formats_compression for more information on available -compression techniques. - -See @ref compression for information on how to configure and enable compression. - - */ diff --git a/src/third_party/wiredtiger/src/docs/tune-page-size-and-comp.dox b/src/third_party/wiredtiger/src/docs/tune-page-size-and-comp.dox new file mode 100644 index 00000000000..96b0fda2333 --- /dev/null +++ b/src/third_party/wiredtiger/src/docs/tune-page-size-and-comp.dox @@ -0,0 +1,426 @@ +/*! @page tune_page_size_and_comp Tuning page size and compression + +This document aims to explain the role played by different page sizes in +WiredTiger. It also details motivation behind an application wanting to modify +these page sizes from their default values and the procedure to do so. +Applications commonly configure page sizes based on their workload's typical key +and value size. Once a page size has been chosen, appropriate defaults for the +other configuration values are derived by WiredTiger from the page sizes, and +relatively few applications will need to modify the other page and key/value +size configuration options. WiredTiger also offers several compression options +that have an impact on the size of the data both in-memory and on-disk. Hence +while selecting page sizes, an application must also look at its desired +compression needs. Since the data and workload for a table differs from one +table to another in the database, an application can choose to set page sizes +and compression options on a per-table basis. + +@section data_life_cycle Data life cycle +Before detailing each page size, here is a review of how data gets stored inside +WiredTiger: + - WiredTiger uses the physical disks to store data durably, creating on-disk +files for the tables in the database directory. It also caches the portion of +the table being currently accessed by the application for reading or writing in +main memory. + - WiredTiger maintains a table's data in memory using a data structure called a +<a href="https://en.wikipedia.org/wiki/B-tree">B-Tree</a> ( +<a href="https://en.wikipedia.org/wiki/B%2B_tree">B+ Tree</a> to be specific), +referring to the nodes of a B-Tree as pages. Internal pages carry only keys. The +leaf pages store both keys and values. + - The format of the in-memory pages is not the same as the format of the +on-disk pages. Therefore, the in-memory pages regularly go through a process +called reconciliation to create data structures appropriate for storage on the +disk. These data structures are referred to as on-disk pages. An application can +set a maximum size separately for the internal and leaf on-disk pages otherwise +WiredTiger uses a default value. If reconciliation of an in-memory page is +leading to an on-disk page size greater than this maximum, WiredTiger creates +multiple smaller on-disk pages. + - A component of WiredTiger called the Block Manager divides the on-disk pages +into smaller chunks called blocks, which then get written to the disk. The size +of these blocks is defined by a parameter called allocation_size, which is the +underlying unit of allocation for the file the data gets stored in. An +application might choose to have data compressed before it gets stored to disk +by enabling block compression. + - A database's tables are usually much larger than the main memory available. +Not all of the data can be kept in memory at any given time. A process called +eviction takes care of making space for new data by freeing the memory of data +infrequently accessed. An eviction server regularly finds in-memory pages that +have not been accessed in a while (following an LRU algorithm). Several +background eviction threads continuously process these pages, reconcile them to +disk and remove them from the main memory. + - When an application does an insert or an update of a key/value pair, the +associated key is used to refer to an in-memory page. In the case of this page +not being in memory, appropriate on-disk page(s) are read and an in-memory page +constructed (the opposite of reconciliation). A data structure is maintained on +every in-memory page to store any insertions or modifications to the data done +on that page. As more and more data gets written to this page, the page's memory +footprint keeps growing. + - An application can choose to set the maximum size a page is allowed to grow +in-memory. A default size is set by WiredTiger if the application doesn't +specify one. To keep page management efficient, as a page grows larger in-memory +and approaches this maximum size, if possible, it is split into smaller +in-memory pages. + - When doing an insert or an update, if a page grows larger than the maximum, +the application thread is used to forcefully evict this page. This is done to +split the growing page into smaller in-memory pages and reconcile them into +on-disk pages. Once written to the disk they are removed from the main memory, +making space for more data to be written. When an application gets involved in +forced eviction, it might take longer than usual to do these inserts and +updates. It is not always possible to (force) evict a page from memory and this +page can temporarily grow larger in size than the configured maximum. This page +then remains marked to be evicted and reattempts are made as the application +puts more data in it. + +@section configurable_page_struct Configurable page structures in WiredTiger +There are three page sizes that the user can configure: + 1. The maximum page size of any type of in-memory page in the WiredTiger cache, +memory_page_max. + 2. The maximum size of the on-disk page for an internal page, internal_page_max. + 3. The maximum size of the on-disk leaf page, leaf_page_max. + +There are additional configuration settings that tune more esoteric and +specialized data. Those are included for completeness but are rarely changed. + +@subsection memory_page_max memory_page_max +The maximum size a table's page is allowed to grow to in memory before being +reconciled to disk. + - An integer, with acceptable values between 512B and 10TB + - Default size: 5 MB + - Additionally constrained by the condition: + leaf_page_max <= memory_page_max <= cache_size/10 + - Motivation to tune the value: +\n memory_page_max is significant for applications wanting to tune for +consistency in write intensive workloads. + - This is the parameter to start with for tuning and trying different values +to find the correct balance between overall throughput and individual operation +latency for each table. + - Splitting a growing in-memory page into smaller pages and reconciliation +both require exclusive access to the page which makes an application's write +operations wait. Having a large memory_page_max means that the pages will need +to be split and reconciled less often. But when that happens, the duration that +an exclusive access to the page is required is longer, increasing the latency of +an application's insert or update operations. Conversely, having a smaller +memory_page_max reduces the time taken for splitting and reconciling the pages, +but causes it to happen more frequently, forcing more frequent but shorter +exclusive accesses to the pages. + - Applications should choose the memory_page_max value considering the +trade-off between frequency of exclusive access to the pages (for reconciliation +or splitting pages into smaller pages) versus the duration that the exclusive +access is required. + - Configuration: +\n Specified as memory_page_max configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,memory_page_max=10MB" +</pre> + +@subsection internal_page_max internal_page_max +The maximum page size for the reconciled on-disk internal pages of the B-Tree, +in bytes. When an internal page grows past this size, it splits into multiple +pages. + - An integer, with acceptable values between 512B and 512MB + - Default size: 4 KB (*appropriate for applications with relatively small keys) + - Additionally constrained by the condition: the size must be a multiple of the +allocation size + - Motivation to tune the value: +\n internal_page_max is significant for applications wanting to avoid excessive +L2 cache misses while searching the tree. + - Recall that only keys are stored on internal pages, so the type and size of +the key values for a table help drive the setting for this parameter. + - Should be sized to fit into on-chip caches. + - Applications doing full-table scans with out-of-memory workloads might +increase internal_page_max to transfer more data per I/O. + - Influences the shape of the B-Tree, i.e. depth and the number of children +each page in B-Tree has. To iterate to the desired key/value pair in the B-Tree, +WiredTiger has to binary search the key-range in a page to determine the child +page to proceed to and continue down the depth until it reaches the correct leaf +page. Having an unusually deep B-Tree, or having too many children per page can +negatively impact time taken to iterate the B-Tree, slowing down the application. +The number of children per page and, hence, the tree depth depends upon the +number of keys that can be stored in an internal page, which is +internal_page_max divided by key size. Applications should choose an appropriate +internal_page_max size that avoids the B-Tree from getting too deep. + - Configuration: +\n Specified as internal_page_max configuration option to WT_SESSION::create(). +An example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,internal_page_max=16KB,leaf_page_max=1MB" +</pre> + +@subsection leaf_page_max leaf_page_max +The maximum page size for the reconciled on-disk leaf pages of the B-Tree, in +bytes. When a leaf page grows past this size, it splits into multiple pages. + - An integer, with acceptable values between 512B and 512MB + - Default size: 32 KB (*appropriate for applications with relatively small keys +and values) + - Additionally constrained by the condition: must be a multiple of the +allocation size + - Motivation to tune the value: +\n leaf_page_max is significant for applications wanting to maximize sequential +data transfer from a storage device. + - Should be sized to maximize I/O performance (when reading from disk, it is +usually desirable to read a large amount of data, assuming some locality of +reference in the application's access pattern). + - Applications doing full-table scans through out-of-cache workloads might +increase leaf_page_max to transfer more data per I/O. + - Applications focused on read/write amplification might decrease the page +size to better match the underlying storage block size. + - Configuration: +\n Specified as leaf_page_max configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,internal_page_max=16KB,leaf_page_max=1MB" +</pre> + +The following configuration items following are rarely used. They are described +for completeness: + +@subsection allocation_size allocation_size +This is the underlying unit of allocation for the file. As the unit of file +allocation, it sets the minimum page size and how much space is wasted when +storing small amounts of data and overflow items. + - an integer between 512B and 128 MB + - must a power-of-two + - default : 4 KB + - Motivation to tune the value: +\n Most applications should not need to tune the allocation size. + - To be compatible with virtual memory page sizes and direct I/O requirements +on the platform (4KB for most common server platforms) + - Smaller values decrease the file space required by overflow items. + - For example, if the allocation size is set to 4KB, an overflow item of +18,000 bytes requires 5 allocation units and wastes about 2KB of space. If the +allocation size is 16KB, the same overflow item would waste more than 10KB. + - Configuration: +\n Specified as allocation_size configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,allocation_size=4KB" +</pre> + +@subsection key_val_max internal/leaf key/value max + - Overflow items +\n Overflow items are keys and values too large to easily store on a page. Overflow +items are stored separately in the file from the page where the item logically +appears, and so reading or writing an overflow item is more expensive than an +on-page item, normally requiring additional I/O. Additionally, overflow values +are not cached in memory. This means overflow items won't affect the caching +behavior of the application. It also means that each time an overflow value is +read, it is re-read from disk. + - internal_key_max +\n The largest key stored in an internal page, in bytes. If set, keys larger than +the specified size are stored as overflow items. + - The default and the maximum allowed value are both one-tenth the size of a +newly split internal page. + - leaf_key_max +\n The largest key stored in a leaf page, in bytes. If set, keys larger than the +specified size are stored as overflow items. + - The default value is one-tenth the size of a newly split leaf page. + - leaf_value_max +\n The largest value stored in a leaf page, in bytes. If set, values larger than +the specified size are stored as overflow items + - The default is one-half the size of a newly split leaf page. + - If the size is larger than the maximum leaf page size, the page size is +temporarily ignored when large values are written. + - Motivation to tune the values: +\n Most applications should not need to tune the maximum key and value sizes. +Applications requiring a small page size, but also having latency concerns such +that the additional work to retrieve an overflow item may find modifying these +values useful. +\n Since overflow items are separately stored in the on-disk file, aren't cached +and require additional I/O to access (read or write), applications should avoid +creating overflow items. + - Since page sizes also determine the default size of overflow items, i.e., +keys and values too large to easily store on a page, they can be configured to +avoid performance penalties working with overflow items: + - Applications with large keys and values, and concerned with latency, +might increase the page size to avoid creating overflow items, in order to avoid +the additional cost of retrieving them. + - Applications with large keys and values, doing random searches, might +decrease the page size to avoid wasting cache space on overflow items that +aren't likely to be needed. + - Applications with large keys and values, doing table scans, might +increase the page size to avoid creating overflow items, as the overflow items +must be read into memory in all cases, anyway. + - internal_key_max, leaf_key_max and leaf_value_max configuration values +allow applications to change the size at which a key or value will be treated +as an overflow item. + - Most applications should not need to tune the maximum key and value +sizes. + - The value of internal_key_max is relative to the maximum internal page +size. Because the number of keys on an internal page determines the depth of the +tree, the internal_key_max value can only be adjusted within a certain range, +and the configured value will be automatically adjusted by WiredTiger, if +necessary, to ensure a reasonable number of keys fit on an internal page. + - The values of leaf_key_max and leaf_value_max are not relative to the +maximum leaf page size. If either is larger than the maximum page size, the page +size will be ignored when the larger keys and values are being written, and a +larger page will be created as necessary. + - Configuration: +\n Specified as internal_key_max, leaf_key_max and leaf_value_max configuration +options to WT_SESSION::create(). An example of configuration string for a large +leaf overflow value: + +<pre> + "key_format=S,value_format=S,leaf_page_max=16KB,leaf_value_max=256KB" +</pre> + +@subsection split_pct split_pct (split percentage) +The size (specified as percentage of internal/leaf page_max) at which the +reconciled page must be split into multiple smaller pages before being sent for +compression and then be written to the disk. If the reconciled page can fit into +a single on-disk page without the page growing beyond it's set max size, +split_pct is ignored and the page isn't split. + - an integer between 25 and 100 + - default : 75 + - Motivation to tune the value: +\n Most applications should not need to tune the split percentage size. + - This value should be selected to avoid creating a large number of tiny +pages or repeatedly splitting whenever new entries are inserted. +\n For example, if the maximum page size is 1MB, a split_pct value of 10% +would potentially result in creating a large number of 100KB pages, which may +not be optimal for future I/O. Or, if the maximum page size is 1MB, a split_pct +value of 90% would potentially result in repeatedly splitting pages as the split +pages grow to 1MB over and over. The default value for split_pct is 75%, +intended to keep large pages relatively large, while still giving split pages +room to grow. + - Configuration: +\n Specified as split_pct configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,split_pct=60" +</pre> + +@section compression_considerations Compression considerations +WiredTiger compresses data at several stages to preserve memory and disk space. +Applications can configure these different compression algorithms to tailor +their requirements between memory, disk and CPU consumption. Compression +algorithms other than block compression work by modifying how the keys and +values are represented, and hence reduce data size in-memory and on-disk. Block +compression on the other hand compress the data in its binary representation +while saving it on the disk. + +Configuring compression may change application throughput. For example, in +applications using solid-state drives (where I/O is less expensive), turning +off compression may increase application performance by reducing CPU costs; in +applications where I/O costs are more expensive, turning on compression may +increase application performance by reducing the overall number of I/O +operations. + +WiredTiger uses some internal algorithms to compress the amount of data stored +that are not configurable, but always on. For example, run-length reduces the +size requirement by storing sequential, duplicate values in the store only a +single time (with an associated count). + +Different compression options available with WiredTiger: + - Key-prefix + - Reduces the size requirement by storing any identical key prefix only once +per page. The cost is additional CPU and memory when operating on the in-memory +tree. Specifically, reverse sequential cursor movement (but not forward) through +a prefix-compressed page or the random lookup of a key/value pair will allocate +sufficient memory to hold some number of uncompressed keys. So, for example, if +key prefix compression only saves a small number of bytes per key, the +additional memory cost of instantiating the uncompressed key may mean prefix +compression is not worthwhile. Further, in cases where the on-disk cost is the +primary concern, block compression may mean prefix compression is less useful. + - Configuration: +\n Specified as prefix_compression configuration option to +WT_SESSION::create(). Applications may limit the use of prefix compression by +configuring the minimum number of bytes that must be gained before prefix +compression is used with prefix_compression_min configuration option. An example +of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,prefix_compression=true,prefix_compression_min=7" +</pre> + + - Dictionary + - Reduces the size requirement by storing any identical value only once per +page. + - Configuration: +\n Specified as dictionary configuration configuration option to +WT_SESSION::create(), which specifies the maximum number of unique values +remembered in the B-Tree row-store leaf page value dictionary. An example of +such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,dictionary=1000" +</pre> + + - Huffman + - Reduces the size requirement by compressing individual key/value items, and +can be separately configured either or both keys and values. The additional CPU +cost of Huffman encoding can be high, and should be considered. (See Huffman +Encoding for details.) + - Configuration: +\n Specified as huffman_key and/or huffman_value configuration option to +WT_SESSION::create(). These options can take values of "english" (to use a +built-in English language frequency table), "utf8<file>" or "utf16<file>" (to +use a custom utf8 or utf16 symbol frequency table file). An example of such a +configuration string is as follows: + +<pre> + "key_format=S,value_format=S,huffman_key=english,huffman_value=english" +</pre> + + - Block Compression + - Reduces the size requirement of on-disk objects by compressing blocks of +the backing object's file. The additional CPU cost of block compression can be +high, and should be considered. When block compression has been configured, +configured page sizes will not match the actual size of the page on disk. + - WiredTiger provides two methods of compressing your data when using block +compression: the raw and noraw methods. These methods change how WiredTiger +works to fit data into the blocks that are stored on disk. Applications needing +to write specific sized blocks may want to consider implementing a +WT_COMPRESSOR::compress_raw function. + - Noraw compression: +\n A fixed amount of data is given to the compression system, then turned into +a compressed block of data. The amount of data chosen to compress is the data +needed to fill the uncompressed block. Thus when compressed, the block will be +smaller than the normal data size and the sizes written to disk will often vary +depending on how compressible the data being stored is. Algorithms using noraw +compression include zlib-noraw, lz4-noraw and snappy. +Noraw compression is better suited for workloads with random access patterns +because each block will tend to be smaller and require less work to read and +decompress. + - Raw compression: +\n WiredTiger's raw compression takes advantage of compressors that provide a +streaming compression API. Using the streaming API WiredTiger will try to fit as +much data as possible into one block. This means that blocks created with raw +compression should be of similar size. Using a streaming compression method +should also make for less overhead in compression, as the setup and initial work +for compressing is done fewer times compared to the amount of data stored. +Algorithms using raw compression include zlib, lz4. +Compared to noraw, raw compression provides more compression while using more +CPU. Raw compression may provide a performance advantage in workloads where data +is accessed sequentially. That is because more data is generally packed into +each block on disk. + - Configuration: +\n Specified as the block_compressor configuration option to +WT_SESSION::create(). If WiredTiger has builtin support for "lz4", "snappy", +"zlib" or "zstd" compression, these names are available as the value to the +option. An example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,block_compressor=snappy" +</pre> + +See @ref compression for further information on how to configure and enable +different compression options. + +@subsection table_compress Table summarizing compression in WiredTiger + +<table> +@hrow{Compression Type, Supported by row-store, Supported by variable col-store, + Supported by fixed col-store, Default config, Reduces in-mem size, + Reduces on-disk size, CPU and Memory cost} +@row{Key-prefix, yes, no, no, disabled, yes, yes, minor} +@row{Dictionary, yes, yes, no, disabled, yes, yes, minor} +@row{Huffman, yes, yes, no, disabled, yes, yes, can be high} +@row{Block, yes, yes, yes, disabled, no, yes, can be high} +</table> + +*/ diff --git a/src/third_party/wiredtiger/src/docs/tune-page-sizes.dox b/src/third_party/wiredtiger/src/docs/tune-page-sizes.dox deleted file mode 100644 index 130e047a02d..00000000000 --- a/src/third_party/wiredtiger/src/docs/tune-page-sizes.dox +++ /dev/null @@ -1,142 +0,0 @@ -/*! @page tune_page_sizes Page and overflow key/value sizes - -There are seven page and key/value size configuration strings: - -- allocation size (\c allocation_size), -- page sizes (\c internal_page_max and \c leaf_page_max), -- key and value sizes (\c internal_key_max, \c leaf_key_max and \c leaf_value_max), and the -- page-split percentage (\c split_pct). - -All seven are specified to the WT_SESSION::create method, in other -words, they are configurable on a per-file basis. - -Applications commonly configure page sizes, based on their workload's -typical key and value size. Once the correct page size has been chosen, -appropriate defaults for the other configuration values are derived from -the page sizes, and relatively few applications will need to modify the -other page and key/value size configuration options. - -An example of configuring page and key/value sizes: - -@snippet ex_all.c Create a table and configure the page size - -@section tune_page_sizes_sizes Page, key and value sizes - -The \c internal_page_max and \c leaf_page_max configuration values -specify a maximum size for Btree internal and leaf pages. That is, when -an internal or leaf page grows past that size, it splits into multiple -pages. Generally, internal pages should be sized to fit into on-chip -caches in order to minimize cache misses when searching the tree, while -leaf pages should be sized to maximize I/O performance (if reading from -disk is necessary, it is usually desirable to read a large amount of -data, assuming some locality of reference in the application's access -pattern). - -The default page size configurations (2KB for \c internal_page_max, 32KB -for \c leaf_page_max), are appropriate for applications with relatively -small keys and values. - -- Applications doing full-table scans through out-of-memory workloads -might increase both internal and leaf page sizes to transfer more data -per I/O. -- Applications focused on read/write amplification might decrease the page -size to better match the underlying storage block size. - -When block compression has been configured, configured page sizes will -not match the actual size of the page on disk. Block compression in -WiredTiger happens within the I/O subsystem, and so a page might split -even if subsequent compression would result in a resulting page size -small enough to leave as a single page. In other words, page sizes are -based on in-memory sizes, not on-disk sizes. Applications needing to -write specific sized blocks may want to consider implementing a -WT_COMPRESSOR::compress_raw function. - -The page sizes also determine the default size of overflow items, that -is, keys and values too large to easily store on a page. Overflow items -are stored separately in the file from the page where the item logically -appears, and so reading or writing an overflow item is more expensive -than an on-page item, normally requiring additional I/O. Additionally, -overflow values are not cached in memory. This means overflow items -won't affect the caching behavior of the application, but it also means -that each time an overflow value is read, it is re-read from disk. - -For both of these reasons, applications should avoid creating large -numbers of commonly referenced overflow items. This is especially -important for keys, as keys on internal pages are referenced during -random searches, not just during data retrieval. Generally, -applications should make every attempt to avoid creating overflow keys. - -- Applications with large keys and values, and concerned with latency, -might increase the page size to avoid creating overflow items, in order -to avoid the additional cost of retrieving them. - -- Applications with large keys and values, doing random searches, might -decrease the page size to avoid wasting cache space on overflow items -that aren't likely to be needed. - -- Applications with large keys and values, doing table scans, might -increase the page size to avoid creating overflow items, as the overflow -items must be read into memory in all cases, anyway. - -The \c internal_key_max, \c leaf_key_max and \c leaf_value_max -configuration values allow applications to change the size at which a -key or value will be treated as an overflow item. - -The value of \c internal_key_max is relative to the maximum internal -page size. Because the number of keys on an internal page determines -the depth of the tree, the \c internal_key_max value can only be -adjusted within a certain range, and the configured value will be -automatically adjusted by WiredTiger, if necessary to ensure a -reasonable number of keys fit on an internal page. - -The values of \c leaf_key_max and \c leaf_value_max are not relative to -the maximum leaf page size. If either is larger than the maximum page -size, the page size will be ignored when the larger keys and values are -being written, and a larger page will be created as necessary. - -Most applications should not need to tune the maximum key and value -sizes. Applications requiring a small page size, but also having -latency concerns such that the additional work to retrieve an overflow -item is an issue, may find them useful. - -An example of configuring a large leaf overflow value: - -@snippet ex_all.c Create a table and configure a large leaf value max - -@section tune_page_sizes_split_percentage Split percentage - -The \c split_pct configuration string configures the size of a split -page. When a page grows sufficiently large that it must be written as -multiple disk blocks, the newly written block size is \c split_pct -percent of the maximum page size. This value should be selected to -avoid creating a large number of tiny pages or repeatedly splitting -whenever new entries are inserted. For example, if the maximum page -size is 1MB, a \c split_pct value of 10% would potentially result in -creating a large number of 100KB pages, which may not be optimal for -future I/O. Or, if the maximum page size is 1MB, a \c split_pct value -of 90% would potentially result in repeatedly splitting pages as the -split pages grow to 1MB over and over. The default value for \c -split_pct is 75%, intended to keep large pages relatively large, while -still giving split pages room to grow. - -Most applications should not need to tune the split percentage size. - -@section tune_page_sizes_allocation_size Allocation size - -The \c allocation_size configuration value is the underlying unit of -allocation for the file. As the unit of file allocation, it sets the -minimum page size and how much space is wasted when storing small -amounts of data and overflow items. For example, if the allocation size -is set to 4KB, an overflow item of 18,000 bytes requires 5 allocation -units and wastes about 2KB of space. If the allocation size is 16KB, -the same overflow item would waste more than 10KB. - -The default allocation size is 4KB, chosen for compatibility with -virtual memory page sizes and direct I/O requirements on common server -platforms. - -Most applications should not need to tune the allocation size; it is -primarily intended for applications coping with the specific -requirements some file systems make to support features like direct I/O. - -*/ diff --git a/src/third_party/wiredtiger/src/docs/upgrading.dox b/src/third_party/wiredtiger/src/docs/upgrading.dox index 1e0e2eaf99a..f463e6bc615 100644 --- a/src/third_party/wiredtiger/src/docs/upgrading.dox +++ b/src/third_party/wiredtiger/src/docs/upgrading.dox @@ -1,19 +1,39 @@ /*! @page upgrading Upgrading WiredTiger applications -@section version_291 Upgrading to Version 2.9.1 + +@section version_292 Upgrading to Version 2.9.2 <dl> -<dt>WiredTiger now requires Python 2.7 at minimum</dt> +<dt>WiredTiger Utility now supports truncate</dt> +<dd> +The WiredTiger Utility can now \c truncate an object. Removing all contents +from the specified object. +</dd> +<dt>Handle list lock statistics</dt> <dd> -The minimum version of Python supported by WiredTiger is now 2.7 up from the -previous version of 2.6. This is due to extra unit tests added in this release -that depend on 2.7. This is not due to a change in the Python API. +In the 2.9.1 release we added statistics tracking handle list lock timing, we +have switched that lock from a spin lock to a read-write lock, and consequently +changed the statistics tracking lock related wait time. </dd> +</dl> + +@section version_291 Upgrading to Version 2.9.1 +<dl> <dt>Changes to hazard pointer configuration</dt> <dd> The \c hazard_max parameter to ::wiredtiger_open is now ignored. Memory is allocated for hazard pointers as required by each session. </dd> -</dl><hr> + +<dt>Change to the default fadvise behavior for data files</dt> +<dd> +The old default behavior was to advise the file system that access would be +random for data files, and there was no way to alter that. We no longer +call advise the file system of expected access patterns by default, and +have added a new \c access_pattern_hint configuration option available for +WT_SESSION::create that can be used to restore the old default by setting +the value to "random". +</dd> +</dl> @section version_290 Upgrading to Version 2.9.0 <dl> @@ -314,7 +334,7 @@ be updated. The WT_SESSION::create \c internal_item_max and \c leaf_item_max configuration strings are now deprecated in favor of the \c internal_key_max, \c leaf_key_max, and \c leaf_value_max -configuration strings. See @ref tune_page_sizes for more information. +configuration strings. See @ref tune_page_size_and_comp for more information. </dd> </dl><hr> diff --git a/src/third_party/wiredtiger/src/docs/wtperf.dox b/src/third_party/wiredtiger/src/docs/wtperf.dox index 83aadf8a776..2eac0fef3f4 100644 --- a/src/third_party/wiredtiger/src/docs/wtperf.dox +++ b/src/third_party/wiredtiger/src/docs/wtperf.dox @@ -195,14 +195,14 @@ use pareto distribution for random numbers. Zero to disable, otherwise a percen number of operations to group into each transaction in the populate phase, zero for auto-commit @par populate_threads (unsigned int, default=1) number of populate threads, 1 for bulk load +@par pre_load_data (boolean, default=false) +Scan all data prior to starting the workload phase to warm the cache @par random_range (unsigned int, default=0) if non zero choose a value from within this range as the key for insert operations @par random_value (boolean, default=false) generate random content for the value @par range_partition (boolean, default=false) partition data by range (vs hash) -@par read_range (unsigned int, default=0) -scan a range of keys after each search @par readonly (boolean, default=false) reopen the connection between populate and workload phases in readonly mode. Requires reopen_connection turned on (default). Requires that read be the only workload specified @par reopen_connection (boolean, default=true) @@ -228,7 +228,7 @@ number of tables to run operations over. Keys are divided evenly over the table @par table_count_idle (unsigned int, default=0) number of tables to create, that won't be populated. Default 0. @par threads (string, default="") -workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' +workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' @par transaction_config (string, default="") WT_SESSION.begin_transaction configuration string, applied during the populate phase when populate_ops_per_txn is nonzero @par table_name (string, default="test") diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 6fa728916de..f1949a7c320 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -15,6 +15,7 @@ static int __evict_lru_walk(WT_SESSION_IMPL *); static int __evict_page(WT_SESSION_IMPL *, bool); static int __evict_pass(WT_SESSION_IMPL *); static int __evict_server(WT_SESSION_IMPL *, bool *); +static int __evict_tune_workers(WT_SESSION_IMPL *session); static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *); static int __evict_walk_file( WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *); @@ -23,6 +24,60 @@ static int __evict_walk_file( (S2C(s)->evict_threads.current_threads > 1) /* + * __evict_lock_handle_list -- + * Try to get the handle list lock, with yield and sleep back off. + * Keep timing statistics overall. + */ +static int +__evict_lock_handle_list(WT_SESSION_IMPL *session) +{ + struct timespec enter, leave; + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_RWLOCK *dh_lock; + u_int spins; + bool dh_stats; + + conn = S2C(session); + cache = conn->cache; + dh_lock = &conn->dhandle_lock; + + /* + * Setup tracking of handle lock acquisition wait time if statistics + * are enabled. + */ + dh_stats = WT_STAT_ENABLED(session); + + if (dh_stats) + __wt_epoch(session, &enter); + + /* + * Use a custom lock acquisition back off loop so the eviction server + * notices any interrupt quickly. + */ + for (spins = 0; + (ret = __wt_try_readlock(session, dh_lock)) == EBUSY && + cache->pass_intr == 0; spins++) { + if (spins < WT_THOUSAND) + __wt_yield(); + else + __wt_sleep(0, WT_THOUSAND); + } + /* + * Only record statistics on success. + */ + WT_RET(ret); + if (dh_stats) { + __wt_epoch(session, &leave); + WT_STAT_CONN_INCRV( + session, lock_handle_list_wait_eviction, + (int64_t)WT_TIMEDIFF_US(leave, enter)); + } + return (0); +} + +/* * __evict_entry_priority -- * Get the adjusted read generation for an eviction entry. */ @@ -143,8 +198,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref) } __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } - WT_ASSERT(session, - !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); + WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); __wt_spin_unlock(session, &cache->evict_queue_lock); } @@ -213,7 +267,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session) } #endif - __wt_cond_auto_signal(session, cache->evict_cond); + __wt_cond_signal(session, cache->evict_cond); } /* @@ -226,12 +280,12 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - bool did_work; + bool did_work, was_intr; conn = S2C(session); cache = conn->cache; -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* * Ensure the cache stuck timer is initialized when starting eviction. */ @@ -254,12 +308,28 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) ret = __evict_server(session, &did_work); F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS); F_CLR(session, WT_SESSION_LOCKED_PASS); + was_intr = cache->pass_intr != 0; __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); + + /* + * If the eviction server was interrupted, wait until + * requests have been processed: the system may + * otherwise be busy so don't go to sleep. + */ + if (was_intr) { + while (cache->pass_intr != 0 && + F_ISSET(conn, WT_CONN_EVICTION_RUN) && + F_ISSET(thread, WT_THREAD_RUN)) + __wt_yield(); + continue; + } + __wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"); + /* Don't rely on signals: check periodically. */ __wt_cond_auto_wait( - session, cache->evict_cond, did_work); + session, cache->evict_cond, did_work, NULL); __wt_verbose(session, WT_VERB_EVICTSERVER, "waking"); } else WT_ERR(__evict_lru_pages(session, false)); @@ -299,14 +369,13 @@ err: WT_PANIC_MSG(session, ret, "cache eviction thread error"); static int __evict_server(WT_SESSION_IMPL *session, bool *did_work) { +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) + struct timespec now; +#endif WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; -#ifdef HAVE_DIAGNOSTIC - struct timespec now; -#endif uint64_t orig_pages_evicted; - u_int spins; conn = S2C(session); cache = conn->cache; @@ -317,7 +386,8 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) /* Evict pages from the cache as needed. */ WT_RET(__evict_pass(session)); - if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) + if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) || + cache->pass_intr != 0) return (0); /* @@ -325,35 +395,31 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) * otherwise we can block applications evicting large pages. */ if (!__wt_cache_stuck(session)) { - for (spins = 0; (ret = __wt_spin_trylock( - session, &conn->dhandle_lock)) == EBUSY && - cache->pass_intr == 0; spins++) { - if (spins < WT_THOUSAND) - __wt_yield(); - else - __wt_sleep(0, WT_THOUSAND); - } /* - * If we gave up acquiring the lock, that indicates a - * session is waiting for us to clear walks. Do that - * as part of a normal pass (without the handle list + * Try to get the handle list lock: if we give up, that + * indicates a session is waiting for us to clear walks. Do + * that as part of a normal pass (without the handle list * lock) to avoid deadlock. */ - if (ret == EBUSY) + if ((ret = __evict_lock_handle_list(session)) == EBUSY) return (0); WT_RET(ret); ret = __evict_clear_all_walks(session); - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); WT_RET(ret); cache->pages_evicted = 0; } else if (cache->pages_evicted != cache->pages_evict) { cache->pages_evicted = cache->pages_evict; -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) __wt_epoch(session, &cache->stuck_ts); } else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) { /* - * After being stuck for 5 minutes, give up. + * If we're stuck for 5 minutes in diagnostic mode, or the + * verbose evict_stuck flag is configured, log the cache + * and transaction state. + * + * If we're stuck for 5 minutes in diagnostic mode, give up. * * We don't do this check for in-memory workloads because * application threads are not blocked by the cache being full. @@ -362,11 +428,22 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) */ __wt_epoch(session, &now); if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) { - ret = ETIMEDOUT; - __wt_err(session, ret, +#if defined(HAVE_DIAGNOSTIC) + __wt_err(session, ETIMEDOUT, "Cache stuck for too long, giving up"); - WT_TRET(__wt_cache_dump(session, NULL)); + ret = ETIMEDOUT; + WT_TRET(__wt_verbose_dump_txn(session)); + WT_TRET(__wt_verbose_dump_cache(session)); return (ret); +#elif defined(HAVE_VERBOSE) + if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) { + WT_RET(__wt_verbose_dump_txn(session)); + WT_RET(__wt_verbose_dump_cache(session)); + + /* Reset the timer. */ + __wt_epoch(session, &cache->stuck_ts); + } +#endif } #endif } @@ -389,11 +466,13 @@ __wt_evict_create(WT_SESSION_IMPL *session) /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_EVICTION_RUN); - /* Create the eviction thread group */ + /* + * Create the eviction thread group. + * Set the group size to the maximum allowed sessions. + */ WT_RET(__wt_thread_group_create(session, &conn->evict_threads, - "eviction-server", conn->evict_threads_min, - conn->evict_threads_max, WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, - __wt_evict_thread_run)); + "eviction-server", conn->evict_threads_min, conn->evict_threads_max, + WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, __wt_evict_thread_run)); /* * Allow queues to be populated now that the eviction threads @@ -420,7 +499,7 @@ __wt_evict_destroy(WT_SESSION_IMPL *session) return (0); /* Wait for any eviction thread group changes to stabilize. */ - __wt_writelock(session, conn->evict_threads.lock); + __wt_writelock(session, &conn->evict_threads.lock); /* * Signal the threads to finish and stop populating the queue. @@ -548,6 +627,8 @@ __evict_pass(WT_SESSION_IMPL *session) if (loop == 0) prev = now; + if (conn->evict_threads.threads[0]->session == session) + WT_RET(__evict_tune_workers(session)); /* * Increment the shared read generation. Do this occasionally * even if eviction is not currently required, so that pages @@ -573,14 +654,6 @@ __evict_pass(WT_SESSION_IMPL *session) if (!__evict_update_work(session)) break; - /* - * Try to start a new thread if we have capacity and haven't - * reached the eviction targets. - */ - if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) - WT_RET(__wt_thread_group_start_one( - session, &conn->evict_threads, false)); - __wt_verbose(session, WT_VERB_EVICTSERVER, "Eviction pass with: Max: %" PRIu64 " In use: %" PRIu64 " Dirty: %" PRIu64, @@ -655,8 +728,8 @@ __evict_pass(WT_SESSION_IMPL *session) */ WT_STAT_CONN_INCR(session, cache_eviction_server_slept); - __wt_cond_wait( - session, cache->evict_cond, WT_THOUSAND); + __wt_cond_wait(session, + cache->evict_cond, WT_THOUSAND, NULL); continue; } @@ -683,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session) * Clear a single walk point. */ static int -__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) +__evict_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -700,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) if ((ref = btree->evict_ref) == NULL) return (0); - if (count_stat) - WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); + WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); /* - * Clear evict_ref first, in case releasing it forces eviction (we - * assert we never try to evict the current eviction walk point). + * Clear evict_ref before releasing it in case that forces eviction (we + * assert that we never try to evict the current eviction walk point). */ btree->evict_ref = NULL; + WT_WITH_DHANDLE(cache->walk_session, session->dhandle, (ret = __wt_page_release(cache->walk_session, ref, WT_READ_NO_EVICT))); @@ -730,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session) TAILQ_FOREACH(dhandle, &conn->dhqh, q) if (WT_PREFIX_MATCH(dhandle->name, "file:")) WT_WITH_DHANDLE(session, dhandle, - WT_TRET(__evict_clear_walk(session, true))); + WT_TRET(__evict_clear_walk(session))); return (ret); } @@ -775,7 +848,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) /* Clear any existing LRU eviction walk for the file. */ WT_WITH_PASS_LOCK(session, - ret = __evict_clear_walk(session, true)); + ret = __evict_clear_walk(session)); (void)__wt_atomic_subv32(&cache->pass_intr, 1); WT_ERR(ret); @@ -844,6 +917,184 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) __wt_spin_unlock(session, &cache->evict_walk_lock); } +#define EVICT_TUNE_BATCH 1 /* Max workers to add each period */ +#define EVICT_TUNE_DATAPT_MIN 3 /* Data points needed before deciding + if we should keep adding workers or + settle on an earlier value. */ +#define EVICT_TUNE_PERIOD 2 /* Tune period in seconds */ + +/* + * __evict_tune_workers -- + * Find the right number of eviction workers. Gradually ramp up the number of + * workers increasing the number in batches indicated by the setting above. + * Store the number of workers that gave us the best throughput so far and + * the number of data points we have tried. + * + * Every once in a while when we have the minimum number of data points + * we check whether the eviction throughput achieved with the current number + * of workers is the best we have seen so far. If so, we will keep increasing + * the number of workers. If not, we are past the infliction point on the + * eviction throughput curve. In that case, we will set the number of workers + * to the best observed so far and settle into a stable state. + */ +static int +__evict_tune_workers(WT_SESSION_IMPL *session) +{ + struct timespec current_time; + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + uint64_t cur_threads, delta_msec, delta_pages, i, target_threads; + uint64_t pgs_evicted_cur, pgs_evicted_persec_cur; + uint32_t thread_surplus; + + conn = S2C(session); + cache = conn->cache; + + WT_ASSERT(session, conn->evict_threads.threads[0]->session == session); + pgs_evicted_persec_cur = 0; + + if (conn->evict_tune_stable) + return (0); + + __wt_epoch(session, ¤t_time); + + /* + * Every EVICT_TUNE_PERIOD seconds record the number of + * pages evicted per second observed in the previous period. + */ + if (WT_TIMEDIFF_SEC( + current_time, conn->evict_tune_last_time) < EVICT_TUNE_PERIOD) + return (0); + + pgs_evicted_cur = cache->pages_evict; + + /* + * If we have recorded the number of pages evicted at the end of + * the previous measurement interval, we can compute the eviction + * rate in evicted pages per second achieved during the current + * measurement interval. + * Otherwise, we just record the number of evicted pages and return. + */ + if (conn->evict_tune_pgs_last == 0) + goto err; + + delta_msec = WT_TIMEDIFF_MS(current_time, conn->evict_tune_last_time); + delta_pages = pgs_evicted_cur - conn->evict_tune_pgs_last; + pgs_evicted_persec_cur = (delta_pages * WT_THOUSAND) / delta_msec; + conn->evict_tune_num_points++; + + /* Keep track of the maximum eviction throughput seen and the number + * of workers corresponding to that throughput. + */ + if (pgs_evicted_persec_cur > conn->evict_tune_pg_sec_max) { + conn->evict_tune_pg_sec_max = pgs_evicted_persec_cur; + conn->evict_tune_workers_best = + conn->evict_threads.current_threads; + } + + /* + * Compare the current number of data points with the number + * needed variable. If they are equal, we will check whether + * we are still going up on the performance curve, in which + * case we will continue increasing the number of workers, or + * we are past the inflection point on the curve, in which case + * we will go back to the best observed number of workers and + * settle into a stable state. + */ + if (conn->evict_tune_num_points >= conn->evict_tune_datapts_needed) { + if ((conn->evict_tune_workers_best == + conn->evict_threads.current_threads) && + (conn->evict_threads.current_threads < + conn->evict_threads_max)) { + /* + * Keep adding workers. We will check again + * at the next check point. + */ + conn->evict_tune_datapts_needed += + WT_MIN(EVICT_TUNE_DATAPT_MIN, + (conn->evict_threads_max + - conn->evict_threads.current_threads)/ + EVICT_TUNE_BATCH); + } else { + /* + * We are past the inflection point. Choose the + * best number of eviction workers observed and + * settle into a stable state. + */ + thread_surplus = + conn->evict_threads.current_threads - + conn->evict_tune_workers_best; + + for (i = 0; i < thread_surplus; i++) { + /* + * If we get an error, it should be because we + * were unable to acquire the thread group lock. + * Break out of trying. + */ + WT_ERR(__wt_thread_group_stop_one( + session, &conn->evict_threads, false)); + WT_STAT_CONN_INCR(session, + cache_eviction_worker_removed); + } + WT_STAT_CONN_SET(session, + cache_eviction_stable_state_workers, + conn->evict_tune_workers_best); + conn->evict_tune_stable = true; + WT_STAT_CONN_SET(session, cache_eviction_active_workers, + conn->evict_threads.current_threads); + return (0); + } + } + + /* + * If we have not added any worker threads in the past, we set the + * number needed equal to the number of data points that we must + * accumulate before deciding if we should keep adding workers or settle + * on a previously tried value of workers. + */ + if (conn->evict_tune_last_action_time.tv_sec == 0) + conn->evict_tune_datapts_needed = WT_MIN(EVICT_TUNE_DATAPT_MIN, + (conn->evict_threads_max - + conn->evict_threads.current_threads) / EVICT_TUNE_BATCH); + + if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) { + cur_threads = conn->evict_threads.current_threads; + target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH, + conn->evict_threads_max); + /* + * Start the new threads. + */ + for (i = 0; i < (target_threads - cur_threads); ++i) { + /* + * If we get an error, it should be because we were + * unable to acquire the thread group lock. Break out + * of trying. + */ + WT_ERR(__wt_thread_group_start_one(session, + &conn->evict_threads, false)); + WT_STAT_CONN_INCR(session, + cache_eviction_worker_created); + __wt_verbose(session, WT_VERB_EVICTSERVER, + "added worker thread"); + } + conn->evict_tune_last_action_time = current_time; + } + + WT_STAT_CONN_SET(session, cache_eviction_active_workers, + conn->evict_threads.current_threads); + +err: conn->evict_tune_last_time = current_time; + conn->evict_tune_pgs_last = pgs_evicted_cur; + /* + * If we got an EBUSY trying to acquire the lock just return. + * We can try to tune the workers next time. + */ + if (ret == EBUSY) + ret = 0; + return (ret); +} + /* * __evict_lru_pages -- * Get pages from the LRU queue to evict. @@ -867,7 +1118,8 @@ __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server) /* If a worker thread found the queue empty, pause. */ if (ret == WT_NOTFOUND && !is_server && F_ISSET(S2C(session), WT_CONN_EVICTION_RUN)) - __wt_cond_wait(session, conn->evict_threads.wait_cond, 10000); + __wt_cond_wait( + session, conn->evict_threads.wait_cond, 10000, NULL); return (ret == WT_NOTFOUND ? 0 : ret); } @@ -1046,7 +1298,7 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - u_int max_entries, retries, slot, spins, start_slot, total_candidates; + u_int max_entries, retries, slot, start_slot, total_candidates; bool dhandle_locked, incr; conn = S2C(session); @@ -1084,16 +1336,7 @@ retry: while (slot < max_entries) { * reference count to keep it alive while we sweep. */ if (!dhandle_locked) { - for (spins = 0; (ret = __wt_spin_trylock( - session, &conn->dhandle_lock)) == EBUSY && - cache->pass_intr == 0; - spins++) { - if (spins < WT_THOUSAND) - __wt_yield(); - else - __wt_sleep(0, WT_THOUSAND); - } - WT_ERR(ret); + WT_ERR(__evict_lock_handle_list(session)); dhandle_locked = true; } @@ -1172,7 +1415,7 @@ retry: while (slot < max_entries) { (void)__wt_atomic_addi32(&dhandle->session_inuse, 1); incr = true; - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; /* @@ -1219,7 +1462,7 @@ retry: while (slot < max_entries) { } err: if (dhandle_locked) { - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; } @@ -1282,8 +1525,8 @@ __evict_push_candidate(WT_SESSION_IMPL *session, * Get a few page eviction candidates from a single underlying file. */ static int -__evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, - u_int max_entries, u_int *slotp) +__evict_walk_file(WT_SESSION_IMPL *session, + WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; @@ -1315,6 +1558,19 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; total_slots = max_entries - queue->evict_entries; + btree_inuse = cache_inuse = 0; + target_pages_clean = target_pages_dirty = 0; + + /* + * The number of times we should fill the queue by the end of + * considering all trees. + */ +#define QUEUE_FILLS_PER_PASS 10 + + /* + * The minimum number of pages we should consider per tree. + */ +#define MIN_PAGES_PER_TREE 10 /* * The target number of pages for this tree is proportional to the @@ -1323,13 +1579,12 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, * cache (and only have to walk it once). */ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { - btree_inuse = __wt_btree_bytes_inuse(session); + btree_inuse = __wt_btree_bytes_evictable(session); cache_inuse = __wt_cache_bytes_inuse(cache); bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_clean = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); - } else - target_pages_clean = 0; + } if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) { btree_inuse = __wt_btree_dirty_leaf_inuse(session); @@ -1337,35 +1592,58 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_dirty = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); - } else - target_pages_dirty = 0; + } - target_pages = WT_MAX(target_pages_clean, target_pages_dirty); + /* + * Weight the number of target pages by the number of times we want to + * fill the cache per pass through all the trees. Note that we don't + * build this into the calculation above because we don't want to favor + * small trees, so round to a whole number of slots (zero for small + * trees) before multiplying. + */ + target_pages = WT_MAX(target_pages_clean, target_pages_dirty) * + QUEUE_FILLS_PER_PASS; + /* + * Randomly walk trees with a small fraction of the cache in case there + * are so many trees that none of them use enough of the cache to be + * allocated slots. + * + * The chance of walking a tree is equal to the chance that a random + * byte in cache belongs to the tree, weighted by how many times we + * want to fill queues during a pass through all the trees in cache. + */ if (target_pages == 0) { - /* - * Randomly walk trees with a tiny fraction of the cache in - * case there are so many trees that none of them use enough of - * the cache to be allocated slots. Walk small trees 1% of the - * time. - */ - if (__wt_random(&session->rnd) > UINT32_MAX / 100) + if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { + btree_inuse = __wt_btree_bytes_evictable(session); + cache_inuse = __wt_cache_bytes_inuse(cache); + } else { + btree_inuse = __wt_btree_dirty_leaf_inuse(session); + cache_inuse = __wt_cache_dirty_leaf_inuse(cache); + } + if (btree_inuse == 0 || cache_inuse == 0) + return (0); + if (__wt_random64(&session->rnd) % cache_inuse > + btree_inuse * QUEUE_FILLS_PER_PASS) return (0); - target_pages = 10; } + /* + * There is some cost associated with walking a tree. If we're going + * to visit this tree, always look for a minimum number of pages. + */ + if (target_pages < MIN_PAGES_PER_TREE) + target_pages = MIN_PAGES_PER_TREE; + + /* + * If the tree is dead or we're near the end of the queue, fill the + * remaining slots. + */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || target_pages > remaining_slots) target_pages = remaining_slots; end = start + target_pages; - walk_flags = - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; - - /* Randomize the walk direction. */ - if (btree->evict_walk_reverse) - FLD_SET(walk_flags, WT_READ_PREV); - /* * Examine at least a reasonable number of pages before deciding * whether to give up. When we are only looking for dirty pages, @@ -1376,9 +1654,44 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) min_pages *= 10; + walk_flags = + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; + + /* + * Choose a random point in the tree if looking for candidates in a + * tree with no starting point set. This is mostly aimed at ensuring + * eviction fairly visits all pages in trees with a lot of in-cache + * content. + */ + switch (btree->evict_walk_state) { + case WT_EVICT_WALK_NEXT: + break; + case WT_EVICT_WALK_PREV: + FLD_SET(walk_flags, WT_READ_PREV); + break; + case WT_EVICT_WALK_RAND_PREV: + FLD_SET(walk_flags, WT_READ_PREV); + /* FALLTHROUGH */ + case WT_EVICT_WALK_RAND_NEXT: + if (btree->evict_ref == NULL) { + /* Ensure internal pages indexes remain valid */ + WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent( + session, &btree->evict_ref, true)); + WT_RET_NOTFOUND_OK(ret); + } + break; + } + + /* + * Get some more eviction candidate pages, starting at the last saved + * point. Clear the saved point immediately, we assert when discarding + * pages we're not discarding an eviction point, so this clear must be + * complete before the page is released. + */ + ref = btree->evict_ref; + btree->evict_ref = NULL; + /* - * Get some more eviction candidate pages. - * * !!! Take care terminating this loop. * * Don't make an extra call to __wt_tree_walk after we hit the end of a @@ -1391,7 +1704,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, for (evict = start, pages_queued = pages_seen = refs_walked = 0; evict < end && (ret == 0 || ret == WT_NOTFOUND); ret = __wt_tree_walk_count( - session, &btree->evict_ref, &refs_walked, walk_flags)) { + session, &ref, &refs_walked, walk_flags)) { /* * Check whether we're finding a good ratio of candidates vs * pages seen. Some workloads create "deserts" in trees where @@ -1402,10 +1715,18 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, pages_seen > min_pages && (pages_queued == 0 || (pages_seen / pages_queued) > (min_pages / target_pages)); - if (give_up) + if (give_up) { + /* + * Try a different walk start point next time if a + * walk gave up. + */ + btree->evict_walk_state = + (btree->evict_walk_state + 1) % + WT_EVICT_WALK_MAX_LEGAL_VALUE; break; + } - if ((ref = btree->evict_ref) == NULL) { + if (ref == NULL) { if (++restarts == 2) break; WT_STAT_CONN_INCR( @@ -1439,7 +1760,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, if (page->read_gen == WT_READGEN_NOTSET) __wt_cache_read_gen_new(session, page); - /* Pages we no longer need (clean or dirty), are found money. */ + /* Pages being forcibly evicted go on the urgent queue. */ if (page->read_gen == WT_READGEN_OLDEST || page->memory_footprint >= btree->splitmempage) { WT_STAT_CONN_INCR( @@ -1449,7 +1770,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, continue; } - /* Pages that are empty or from dead trees are also good. */ + /* Pages that are empty or from dead trees are fast-tracked. */ if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) goto fast; @@ -1495,7 +1816,7 @@ fast: /* If the page can't be evicted, give up. */ ++pages_queued; if (WT_PAGE_IS_INTERNAL(page)) - ++internal_pages; + ++internal_pages; __wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" WT_SIZET_FMT, @@ -1508,12 +1829,10 @@ fast: /* If the page can't be evicted, give up. */ session, cache_eviction_pages_queued, (u_int)(evict - start)); /* - * If we didn't find any candidates in the file, reverse the direction - * of the walk and skip it next time. + * If we couldn't find the number of pages we were looking for, skip + * the tree next time. */ - if (give_up) - btree->evict_walk_reverse = !btree->evict_walk_reverse; - if (pages_queued == 0 && !urgent_queued) + if (pages_queued < target_pages / 2 && !urgent_queued) btree->evict_walk_period = WT_MIN( WT_MAX(1, 2 * btree->evict_walk_period), 100); else if (pages_queued == target_pages) @@ -1522,6 +1841,8 @@ fast: /* If the page can't be evicted, give up. */ btree->evict_walk_period /= 2; /* + * Give up the walk occasionally. + * * If we happen to end up on the root page or a page requiring urgent * eviction, clear it. We have to track hazard pointers, and the root * page complicates that calculation. @@ -1533,16 +1854,20 @@ fast: /* If the page can't be evicted, give up. */ * If we land on a page requiring forced eviction, move on to the next * page: we want this page evicted as quickly as possible. */ - if ((ref = btree->evict_ref) != NULL) { - /* Give up the walk occasionally. */ + if (ref != NULL) { if (__wt_ref_is_root(ref) || evict == start || give_up || ref->page->read_gen == WT_READGEN_OLDEST || - ref->page->memory_footprint >= btree->splitmempage) - WT_RET(__evict_clear_walk(session, restarts == 0)); - else if (ref->page->read_gen == WT_READGEN_OLDEST) + ref->page->memory_footprint >= btree->splitmempage) { + if (restarts == 0) + WT_STAT_CONN_INCR( + session, cache_eviction_walks_abandoned); + WT_RET(__wt_page_release(cache->walk_session, + ref, WT_READ_NO_EVICT)); + ref = NULL; + } else if (ref->page->read_gen == WT_READGEN_OLDEST) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( - session, &btree->evict_ref, - &refs_walked, walk_flags)); + session, &ref, &refs_walked, walk_flags)); + btree->evict_ref = ref; } WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked); @@ -1876,8 +2201,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) break; case WT_NOTFOUND: /* Allow the queue to re-populate before retrying. */ - __wt_cond_wait( - session, conn->evict_threads.wait_cond, 10000); + __wt_cond_wait(session, + conn->evict_threads.wait_cond, 10000, NULL); cache->app_waits++; break; default: @@ -1973,125 +2298,140 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session) S2BT(session)->evict_priority = 0; } -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* - * __wt_cache_dump -- - * Dump debugging information to a file (default stderr) about the size of - * the files in the cache. + * __verbose_dump_cache_single -- + * Output diagnostic information about a single file in the cache. */ -int -__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) +static int +__verbose_dump_cache_single(WT_SESSION_IMPL *session, + uint64_t *total_bytesp, uint64_t *total_dirty_bytesp) { - FILE *fp; - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle, *saved_dhandle; + WT_DATA_HANDLE *dhandle; WT_PAGE *page; WT_REF *next_walk; + size_t size; uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages; + + intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; + intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; + leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; + leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; + + next_walk = NULL; + while (__wt_tree_walk(session, &next_walk, + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && + next_walk != NULL) { + page = next_walk->page; + size = page->memory_footprint; + + if (WT_PAGE_IS_INTERNAL(page)) { + ++intl_pages; + intl_bytes += size; + intl_bytes_max = WT_MAX(intl_bytes_max, size); + if (__wt_page_is_modified(page)) { + ++intl_dirty_pages; + intl_dirty_bytes += size; + intl_dirty_bytes_max = + WT_MAX(intl_dirty_bytes_max, size); + } + } else { + ++leaf_pages; + leaf_bytes += size; + leaf_bytes_max = WT_MAX(leaf_bytes_max, size); + if (__wt_page_is_modified(page)) { + ++leaf_dirty_pages; + leaf_dirty_bytes += size; + leaf_dirty_bytes_max = + WT_MAX(leaf_dirty_bytes_max, size); + } + } + } + + dhandle = session->dhandle; + if (dhandle->checkpoint == NULL) + WT_RET(__wt_msg(session, "%s(<live>):", dhandle->name)); + else + WT_RET(__wt_msg(session, "%s(checkpoint=%s):", + dhandle->name, dhandle->checkpoint)); + if (intl_pages != 0) + WT_RET(__wt_msg(session, + "internal: " + "%" PRIu64 " pages, " + "%" PRIu64 "MB, " + "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " + "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " + "%" PRIu64 "MB max page, " + "%" PRIu64 "MB max dirty page", + intl_pages, + intl_bytes / WT_MEGABYTE, + intl_pages - intl_dirty_pages, + intl_dirty_pages, + (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE, + intl_dirty_bytes / WT_MEGABYTE, + intl_bytes_max / WT_MEGABYTE, + intl_dirty_bytes_max / WT_MEGABYTE)); + if (leaf_pages != 0) + WT_RET(__wt_msg(session, + "leaf: " + "%" PRIu64 " pages, " + "%" PRIu64 "MB, " + "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " + "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " + "%" PRIu64 "MB max page, " + "%" PRIu64 "MB max dirty page", + leaf_pages, + leaf_bytes / WT_MEGABYTE, + leaf_pages - leaf_dirty_pages, + leaf_dirty_pages, + (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE, + leaf_dirty_bytes / WT_MEGABYTE, + leaf_bytes_max / WT_MEGABYTE, + leaf_dirty_bytes_max / WT_MEGABYTE)); + + *total_bytesp += intl_bytes + leaf_bytes; + *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes; + + return (0); +} + +/* + * __wt_verbose_dump_cache -- + * Output diagnostic information about the cache. + */ +int +__wt_verbose_dump_cache(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; uint64_t total_bytes, total_dirty_bytes; - size_t size; conn = S2C(session); total_bytes = total_dirty_bytes = 0; - if (ofile == NULL) - fp = stderr; - else if ((fp = fopen(ofile, "w")) == NULL) - return (EIO); + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "cache dump")); - /* Note: odd string concatenation avoids spelling errors. */ - (void)fprintf(fp, "==========\n" "cache dump\n"); - - saved_dhandle = session->dhandle; - TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + break; if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; - intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; - leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; - leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; - - next_walk = NULL; - session->dhandle = dhandle; - while (__wt_tree_walk(session, &next_walk, - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && - next_walk != NULL) { - page = next_walk->page; - size = page->memory_footprint; - - if (WT_PAGE_IS_INTERNAL(page)) { - ++intl_pages; - intl_bytes += size; - intl_bytes_max = WT_MAX(intl_bytes_max, size); - if (__wt_page_is_modified(page)) { - ++intl_dirty_pages; - intl_dirty_bytes += size; - intl_dirty_bytes_max = - WT_MAX(intl_dirty_bytes_max, size); - } - } else { - ++leaf_pages; - leaf_bytes += size; - leaf_bytes_max = WT_MAX(leaf_bytes_max, size); - if (__wt_page_is_modified(page)) { - ++leaf_dirty_pages; - leaf_dirty_bytes += size; - leaf_dirty_bytes_max = - WT_MAX(leaf_dirty_bytes_max, size); - } - } - } - session->dhandle = NULL; - - if (dhandle->checkpoint == NULL) - (void)fprintf(fp, "%s(<live>): \n", dhandle->name); - else - (void)fprintf(fp, "%s(checkpoint=%s): \n", - dhandle->name, dhandle->checkpoint); - if (intl_pages != 0) - (void)fprintf(fp, - "\t" "internal: " - "%" PRIu64 " pages, " - "%" PRIu64 "MB, " - "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " - "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " - "%" PRIu64 "MB max page, " - "%" PRIu64 "MB max dirty page\n", - intl_pages, - intl_bytes >> 20, - intl_pages - intl_dirty_pages, - intl_dirty_pages, - (intl_bytes - intl_dirty_bytes) >> 20, - intl_dirty_bytes >> 20, - intl_bytes_max >> 20, - intl_dirty_bytes_max >> 20); - if (leaf_pages != 0) - (void)fprintf(fp, - "\t" "leaf: " - "%" PRIu64 " pages, " - "%" PRIu64 "MB, " - "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " - "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " - "%" PRIu64 "MB max page, " - "%" PRIu64 "MB max dirty page\n", - leaf_pages, - leaf_bytes >> 20, - leaf_pages - leaf_dirty_pages, - leaf_dirty_pages, - (leaf_bytes - leaf_dirty_bytes) >> 20, - leaf_dirty_bytes >> 20, - leaf_bytes_max >> 20, - leaf_dirty_bytes_max >> 20); - - total_bytes += intl_bytes + leaf_bytes; - total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes; + WT_WITH_DHANDLE(session, dhandle, + ret = __verbose_dump_cache_single( + session, &total_bytes, &total_dirty_bytes)); + if (ret != 0) + break; } - session->dhandle = saved_dhandle; + WT_RET(ret); /* * Apply the overhead percentage so our total bytes are comparable with @@ -2099,16 +2439,16 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) */ total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes); - (void)fprintf(fp, + WT_RET(__wt_msg(session, "cache dump: " - "total found = %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n" - "total dirty bytes = %" PRIu64 "MB\n", - total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20, - total_dirty_bytes >> 20); - (void)fprintf(fp, "==========\n"); - - if (ofile != NULL && fclose(fp) != 0) - return (EIO); + "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB", + total_bytes / WT_MEGABYTE, + __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE)); + WT_RET(__wt_msg(session, + "total dirty bytes: %" PRIu64 "MB", + total_dirty_bytes / WT_MEGABYTE)); + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + return (0); } #endif diff --git a/src/third_party/wiredtiger/src/evict/evict_stat.c b/src/third_party/wiredtiger/src/evict/evict_stat.c index 2dd3b1e83a0..7c2d5722a63 100644 --- a/src/third_party/wiredtiger/src/evict/evict_stat.c +++ b/src/third_party/wiredtiger/src/evict/evict_stat.c @@ -134,5 +134,5 @@ __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_STAT_DATA_SET(session, cache_state_root_size, btree->root.page->memory_footprint); - WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session)); + __evict_stat_walk(session); } diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 9bd835f5d09..39ca223aebf 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -435,6 +435,19 @@ struct __wt_page_modify { }; /* + * WT_COL_RLE -- + * Variable-length column-store pages have an array of page entries with RLE + * counts greater than 1 when reading the page, so it's not necessary to walk + * the page counting records to find a specific entry. We can do a binary search + * in this array, then an offset calculation to find the cell. + */ +WT_PACKED_STRUCT_BEGIN(__wt_col_rle) + uint64_t recno; /* Record number of first repeat. */ + uint64_t rle; /* Repeat count. */ + uint32_t indx; /* Slot of entry in col_var. */ +WT_PACKED_STRUCT_END + +/* * WT_PAGE -- * The WT_PAGE structure describes the in-memory page information. */ @@ -470,6 +483,7 @@ struct __wt_page { */ struct { WT_REF *parent_ref; /* Parent reference */ + uint64_t split_gen; /* Generation of last split */ struct __wt_page_index { uint32_t entries; @@ -479,6 +493,8 @@ struct __wt_page { } intl; #undef pg_intl_parent_ref #define pg_intl_parent_ref u.intl.parent_ref +#undef pg_intl_split_gen +#define pg_intl_split_gen u.intl.split_gen /* * Macros to copy/set the index because the name is obscured to ensure @@ -515,53 +531,54 @@ struct __wt_page { } while (0) /* Row-store leaf page. */ - struct { - WT_ROW *d; /* Key/value pairs */ - uint32_t entries; /* Entries */ - } row; -#undef pg_row_d -#define pg_row_d u.row.d -#undef pg_row_entries -#define pg_row_entries u.row.entries + WT_ROW *row; /* Key/value pairs */ +#undef pg_row +#define pg_row u.row /* Fixed-length column-store leaf page. */ - struct { - uint8_t *bitf; /* Values */ - uint32_t entries; /* Entries */ - } col_fix; + uint8_t *fix_bitf; /* Values */ #undef pg_fix_bitf -#define pg_fix_bitf u.col_fix.bitf -#undef pg_fix_entries -#define pg_fix_entries u.col_fix.entries +#define pg_fix_bitf u.fix_bitf /* Variable-length column-store leaf page. */ struct { - WT_COL *d; /* Values */ + WT_COL *col_var; /* Values */ /* - * Variable-length column-store files maintain a list of - * RLE entries on the page so it's unnecessary to walk - * the page counting records to find a specific entry. + * Variable-length column-store pages have an array + * of page entries with RLE counts greater than 1 when + * reading the page, so it's not necessary to walk the + * page counting records to find a specific entry. We + * can do a binary search in this array, then an offset + * calculation to find the cell. + * + * It's a separate structure to keep the page structure + * as small as possible. */ - WT_COL_RLE *repeats; /* RLE array for lookups */ - uint32_t nrepeats; /* Number of repeat slots */ - - uint32_t entries; /* Entries */ + struct __wt_col_var_repeat { + uint32_t nrepeats; /* repeat slots */ + WT_COL_RLE repeats[0]; /* lookup RLE array */ + } *repeats; +#define WT_COL_VAR_REPEAT_SET(page) \ + ((page)->u.col_var.repeats != NULL) } col_var; -#undef pg_var_d -#define pg_var_d u.col_var.d +#undef pg_var +#define pg_var u.col_var.col_var #undef pg_var_repeats -#define pg_var_repeats u.col_var.repeats +#define pg_var_repeats u.col_var.repeats->repeats #undef pg_var_nrepeats -#define pg_var_nrepeats u.col_var.nrepeats -#undef pg_var_entries -#define pg_var_entries u.col_var.entries +#define pg_var_nrepeats u.col_var.repeats->nrepeats } u; /* - * The page's type and flags are positioned at the end of the WT_PAGE - * union, it reduces cache misses in the row-store search function. + * Page entries, type and flags are positioned at the end of the WT_PAGE + * union to reduce cache misses in the row-store search function. + * + * The entries field only applies to leaf pages, internal pages use the + * page-index entries instead. */ + uint32_t entries; /* Leaf page entries */ + #define WT_PAGE_IS_INTERNAL(page) \ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) #define WT_PAGE_INVALID 0 /* Invalid page */ @@ -579,9 +596,8 @@ struct __wt_page { #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */ -#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */ -#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ -#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */ +#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ +#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ uint8_t unused[2]; /* Unused padding */ @@ -618,8 +634,8 @@ struct __wt_page { #define WT_READGEN_START_VALUE 100 #define WT_READGEN_STEP 100 uint64_t read_gen; - /* The evict pass generation for the page */ - uint64_t evict_pass_gen; + + uint64_t evict_pass_gen; /* Eviction pass generation */ size_t memory_footprint; /* Memory attached to the page */ @@ -792,11 +808,11 @@ struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */ * Walk the entries of an in-memory row-store leaf page. */ #define WT_ROW_FOREACH(page, rip, i) \ - for ((i) = (page)->pg_row_entries, \ - (rip) = (page)->pg_row_d; (i) > 0; ++(rip), --(i)) + for ((i) = (page)->entries, \ + (rip) = (page)->pg_row; (i) > 0; ++(rip), --(i)) #define WT_ROW_FOREACH_REVERSE(page, rip, i) \ - for ((i) = (page)->pg_row_entries, \ - (rip) = (page)->pg_row_d + ((page)->pg_row_entries - 1); \ + for ((i) = (page)->entries, \ + (rip) = (page)->pg_row + ((page)->entries - 1); \ (i) > 0; --(rip), --(i)) /* @@ -804,7 +820,7 @@ struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */ * Return the 0-based array offset based on a WT_ROW reference. */ #define WT_ROW_SLOT(page, rip) \ - ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row_d)) + ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row)) /* * WT_COL -- @@ -829,18 +845,6 @@ struct __wt_col { }; /* - * WT_COL_RLE -- - * In variable-length column store leaf pages, we build an array of entries - * with RLE counts greater than 1 when reading the page. We can do a binary - * search in this array, then an offset calculation to find the cell. - */ -WT_PACKED_STRUCT_BEGIN(__wt_col_rle) - uint64_t recno; /* Record number of first repeat. */ - uint64_t rle; /* Repeat count. */ - uint32_t indx; /* Slot of entry in col_var.d */ -WT_PACKED_STRUCT_END - -/* * WT_COL_PTR, WT_COL_PTR_SET -- * Return/Set a pointer corresponding to the data offset. (If the item does * not exist on the page, return a NULL.) @@ -856,15 +860,15 @@ WT_PACKED_STRUCT_END * Walk the entries of variable-length column-store leaf page. */ #define WT_COL_FOREACH(page, cip, i) \ - for ((i) = (page)->pg_var_entries, \ - (cip) = (page)->pg_var_d; (i) > 0; ++(cip), --(i)) + for ((i) = (page)->entries, \ + (cip) = (page)->pg_var; (i) > 0; ++(cip), --(i)) /* * WT_COL_SLOT -- * Return the 0-based array offset based on a WT_COL reference. */ #define WT_COL_SLOT(page, cip) \ - ((uint32_t)(((WT_COL *)cip) - (page)->pg_var_d)) + ((uint32_t)(((WT_COL *)cip) - (page)->pg_var)) /* * WT_IKEY -- @@ -1041,7 +1045,7 @@ struct __wt_insert_head { #define WT_ROW_INSERT_SMALLEST(page) \ ((page)->modify == NULL || \ (page)->modify->mod_row_insert == NULL ? \ - NULL : (page)->modify->mod_row_insert[(page)->pg_row_entries]) + NULL : (page)->modify->mod_row_insert[(page)->entries]) /* * The column-store leaf page update lists are arrays of pointers to structures, diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index 595afc453c8..976c1d2110c 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -114,7 +114,7 @@ struct __wt_btree { int split_pct; /* Split page percent */ WT_COMPRESSOR *compressor; /* Page compressor */ WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */ - WT_RWLOCK *ovfl_lock; /* Overflow lock */ + WT_RWLOCK ovfl_lock; /* Overflow lock */ uint64_t last_recno; /* Column-store last record number */ @@ -131,6 +131,7 @@ struct __wt_btree { uint64_t write_gen; /* Write generation */ uint64_t bytes_inmem; /* Cache bytes in memory. */ + uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ WT_REF *evict_ref; /* Eviction thread's location */ @@ -140,7 +141,11 @@ struct __wt_btree { u_int evict_walk_skips; /* Number of walks skipped */ u_int evict_disabled; /* Eviction disabled count */ volatile uint32_t evict_busy; /* Count of threads in eviction */ - bool evict_walk_reverse; /* Walk direction */ + enum { + WT_EVICT_WALK_NEXT, WT_EVICT_WALK_PREV, + WT_EVICT_WALK_RAND_NEXT, WT_EVICT_WALK_RAND_PREV + } evict_walk_state; /* Eviction walk state */ +#define WT_EVICT_WALK_MAX_LEGAL_VALUE WT_EVICT_WALK_RAND_PREV + 1 enum { WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 4f69c258621..315efa86fa6 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -71,6 +71,47 @@ __wt_btree_bytes_inuse(WT_SESSION_IMPL *session) } /* + * __wt_btree_bytes_evictable -- + * Return the number of bytes that can be evicted (i.e. bytes apart from + * the pinned root page). + */ +static inline uint64_t +__wt_btree_bytes_evictable(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + WT_PAGE *root_page; + uint64_t bytes_inmem, bytes_root; + + btree = S2BT(session); + cache = S2C(session)->cache; + root_page = btree->root.page; + + bytes_inmem = btree->bytes_inmem; + bytes_root = root_page == NULL ? 0 : root_page->memory_footprint; + + return (bytes_inmem <= bytes_root ? 0 : + __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root)); +} + +/* + * __wt_btree_dirty_inuse -- + * Return the number of dirty bytes in use. + */ +static inline uint64_t +__wt_btree_dirty_inuse(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + + btree = S2BT(session); + cache = S2C(session)->cache; + + return (__wt_cache_bytes_plus_overhead(cache, + btree->bytes_dirty_intl + btree->bytes_dirty_leaf)); +} + +/* * __wt_btree_dirty_leaf_inuse -- * Return the number of bytes in use by dirty leaf pages. */ @@ -105,11 +146,12 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) (void)__wt_atomic_addsize(&page->memory_footprint, size); if (__wt_page_is_modified(page)) { (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size); - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); + } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); + (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } } /* Track internal size in cache. */ @@ -238,10 +280,12 @@ __wt_cache_page_byte_dirty_decr( if (i == 5) return; - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_intl, + decr, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl, decr, "WT_CACHE.bytes_dirty_intl"); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf, decr, "WT_BTREE.bytes_dirty_leaf"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf, @@ -297,6 +341,7 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) */ size = page->memory_footprint; if (WT_PAGE_IS_INTERNAL(page)) { + (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1); } else { @@ -392,17 +437,20 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) /* Update the cache's dirty-byte count. */ if (modify != NULL && modify->bytes_dirty != 0) { - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + __wt_cache_decr_zero_uint64(session, + &btree->bytes_dirty_intl, + modify->bytes_dirty, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_zero_uint64(session, &cache->bytes_dirty_intl, modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl"); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - __wt_cache_decr_zero_uint64(session, - &cache->bytes_dirty_leaf, - modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf"); + } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { __wt_cache_decr_zero_uint64(session, &btree->bytes_dirty_leaf, modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf"); + __wt_cache_decr_zero_uint64(session, + &cache->bytes_dirty_leaf, + modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf"); } } @@ -984,7 +1032,7 @@ __wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key) if (cbt->ins == NULL) { session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; - rip = &page->u.row.d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; WT_RET(__wt_row_leaf_key(session, page, rip, key, false)); } else { key->data = WT_INSERT_KEY(cbt->ins); @@ -1183,9 +1231,9 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) */ ins_head = page->type == WT_PAGE_ROW_LEAF ? - (page->pg_row_entries == 0 ? + (page->entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) : + WT_ROW_INSERT_SLOT(page, page->entries - 1)) : WT_COL_APPEND(page); if (ins_head == NULL) return (false); @@ -1300,8 +1348,8 @@ __wt_page_can_evict( * discards its WT_REF array, and a thread traversing the original * parent page index might see a freed WT_REF. */ - if (WT_PAGE_IS_INTERNAL(page) && - F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) + if (WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete( + session, page->pg_intl_split_gen)) return (false); /* diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index 70f6169200d..abd5a1901f7 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -83,7 +83,7 @@ struct __wt_cache { uint64_t worker_evicts; /* Pages evicted by worker threads */ uint64_t evict_max_page_size; /* Largest page seen at eviction */ -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) struct timespec stuck_ts; /* Stuck timestamp */ #endif diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index 17ab39e97d2..d71978ccf35 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -364,7 +364,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) * block eviction), we don't want to highjack the thread for eviction. */ if (F_ISSET(session, WT_SESSION_NO_EVICTION | - WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) + WT_SESSION_LOCKED_HANDLE_LIST_WRITE | WT_SESSION_LOCKED_SCHEMA)) return (0); /* In memory configurations don't block when the cache is full. */ diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i index d15f874b281..c1b45a1f4e0 100644 --- a/src/third_party/wiredtiger/src/include/column.i +++ b/src/third_party/wiredtiger/src/include/column.i @@ -221,13 +221,13 @@ __col_var_last_recno(WT_REF *ref) * This function ignores those records, our callers must handle that * explicitly, if they care. */ - if (page->pg_var_nrepeats == 0) - return (page->pg_var_entries == 0 ? 0 : - ref->ref_recno + (page->pg_var_entries - 1)); + if (!WT_COL_VAR_REPEAT_SET(page)) + return (page->entries == 0 ? 0 : + ref->ref_recno + (page->entries - 1)); repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1]; return ((repeat->recno + repeat->rle) - 1 + - (page->pg_var_entries - (repeat->indx + 1))); + (page->entries - (repeat->indx + 1))); } /* @@ -246,8 +246,7 @@ __col_fix_last_recno(WT_REF *ref) * This function ignores those records, our callers must handle that * explicitly, if they care. */ - return (page->pg_fix_entries == 0 ? - 0 : ref->ref_recno + (page->pg_fix_entries - 1)); + return (page->entries == 0 ? 0 : ref->ref_recno + (page->entries - 1)); } /* @@ -273,7 +272,9 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) * slot for this record number, because we know any intervening records * have repeat counts of 1. */ - for (base = 0, limit = page->pg_var_nrepeats; limit != 0; limit >>= 1) { + for (base = 0, + limit = WT_COL_VAR_REPEAT_SET(page) ? page->pg_var_nrepeats : 0; + limit != 0; limit >>= 1) { indx = base + (limit >> 1); repeat = page->pg_var_repeats + indx; @@ -281,7 +282,7 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) recno < repeat->recno + repeat->rle) { if (start_recnop != NULL) *start_recnop = repeat->recno; - return (page->pg_var_d + repeat->indx); + return (page->pg_var + repeat->indx); } if (recno < repeat->recno) continue; @@ -306,14 +307,14 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) * !!! * The test could be written more simply as: * - * (recno >= start_recno + (page->pg_var_entries - start_indx)) + * (recno >= start_recno + (page->entries - start_indx)) * * It's split into two parts because the simpler test will overflow if * searching for large record numbers. */ if (recno >= start_recno && - recno - start_recno >= page->pg_var_entries - start_indx) + recno - start_recno >= page->entries - start_indx) return (NULL); - return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno)); + return (page->pg_var + start_indx + (uint32_t)(recno - start_recno)); } diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 60ce5f55234..ce483d3291a 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -107,7 +107,7 @@ struct __wt_named_extractor { * Allocate some additional slots for internal sessions so the user cannot * configure too few sessions for us to run. */ -#define WT_EXTRA_INTERNAL_SESSIONS 10 +#define WT_EXTRA_INTERNAL_SESSIONS 20 /* * WT_CONN_CHECK_PANIC -- @@ -123,12 +123,16 @@ struct __wt_named_extractor { * main queue and the hashed queue. */ #define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \ TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \ ++conn->dhandle_count; \ } while (0) #define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \ TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \ --conn->dhandle_count; \ @@ -163,13 +167,13 @@ struct __wt_connection_impl { WT_SPINLOCK api_lock; /* Connection API spinlock */ WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ - WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */ WT_SPINLOCK fh_lock; /* File handle queue spinlock */ WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ - WT_SPINLOCK table_lock; /* Table creation spinlock */ + WT_RWLOCK table_lock; /* Table list lock */ WT_SPINLOCK turtle_lock; /* Turtle file spinlock */ + WT_RWLOCK dhandle_lock; /* Data handle list lock */ /* * We distribute the btree page locks across a set of spin locks. Don't @@ -262,7 +266,7 @@ struct __wt_connection_impl { WT_TXN_GLOBAL txn_global; /* Global transaction state */ - WT_RWLOCK *hot_backup_lock; /* Hot backup serialization */ + WT_RWLOCK hot_backup_lock; /* Hot backup serialization */ bool hot_backup; /* Hot backup in progress */ char **hot_backup_list; /* Hot backup file list */ @@ -301,6 +305,15 @@ struct __wt_connection_impl { uint32_t evict_threads_max;/* Max eviction threads */ uint32_t evict_threads_min;/* Min eviction threads */ + uint32_t evict_tune_datapts_needed;/* Data needed to tune */ + struct timespec evict_tune_last_action_time;/* Time of last action */ + struct timespec evict_tune_last_time; /* Time of last check */ + uint32_t evict_tune_num_points; /* Number of values tried */ + uint64_t evict_tune_pgs_last; /* Number of pages evicted */ + uint64_t evict_tune_pg_sec_max; /* Max throughput encountered */ + bool evict_tune_stable; /* Are we stable? */ + uint32_t evict_tune_workers_best;/* Best performing value */ + #define WT_STATLOG_FILENAME "WiredTigerStat.%d.%H" WT_SESSION_IMPL *stat_session; /* Statistics log session */ wt_thread_t stat_tid; /* Statistics log thread */ @@ -326,11 +339,11 @@ struct __wt_connection_impl { bool log_tid_set; /* Log server thread set */ WT_CONDVAR *log_file_cond; /* Log file thread wait mutex */ WT_SESSION_IMPL *log_file_session;/* Log file thread session */ - wt_thread_t log_file_tid; /* Log file thread thread */ + wt_thread_t log_file_tid; /* Log file thread */ bool log_file_tid_set;/* Log file thread set */ WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */ WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */ - wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */ + wt_thread_t log_wrlsn_tid; /* Log write lsn thread */ bool log_wrlsn_tid_set;/* Log write lsn thread set */ WT_LOG *log; /* Logging structure */ WT_COMPRESSOR *log_compressor;/* Logging compressor */ diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index d522abc2a56..31c8963a486 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -52,8 +52,8 @@ { 0 }, /* recno raw buffer */ \ NULL, /* json_private */ \ NULL, /* lang_private */ \ - { NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \ - { NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \ + { NULL, 0, NULL, 0, 0 }, /* WT_ITEM key */ \ + { NULL, 0, NULL, 0, 0 }, /* WT_ITEM value */ \ 0, /* int saved_err */ \ NULL, /* internal_uri */ \ 0 /* uint32_t flags */ \ diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h index d7802bb319b..4f318e7bccf 100644 --- a/src/third_party/wiredtiger/src/include/dhandle.h +++ b/src/third_party/wiredtiger/src/include/dhandle.h @@ -37,12 +37,30 @@ #define WT_SESSION_META_DHANDLE(s) \ (((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle) +#define WT_DHANDLE_ACQUIRE(dhandle) \ + (void)__wt_atomic_add32(&dhandle->session_ref, 1) + +#define WT_DHANDLE_RELEASE(dhandle) \ + (void)__wt_atomic_sub32(&dhandle->session_ref, 1) + +#define WT_DHANDLE_NEXT(session, dhandle, head, field) do { \ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));\ + if (dhandle == NULL) \ + dhandle = TAILQ_FIRST(head); \ + else { \ + WT_DHANDLE_RELEASE(dhandle); \ + dhandle = TAILQ_NEXT(dhandle, field); \ + } \ + if (dhandle != NULL) \ + WT_DHANDLE_ACQUIRE(dhandle); \ +} while (0) + /* * WT_DATA_HANDLE -- * A handle for a generic named data source. */ struct __wt_data_handle { - WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */ + WT_RWLOCK rwlock; /* Lock for shared/exclusive ops */ TAILQ_ENTRY(__wt_data_handle) q; TAILQ_ENTRY(__wt_data_handle) hashq; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index be042bcd6cb..19ad9a880df 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -98,6 +98,7 @@ extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_A extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -105,7 +106,6 @@ extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((w extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -150,6 +150,9 @@ extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags @@ -160,6 +163,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_split_stash_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -192,8 +196,6 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_las_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -254,6 +256,7 @@ extern WT_THREAD_RET __wt_cache_pool_server(void *arg) WT_GCC_FUNC_DECL_ATTRIBUT extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_conn_dhandle_alloc( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -262,7 +265,7 @@ extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *ur extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -294,7 +297,7 @@ extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, bool iskey, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern size_t __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); -extern void __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const char *__wt_json_tokname(int toktype) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, bool iskey, WT_ITEM *item) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -315,7 +318,7 @@ extern int __wt_cursor_equals_notsup(WT_CURSOR *cursor, WT_CURSOR *other, int *e extern int __wt_cursor_search_near_notsup(WT_CURSOR *cursor, int *exact) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cursor_set_notsup(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -352,7 +355,7 @@ extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -403,7 +406,7 @@ extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bo extern int __wt_log_slot_new(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -613,11 +616,9 @@ extern void __wt_session_close_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_auto_alloc(WT_SESSION_IMPL *session, const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_auto_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -671,16 +672,16 @@ extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg) WT_G extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_nlpo2_round(uint32_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_nlpo2(uint32_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_log2_int(uint32_t n) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -689,6 +690,7 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2) WT_GCC_FUNC_DECL_ATTRIBUT extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern uint64_t __wt_random64(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -728,6 +730,7 @@ extern int __wt_thread_group_resize( WT_SESSION_IMPL *session, WT_THREAD_GROUP * extern int __wt_thread_group_create( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, const char *name, uint32_t min, uint32_t max, uint32_t flags, int (*run_func)(WT_SESSION_IMPL *session, WT_THREAD *context)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_group_start_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_thread_group_stop_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -740,6 +743,7 @@ extern void __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATT extern void __wt_txn_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h index 5acb7b0ed27..fed7835ada1 100644 --- a/src/third_party/wiredtiger/src/include/extern_posix.h +++ b/src/third_party/wiredtiger/src/include/extern_posix.h @@ -12,8 +12,8 @@ extern int __wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapp extern int __wt_posix_map_preload(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, const void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_posix_map_discard(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapped_region, size_t len, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/third_party/wiredtiger/src/include/extern_win.h b/src/third_party/wiredtiger/src/include/extern_win.h index 11b45f11304..0bfc821c7a6 100644 --- a/src/third_party/wiredtiger/src/include/extern_win.h +++ b/src/third_party/wiredtiger/src/include/extern_win.h @@ -10,8 +10,8 @@ extern int __wt_os_win(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((war extern int __wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_win_map(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_regionp, size_t *lenp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_region, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index e7a5ba066df..c1fff920e3b 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -53,22 +53,24 @@ #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 -#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000008 -#define WT_SESSION_LOCKED_METADATA 0x00000010 -#define WT_SESSION_LOCKED_PASS 0x00000020 -#define WT_SESSION_LOCKED_SCHEMA 0x00000040 -#define WT_SESSION_LOCKED_SLOT 0x00000080 -#define WT_SESSION_LOCKED_TABLE 0x00000100 -#define WT_SESSION_LOCKED_TURTLE 0x00000200 -#define WT_SESSION_LOGGING_INMEM 0x00000400 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800 -#define WT_SESSION_NO_CACHE 0x00001000 -#define WT_SESSION_NO_DATA_HANDLES 0x00002000 -#define WT_SESSION_NO_EVICTION 0x00004000 -#define WT_SESSION_NO_LOGGING 0x00008000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000 -#define WT_SESSION_SERVER_ASYNC 0x00040000 +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008 +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010 +#define WT_SESSION_LOCKED_METADATA 0x00000020 +#define WT_SESSION_LOCKED_PASS 0x00000040 +#define WT_SESSION_LOCKED_SCHEMA 0x00000080 +#define WT_SESSION_LOCKED_SLOT 0x00000100 +#define WT_SESSION_LOCKED_TABLE_READ 0x00000200 +#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000400 +#define WT_SESSION_LOCKED_TURTLE 0x00000800 +#define WT_SESSION_LOGGING_INMEM 0x00001000 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000 +#define WT_SESSION_NO_CACHE 0x00004000 +#define WT_SESSION_NO_DATA_HANDLES 0x00008000 +#define WT_SESSION_NO_EVICTION 0x00010000 +#define WT_SESSION_NO_LOGGING 0x00020000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00040000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00080000 +#define WT_SESSION_SERVER_ASYNC 0x00100000 #define WT_STAT_CLEAR 0x00000001 #define WT_STAT_JSON 0x00000002 #define WT_STAT_ON_CLOSE 0x00000004 @@ -90,27 +92,29 @@ #define WT_VERB_COMPACT 0x00000008 #define WT_VERB_EVICT 0x00000010 #define WT_VERB_EVICTSERVER 0x00000020 -#define WT_VERB_FILEOPS 0x00000040 -#define WT_VERB_HANDLEOPS 0x00000080 -#define WT_VERB_LOG 0x00000100 -#define WT_VERB_LSM 0x00000200 -#define WT_VERB_LSM_MANAGER 0x00000400 -#define WT_VERB_METADATA 0x00000800 -#define WT_VERB_MUTEX 0x00001000 -#define WT_VERB_OVERFLOW 0x00002000 -#define WT_VERB_READ 0x00004000 -#define WT_VERB_REBALANCE 0x00008000 -#define WT_VERB_RECONCILE 0x00010000 -#define WT_VERB_RECOVERY 0x00020000 -#define WT_VERB_SALVAGE 0x00040000 -#define WT_VERB_SHARED_CACHE 0x00080000 -#define WT_VERB_SPLIT 0x00100000 -#define WT_VERB_TEMPORARY 0x00200000 -#define WT_VERB_THREAD_GROUP 0x00400000 -#define WT_VERB_TRANSACTION 0x00800000 -#define WT_VERB_VERIFY 0x01000000 -#define WT_VERB_VERSION 0x02000000 -#define WT_VERB_WRITE 0x04000000 +#define WT_VERB_EVICT_STUCK 0x00000040 +#define WT_VERB_FILEOPS 0x00000080 +#define WT_VERB_HANDLEOPS 0x00000100 +#define WT_VERB_LOG 0x00000200 +#define WT_VERB_LSM 0x00000400 +#define WT_VERB_LSM_MANAGER 0x00000800 +#define WT_VERB_METADATA 0x00001000 +#define WT_VERB_MUTEX 0x00002000 +#define WT_VERB_OVERFLOW 0x00004000 +#define WT_VERB_READ 0x00008000 +#define WT_VERB_REBALANCE 0x00010000 +#define WT_VERB_RECONCILE 0x00020000 +#define WT_VERB_RECOVERY 0x00040000 +#define WT_VERB_RECOVERY_PROGRESS 0x00080000 +#define WT_VERB_SALVAGE 0x00100000 +#define WT_VERB_SHARED_CACHE 0x00200000 +#define WT_VERB_SPLIT 0x00400000 +#define WT_VERB_TEMPORARY 0x00800000 +#define WT_VERB_THREAD_GROUP 0x01000000 +#define WT_VERB_TRANSACTION 0x02000000 +#define WT_VERB_VERIFY 0x04000000 +#define WT_VERB_VERSION 0x08000000 +#define WT_VERB_WRITE 0x10000000 #define WT_VISIBILITY_ERR 0x00000080 /* * flags section: END diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 3f2cb2ba8e6..a6be3582b4d 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -163,7 +163,7 @@ struct __wt_logslot { WT_CACHE_LINE_PAD_BEGIN volatile int64_t slot_state; /* Slot state */ int64_t slot_unbuffered; /* Unbuffered data in this slot */ - int32_t slot_error; /* Error value */ + int slot_error; /* Error value */ wt_off_t slot_start_offset; /* Starting file offset */ wt_off_t slot_last_offset; /* Last record offset */ WT_LSN slot_release_lsn; /* Slot release LSN */ @@ -235,7 +235,7 @@ struct __wt_log { WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */ WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */ - WT_RWLOCK *log_archive_lock; /* Archive and log cursors */ + WT_RWLOCK log_archive_lock;/* Archive and log cursors */ /* Notify any waiting threads when sync_lsn is updated. */ WT_CONDVAR *log_sync_cond; @@ -254,6 +254,7 @@ struct __wt_log { #define WT_SLOT_POOL 128 WT_LOGSLOT *active_slot; /* Active slot */ WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ + int32_t pool_index; /* Index into slot pool */ size_t slot_buf_size; /* Buffer size for slots */ #ifdef HAVE_DIAGNOSTIC uint64_t write_calls; /* Calls to log_write */ diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index fefed9daa81..2bbb813bad2 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -189,7 +189,7 @@ struct __wt_lsm_tree { #define LSM_TREE_MAX_QUEUE 100 uint32_t queue_ref; - WT_RWLOCK *rwlock; + WT_RWLOCK rwlock; TAILQ_ENTRY(__wt_lsm_tree) q; uint64_t dsk_gen; diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index f36be32d6a2..d5692a3f9cf 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -11,11 +11,12 @@ * Wait on a mutex, optionally timing out. */ static inline void -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +__wt_cond_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *)) { bool notused; - __wt_cond_wait_signal(session, cond, usecs, ¬used); + __wt_cond_wait_signal(session, cond, usecs, run_func, ¬used); } /* diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index 6b81b1a6265..06b8c4a3304 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -21,8 +21,8 @@ struct __wt_condvar { int waiters; /* Numbers of waiters, or -1 if signalled with no waiters. */ /* - * The following fields are only used for automatically adjusting - * condition variables. They could be in a separate structure. + * The following fields are used for automatically adjusting condition + * variable wait times. */ uint64_t min_wait; /* Minimum wait duration */ uint64_t max_wait; /* Maximum wait duration */ @@ -30,11 +30,14 @@ struct __wt_condvar { }; /* + * Read/write locks: + * + * WiredTiger uses read/write locks for shared/exclusive access to resources. * !!! * Don't modify this structure without understanding the read/write locking * functions. */ -typedef union { /* Read/write lock */ +union __wt_rwlock { /* Read/write lock */ uint64_t u; struct { uint32_t wr; /* Writers and readers */ @@ -45,19 +48,6 @@ typedef union { /* Read/write lock */ uint16_t next; /* Next available ticket number */ uint16_t writers_active;/* Count of active writers */ } s; -} wt_rwlock_t; - -/* - * Read/write locks: - * - * WiredTiger uses read/write locks for shared/exclusive access to resources. - */ -struct __wt_rwlock { - WT_CACHE_LINE_PAD_BEGIN - const char *name; /* Lock name for debugging */ - - wt_rwlock_t rwlock; /* Read/write lock */ - WT_CACHE_LINE_PAD_END }; /* diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i index a6309e0976b..6b83cb280d3 100644 --- a/src/third_party/wiredtiger/src/include/mutex.i +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -300,3 +300,22 @@ __wt_spin_lock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) } else __wt_spin_lock(session, t); } + +/* + * __wt_spin_trylock_track -- + * Try to lock a spinlock or fail immediately if it is busy. + * Track if successful. + */ +static inline int +__wt_spin_trylock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + int64_t **stats; + + if (t->stat_count_off != -1 && WT_STAT_ENABLED(session)) { + WT_RET(__wt_spin_trylock(session, t)); + stats = (int64_t **)S2C(session)->stats; + stats[session->stat_bucket][t->stat_count_off]++; + return (0); + } else + return (__wt_spin_trylock(session, t)); +} diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i index 17ca261bcfc..8ba3dd536ac 100644 --- a/src/third_party/wiredtiger/src/include/packing.i +++ b/src/third_party/wiredtiger/src/include/packing.i @@ -168,10 +168,15 @@ next: if (pack->cur == pack->end) (int)(pack->end - pack->orig), pack->orig); return (0); case 'u': - case 'U': /* Special case for items with a size prefix. */ pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u'; return (0); + case 'U': + /* + * Don't change the type. 'U' is used internally, so this type + * was already changed to explicitly include the size. + */ + return (0); case 'b': case 'h': case 'i': diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h index a17affb7660..9a6e1e54e80 100644 --- a/src/third_party/wiredtiger/src/include/schema.h +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -78,6 +78,14 @@ struct __wt_table { */ #define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1) +/* Helpers for the locked state of the handle list and table locks. */ +#define WT_SESSION_LOCKED_HANDLE_LIST \ + (WT_SESSION_LOCKED_HANDLE_LIST_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST_WRITE) +#define WT_SESSION_LOCKED_TABLE \ + (WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_TABLE_WRITE) + /* * WT_WITH_LOCK_WAIT -- * Wait for a lock, perform an operation, drop the lock. @@ -85,7 +93,7 @@ struct __wt_table { #define WT_WITH_LOCK_WAIT(session, lock, flag, op) do { \ if (F_ISSET(session, (flag))) { \ op; \ - } else { \ + } else { \ __wt_spin_lock_track(session, lock); \ F_SET(session, (flag)); \ op; \ @@ -102,7 +110,7 @@ struct __wt_table { ret = 0; \ if (F_ISSET(session, (flag))) { \ op; \ - } else if ((ret = __wt_spin_trylock(session, lock)) == 0) { \ + } else if ((ret = __wt_spin_trylock_track(session, lock)) == 0) {\ F_SET(session, (flag)); \ op; \ F_CLR(session, (flag)); \ @@ -122,16 +130,46 @@ struct __wt_table { &S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op) /* - * WT_WITH_HANDLE_LIST_LOCK -- - * Acquire the data handle list lock, perform an operation, drop the lock. + * WT_WITH_HANDLE_LIST_READ_LOCK -- + * Acquire the data handle list lock in shared mode, perform an operation, + * drop the lock. The handle list lock is a read-write lock so the + * implementation is different to the other lock macros. * * Note: always waits because some operations need the handle list lock to * discard handles, and we only expect it to be held across short * operations. */ -#define WT_WITH_HANDLE_LIST_LOCK(session, op) \ - WT_WITH_LOCK_WAIT(session, \ - &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) +#define WT_WITH_HANDLE_LIST_READ_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { \ + op; \ + } else { \ + __wt_readlock(session, &S2C(session)->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + __wt_readunlock(session, &S2C(session)->dhandle_lock); \ + } \ +} while (0) + +/* + * WT_WITH_HANDLE_LIST_WRITE_LOCK -- + * Acquire the data handle list lock in exclusive mode, perform an + * operation, drop the lock. The handle list lock is a read-write lock so + * the implementation is different to the other lock macros. + */ +#define WT_WITH_HANDLE_LIST_WRITE_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ));\ + __wt_writelock(session, &S2C(session)->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + __wt_writeunlock(session, &S2C(session)->dhandle_lock); \ + } \ +} while (0) /* * WT_WITH_METADATA_LOCK -- @@ -165,22 +203,58 @@ struct __wt_table { } while (0) /* - * WT_WITH_TABLE_LOCK, WT_WITH_TABLE_LOCK_NOWAIT -- + * WT_WITH_TABLE_READ_LOCK, WT_WITH_TABLE_WRITE_LOCK, + * WT_WITH_TABLE_WRITE_LOCK_NOWAIT -- * Acquire the table lock, perform an operation, drop the lock. + * The table lock is a read-write lock so the implementation is different + * to most other lock macros. + * + * Note: readlock always waits because some operations need the table lock + * to discard handles, and we only expect it to be held across short + * operations. */ -#define WT_WITH_TABLE_LOCK(session, op) do { \ - WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK_WAIT(session, \ - &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ +#define WT_WITH_TABLE_READ_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ + __wt_readlock(session, &S2C(session)->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \ + __wt_readunlock(session, &S2C(session)->table_lock); \ + } \ +} while (0) + +#define WT_WITH_TABLE_WRITE_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST)); \ + __wt_writelock(session, &S2C(session)->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &S2C(session)->table_lock); \ + } \ } while (0) -#define WT_WITH_TABLE_LOCK_NOWAIT(session, ret, op) do { \ +#define WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, op) do { \ WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK_NOWAIT(session, ret, \ - &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE) || \ + !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST)); \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ + op; \ + } else if ((ret = __wt_try_writelock(session, \ + &S2C(session)->table_lock)) == 0) { \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &S2C(session)->table_lock); \ + } \ } while (0) /* @@ -192,19 +266,31 @@ struct __wt_table { WT_CONNECTION_IMPL *__conn = S2C(session); \ bool __checkpoint_locked = \ F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \ - bool __handle_locked = \ - F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ - bool __table_locked = \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE); \ + bool __handle_read_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + bool __handle_write_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + bool __table_read_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ); \ + bool __table_write_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ bool __schema_locked = \ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \ - if (__handle_locked) { \ - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); \ - __wt_spin_unlock(session, &__conn->dhandle_lock); \ + if (__handle_read_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + __wt_readunlock(session, &__conn->dhandle_lock); \ } \ - if (__table_locked) { \ - F_CLR(session, WT_SESSION_LOCKED_TABLE); \ - __wt_spin_unlock(session, &__conn->table_lock); \ + if (__handle_write_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + __wt_writeunlock(session, &__conn->dhandle_lock); \ + } \ + if (__table_read_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \ + __wt_readunlock(session, &__conn->table_lock); \ + } \ + if (__table_write_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &__conn->table_lock); \ } \ if (__schema_locked) { \ F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \ @@ -223,12 +309,20 @@ struct __wt_table { __wt_spin_lock(session, &__conn->schema_lock); \ F_SET(session, WT_SESSION_LOCKED_SCHEMA); \ } \ - if (__table_locked) { \ - __wt_spin_lock(session, &__conn->table_lock); \ - F_SET(session, WT_SESSION_LOCKED_TABLE); \ + if (__table_read_locked) { \ + __wt_readlock(session, &__conn->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \ + } \ + if (__table_write_locked) { \ + __wt_writelock(session, &__conn->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + } \ + if (__handle_read_locked) { \ + __wt_readlock(session, &__conn->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ } \ - if (__handle_locked) { \ - __wt_spin_lock(session, &__conn->dhandle_lock); \ - F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ + if (__handle_write_locked) { \ + __wt_writelock(session, &__conn->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ } \ } while (0) diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 7dd523aea26..085f871a34f 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -52,8 +52,6 @@ struct __wt_session_impl { const char *lastop; /* Last operation */ uint32_t id; /* UID, offset in session array */ - WT_CONDVAR *cond; /* Condition variable */ - WT_EVENT_HANDLER *event_handler;/* Application's event handlers */ WT_DATA_HANDLE *dhandle; /* Current data handle */ diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 0daab83e166..8b2e78a4ed5 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -310,7 +310,11 @@ struct __wt_connection_stats { int64_t cache_eviction_slow; int64_t cache_eviction_state; int64_t cache_eviction_walks_abandoned; + int64_t cache_eviction_active_workers; + int64_t cache_eviction_worker_created; int64_t cache_eviction_worker_evicting; + int64_t cache_eviction_worker_removed; + int64_t cache_eviction_stable_state_workers; int64_t cache_eviction_force_fail; int64_t cache_eviction_walks_active; int64_t cache_eviction_walks_started; @@ -388,9 +392,7 @@ struct __wt_connection_stats { int64_t lock_checkpoint_count; int64_t lock_checkpoint_wait_application; int64_t lock_checkpoint_wait_internal; - int64_t lock_handle_list_count; - int64_t lock_handle_list_wait_application; - int64_t lock_handle_list_wait_internal; + int64_t lock_handle_list_wait_eviction; int64_t lock_metadata_count; int64_t lock_metadata_wait_application; int64_t lock_metadata_wait_internal; @@ -564,6 +566,7 @@ struct __wt_dsrc_stats { int64_t cache_pages_requested; int64_t cache_write; int64_t cache_write_restore; + int64_t cache_bytes_dirty; int64_t cache_eviction_clean; int64_t cache_state_gen_avg_gap; int64_t cache_state_avg_written_size; diff --git a/src/third_party/wiredtiger/src/include/thread_group.h b/src/third_party/wiredtiger/src/include/thread_group.h index 76758a090c4..77cff00dc8d 100644 --- a/src/third_party/wiredtiger/src/include/thread_group.h +++ b/src/third_party/wiredtiger/src/include/thread_group.h @@ -40,7 +40,7 @@ struct __wt_thread_group { const char *name; /* Name */ - WT_RWLOCK *lock; /* Protects group changes */ + WT_RWLOCK lock; /* Protects group changes */ /* * Condition signalled when wanting to wake up threads that are diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 12fc2a0a5b7..7e802c188ab 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -92,7 +92,7 @@ struct __wt_txn_global { * Prevents the oldest ID moving forwards while threads are scanning * the global transaction state. */ - WT_RWLOCK *scan_rwlock; + WT_RWLOCK scan_rwlock; /* * Track information about the running checkpoint. The transaction @@ -114,7 +114,7 @@ struct __wt_txn_global { volatile uint64_t metadata_pinned; /* Oldest ID for metadata */ /* Named snapshot state. */ - WT_RWLOCK *nsnap_rwlock; + WT_RWLOCK nsnap_rwlock; volatile uint64_t nsnap_oldest_id; TAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph; diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h index 8abc192892e..640f5e4cf5f 100644 --- a/src/third_party/wiredtiger/src/include/verify_build.h +++ b/src/third_party/wiredtiger/src/include/verify_build.h @@ -59,7 +59,6 @@ __wt_verify_build(void) sizeof(s) > WT_CACHE_LINE_ALIGNMENT || \ sizeof(s) % WT_CACHE_LINE_ALIGNMENT == 0) WT_PADDING_CHECK(WT_LOGSLOT); - WT_PADDING_CHECK(WT_RWLOCK); WT_PADDING_CHECK(WT_SPINLOCK); WT_PADDING_CHECK(WT_TXN_STATE); diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index a6deed7e14e..c148e759299 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -114,16 +114,16 @@ struct __wt_item { size_t size; #ifndef DOXYGEN -#define WT_ITEM_ALIGNED 0x00000001 -#define WT_ITEM_INUSE 0x00000002 - /* This appears in the middle of the struct to avoid padding. */ - /*! Object flags (internal use). */ - uint32_t flags; - /*! Managed memory chunk (internal use). */ void *mem; + /*! Managed memory size (internal use). */ size_t memsize; + +#define WT_ITEM_ALIGNED 0x00000001 +#define WT_ITEM_INUSE 0x00000002 + /*! Object flags (internal use). */ + uint32_t flags; #endif }; @@ -576,8 +576,9 @@ struct __wt_cursor { #define WT_CURSTD_OPEN 0x00200 #define WT_CURSTD_OVERWRITE 0x00400 #define WT_CURSTD_RAW 0x00800 -#define WT_CURSTD_VALUE_EXT 0x01000 /* Value points out of the tree. */ -#define WT_CURSTD_VALUE_INT 0x02000 /* Value points into the tree. */ +#define WT_CURSTD_RAW_SEARCH 0x01000 +#define WT_CURSTD_VALUE_EXT 0x02000 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x04000 /* Value points into the tree. */ #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) uint32_t flags; #endif @@ -1855,7 +1856,7 @@ struct __wt_connection { * threads WiredTiger will start to help evict pages from cache. The * number of threads started will vary depending on the current eviction * load. Each eviction worker thread uses a session from the configured - * session_max., an integer between 1 and 20; default \c 1.} + * session_max., an integer between 1 and 20; default \c 8.} * @config{ threads_min, minimum number of * threads WiredTiger will start to help evict pages from cache. The * number of threads currently running will vary depending on the @@ -1982,12 +1983,13 @@ struct __wt_connection { * as a list\, such as <code>"verbose=[evictserver\,read]"</code>., a * list\, with values chosen from the following options: \c "api"\, \c * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c - * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, - * \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c - * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c - * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c - * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c - * "write"; default empty.} + * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c + * "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c + * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c + * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c + * "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default + * empty.} * @configend * @errors */ @@ -2331,7 +2333,7 @@ struct __wt_connection { * WiredTiger will start to help evict pages from cache. The number of threads * started will vary depending on the current eviction load. Each eviction * worker thread uses a session from the configured session_max., an integer - * between 1 and 20; default \c 1.} + * between 1 and 20; default \c 8.} * @config{ threads_min, * minimum number of threads WiredTiger will start to help evict pages from * cache. The number of threads currently running will vary depending on the @@ -2361,7 +2363,7 @@ struct __wt_connection { * @config{exclusive, fail if the database already exists\, generally used with * the \c create option., a boolean flag; default \c false.} * @config{extensions, list of shared library extensions to load (using dlopen). - * Any values specified to an library extension are passed to + * Any values specified to a library extension are passed to * WT_CONNECTION::load_extension as the \c config parameter (for example\, * <code>extensions=(/path/ext.so={entry=my_entry})</code>)., a list of strings; * default empty.} @@ -2513,12 +2515,13 @@ struct __wt_connection { * WiredTiger is configured with --enable-verbose. Options are given as a * list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with * values chosen from the following options: \c "api"\, \c "block"\, \c - * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, - * \c "handleops"\, \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c - * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c - * "recovery"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, - * \c "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c - * "write"; default empty.} + * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evict_stuck"\, \c + * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, \c + * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c + * "rebalance"\, \c "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c + * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c + * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c "write"; + * default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. Ignored on non-Windows systems. Options are given as a list\, such * as <code>"write_through=[data]"</code>. Configuring \c write_through requires @@ -4429,396 +4432,400 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_STATE 1051 /*! cache: eviction walks abandoned */ #define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1052 +/*! cache: eviction worker thread active */ +#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1053 +/*! cache: eviction worker thread created */ +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1054 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1053 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1055 +/*! cache: eviction worker thread removed */ +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1056 +/*! cache: eviction worker thread stable number */ +#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1057 /*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1054 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1058 /*! cache: files with active eviction walks */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1055 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1059 /*! cache: files with new eviction walks started */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1056 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1060 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1057 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1061 /*! cache: hazard pointer check calls */ -#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1058 +#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1062 /*! cache: hazard pointer check entries walked */ -#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1059 +#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1063 /*! cache: hazard pointer maximum array length */ -#define WT_STAT_CONN_CACHE_HAZARD_MAX 1060 +#define WT_STAT_CONN_CACHE_HAZARD_MAX 1064 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1061 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1065 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1062 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1066 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1063 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1067 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1064 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1068 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1065 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1069 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1066 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1070 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1067 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1071 /*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1068 +#define WT_STAT_CONN_CACHE_BYTES_MAX 1072 /*! cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1069 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1073 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1070 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1074 /*! cache: modified pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1071 +#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1075 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1072 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1076 /*! cache: overflow values cached in memory */ -#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1073 +#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1077 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1074 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1078 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1075 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1079 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1076 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1080 /*! cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1077 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1081 /*! cache: pages evicted because they had chains of deleted items */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1078 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1082 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1079 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1083 /*! cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1080 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1084 /*! cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1081 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1085 /*! cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1082 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1086 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1083 +#define WT_STAT_CONN_CACHE_READ 1087 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1084 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1088 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1085 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1089 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1086 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1090 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1087 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1091 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1088 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1092 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1089 +#define WT_STAT_CONN_CACHE_WRITE 1093 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1090 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1094 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1091 +#define WT_STAT_CONN_CACHE_OVERHEAD 1095 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1092 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1096 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1093 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1097 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1094 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1098 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1095 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1099 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1096 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1100 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1097 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1101 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1098 +#define WT_STAT_CONN_COND_AUTO_WAIT 1102 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1099 +#define WT_STAT_CONN_FILE_OPEN 1103 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1100 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1104 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1101 +#define WT_STAT_CONN_MEMORY_FREE 1105 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1102 +#define WT_STAT_CONN_MEMORY_GROW 1106 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1103 +#define WT_STAT_CONN_COND_WAIT 1107 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1104 +#define WT_STAT_CONN_RWLOCK_READ 1108 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1105 +#define WT_STAT_CONN_RWLOCK_WRITE 1109 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1106 +#define WT_STAT_CONN_FSYNC_IO 1110 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1107 +#define WT_STAT_CONN_READ_IO 1111 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1108 +#define WT_STAT_CONN_WRITE_IO 1112 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1109 +#define WT_STAT_CONN_CURSOR_CREATE 1113 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1110 +#define WT_STAT_CONN_CURSOR_INSERT 1114 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1111 +#define WT_STAT_CONN_CURSOR_NEXT 1115 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1112 +#define WT_STAT_CONN_CURSOR_PREV 1116 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1113 +#define WT_STAT_CONN_CURSOR_REMOVE 1117 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1114 +#define WT_STAT_CONN_CURSOR_RESET 1118 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1115 +#define WT_STAT_CONN_CURSOR_RESTART 1119 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1116 +#define WT_STAT_CONN_CURSOR_SEARCH 1120 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1117 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1121 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1118 +#define WT_STAT_CONN_CURSOR_UPDATE 1122 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1119 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1123 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1120 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1124 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1121 +#define WT_STAT_CONN_DH_SWEEP_REF 1125 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1122 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1126 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1123 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1127 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1124 +#define WT_STAT_CONN_DH_SWEEP_TOD 1128 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1125 +#define WT_STAT_CONN_DH_SWEEPS 1129 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1126 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1130 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1127 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1131 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1128 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1132 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1129 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1133 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1130 -/*! lock: handle-list lock acquisitions */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1131 -/*! lock: handle-list lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1132 -/*! lock: handle-list lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1133 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1134 +/*! lock: handle-list lock eviction thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_EVICTION 1135 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1134 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1136 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1135 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1137 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1136 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1138 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1137 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1139 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1138 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1140 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1139 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1141 /*! lock: table lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_COUNT 1140 +#define WT_STAT_CONN_LOCK_TABLE_COUNT 1142 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1141 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1143 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1142 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1144 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1143 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1145 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1144 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1146 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1145 +#define WT_STAT_CONN_LOG_SLOT_RACES 1147 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1146 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1148 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1147 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1149 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1148 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1150 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1149 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1151 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1150 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1152 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1151 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1153 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1152 +#define WT_STAT_CONN_LOG_FLUSH 1154 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1153 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1155 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1154 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1156 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1155 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1157 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1156 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1158 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1157 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1159 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1158 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1160 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1159 +#define WT_STAT_CONN_LOG_SCANS 1161 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1160 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1162 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1161 +#define WT_STAT_CONN_LOG_WRITE_LSN 1163 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1162 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1164 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1163 +#define WT_STAT_CONN_LOG_SYNC 1165 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1164 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1166 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1165 +#define WT_STAT_CONN_LOG_SYNC_DIR 1167 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1166 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1168 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1167 +#define WT_STAT_CONN_LOG_WRITES 1169 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1168 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1170 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1169 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1171 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1170 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1172 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1171 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1173 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1172 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1174 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1173 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1175 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1174 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1176 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1175 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1177 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1176 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1178 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1177 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1179 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1178 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1180 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1179 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1181 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1180 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1182 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1181 +#define WT_STAT_CONN_REC_PAGES 1183 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1182 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1184 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1183 +#define WT_STAT_CONN_REC_PAGE_DELETE 1185 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1184 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1186 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1185 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1187 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1186 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1188 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1187 +#define WT_STAT_CONN_SESSION_OPEN 1189 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1188 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1190 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1189 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1191 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1190 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1192 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1191 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1193 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1192 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1194 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1193 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1195 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1194 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1196 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1195 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1197 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1196 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1198 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1197 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1199 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1198 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1200 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1199 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1201 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1200 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1202 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1201 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1203 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1202 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1204 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1203 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1205 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1204 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1206 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1205 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1207 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1206 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1208 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1207 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1209 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1208 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1210 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1209 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1211 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1210 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1212 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1211 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1213 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1212 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1214 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1213 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1215 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1214 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1216 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1215 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1217 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1216 +#define WT_STAT_CONN_PAGE_SLEEP 1218 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1217 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1219 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1218 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1220 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1219 +#define WT_STAT_CONN_TXN_BEGIN 1221 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1220 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1222 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1221 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1223 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1222 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1224 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1223 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1225 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1224 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1226 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1225 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1227 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1226 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1228 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1227 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1229 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1228 +#define WT_STAT_CONN_TXN_CHECKPOINT 1230 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1229 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1231 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1230 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1232 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1231 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1233 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1232 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1234 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1233 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1235 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1234 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1236 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1235 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1237 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1236 +#define WT_STAT_CONN_TXN_SYNC 1238 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1237 +#define WT_STAT_CONN_TXN_COMMIT 1239 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1238 +#define WT_STAT_CONN_TXN_ROLLBACK 1240 /*! * @} @@ -4978,181 +4985,183 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_WRITE 2059 /*! cache: pages written requiring in-memory restoration */ #define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2060 +/*! cache: tracked dirty bytes in the cache */ +#define WT_STAT_DSRC_CACHE_BYTES_DIRTY 2061 /*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2061 +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2062 /*! * cache_walk: Average difference between current eviction generation * when the page was last considered, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2062 +#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2063 /*! * cache_walk: Average on-disk page image size seen, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2063 +#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2064 /*! * cache_walk: Clean pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2064 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2065 /*! * cache_walk: Current eviction generation, only reported if cache_walk * or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2065 +#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2066 /*! * cache_walk: Dirty pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2066 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2067 /*! * cache_walk: Entries in the root page, only reported if cache_walk or * all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2067 +#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2068 /*! * cache_walk: Internal pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2068 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2069 /*! * cache_walk: Leaf pages currently in cache, only reported if cache_walk * or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2069 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2070 /*! * cache_walk: Maximum difference between current eviction generation * when the page was last considered, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2070 +#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2071 /*! * cache_walk: Maximum page size seen, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2071 +#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2072 /*! * cache_walk: Minimum on-disk page image size seen, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2072 +#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2073 /*! * cache_walk: On-disk page image sizes smaller than a single allocation * unit, only reported if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2073 +#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2074 /*! * cache_walk: Pages created in memory and never written, only reported * if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2074 +#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2075 /*! * cache_walk: Pages currently queued for eviction, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2075 +#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2076 /*! * cache_walk: Pages that could not be queued for eviction, only reported * if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2076 +#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2077 /*! * cache_walk: Refs skipped during cache traversal, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2077 +#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2078 /*! * cache_walk: Size of the root page, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2078 +#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2079 /*! * cache_walk: Total number of pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES 2079 +#define WT_STAT_DSRC_CACHE_STATE_PAGES 2080 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2080 +#define WT_STAT_DSRC_COMPRESS_READ 2081 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2081 +#define WT_STAT_DSRC_COMPRESS_WRITE 2082 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2082 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2083 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2083 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2084 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2084 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2085 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2085 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2086 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2086 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2087 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2087 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2088 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2088 +#define WT_STAT_DSRC_CURSOR_CREATE 2089 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2089 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2090 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2090 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2091 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2091 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2092 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2092 +#define WT_STAT_DSRC_CURSOR_INSERT 2093 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2093 +#define WT_STAT_DSRC_CURSOR_NEXT 2094 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2094 +#define WT_STAT_DSRC_CURSOR_PREV 2095 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2095 +#define WT_STAT_DSRC_CURSOR_REMOVE 2096 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2096 +#define WT_STAT_DSRC_CURSOR_RESET 2097 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2097 +#define WT_STAT_DSRC_CURSOR_RESTART 2098 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2098 +#define WT_STAT_DSRC_CURSOR_SEARCH 2099 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2099 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2100 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2100 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2101 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2101 +#define WT_STAT_DSRC_CURSOR_UPDATE 2102 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2102 +#define WT_STAT_DSRC_REC_DICTIONARY 2103 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2103 +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2104 /*! * reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2104 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2105 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2105 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2106 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2106 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2107 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2107 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2108 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2108 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2109 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2109 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2110 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2110 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2111 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2111 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2112 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2112 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2113 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2113 +#define WT_STAT_DSRC_REC_PAGES 2114 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2114 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2115 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2115 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2116 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2116 +#define WT_STAT_DSRC_SESSION_COMPACT 2117 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2117 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2118 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2118 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2119 /*! * @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index e18563dd2d2..da318ad8a86 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -106,6 +106,8 @@ struct __wt_col; typedef struct __wt_col WT_COL; struct __wt_col_rle; typedef struct __wt_col_rle WT_COL_RLE; +struct __wt_col_var_repeat; + typedef struct __wt_col_var_repeat WT_COL_VAR_REPEAT; struct __wt_colgroup; typedef struct __wt_colgroup WT_COLGROUP; struct __wt_compact_state; @@ -266,8 +268,6 @@ struct __wt_ref; typedef struct __wt_ref WT_REF; struct __wt_row; typedef struct __wt_row WT_ROW; -struct __wt_rwlock; - typedef struct __wt_rwlock WT_RWLOCK; struct __wt_salvage_cookie; typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE; struct __wt_save_upd; @@ -302,6 +302,8 @@ union __wt_lsn; typedef union __wt_lsn WT_LSN; union __wt_rand_state; typedef union __wt_rand_state WT_RAND_STATE; +union __wt_rwlock; + typedef union __wt_rwlock WT_RWLOCK; /* * Forward type declarations for internal types: END * DO NOT EDIT: automatically built by dist/s_typedef. diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 413df312a15..3477ca52502 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -24,7 +24,7 @@ static int __log_write_internal( * __log_wait_for_earlier_slot -- * Wait for write_lsn to catch up to this slot. */ -static void +static int __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; @@ -41,16 +41,18 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * unlock in case an earlier thread is trying to switch its * slot and complete its operation. */ + WT_RET(WT_SESSION_CHECK_PANIC(session)); if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); if (++yield_count < WT_THOUSAND) __wt_yield(); else - __wt_cond_wait(session, log->log_write_cond, 200); + __wt_cond_wait(session, log->log_write_cond, 200, NULL); if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_lock(session, &log->log_slot_lock); } + return (0); } /* @@ -62,16 +64,21 @@ static int __log_fs_write(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf) { + WT_DECL_RET; + /* * If we're writing into a new log file, we have to wait for all * writes to the previous log file to complete otherwise there could * be a hole at the end of the previous log file that we cannot detect. */ if (slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) { - __log_wait_for_earlier_slot(session, slot); + WT_RET(__log_wait_for_earlier_slot(session, slot)); WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); } - return (__wt_write(session, slot->slot_fh, offset, len, buf)); + if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) + WT_PANIC_MSG(session, ret, + "%s: fatal log failure", slot->slot_fh->name); + return (ret); } /* @@ -89,7 +96,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) log = conn->log; log->ckpt_lsn = *ckp_lsn; if (conn->log_cond != NULL) - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); } /* @@ -105,6 +112,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) conn = S2C(session); log = conn->log; + WT_RET(WT_SESSION_CHECK_PANIC(session)); WT_RET(__wt_log_force_write(session, 1, NULL)); __wt_log_wrlsn(session, NULL); if (start) @@ -169,8 +177,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) * log file ready to close. */ while (log->sync_lsn.l.file < min_lsn->l.file) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); __wt_cond_signal(session, S2C(session)->log_file_cond); - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); } __wt_spin_lock(session, &log->log_sync_lock); WT_ASSERT(session, log->log_dir_fh != NULL); @@ -895,12 +904,12 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) */ create_log = true; if (conn->log_prealloc > 0 && !conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (conn->hot_backup) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); else { ret = __log_alloc_prealloc(session, log->fileid); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); /* * If ret is 0 it means we found a pre-allocated file. @@ -915,7 +924,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) else { WT_STAT_CONN_INCR(session, log_prealloc_missed); if (conn->log_cond != NULL) - __wt_cond_auto_signal( + __wt_cond_signal( session, conn->log_cond); } } @@ -1029,12 +1038,12 @@ __log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset) log = conn->log; if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (conn->hot_backup) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); else { ret = __wt_ftruncate(session, log_fh, offset); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); if (ret != ENOTSUP) return (ret); F_SET(log, WT_LOG_TRUNCATE_NOTSUP); @@ -1462,7 +1471,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * be holes in the log file. */ WT_STAT_CONN_INCR(session, log_release_write_lsn); - __log_wait_for_earlier_slot(session, slot); + WT_ERR(__log_wait_for_earlier_slot(session, slot)); log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; @@ -1483,6 +1492,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * current fsync completes and advance log->sync_lsn. */ while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); /* * We have to wait until earlier log files have finished their * sync operations. The most recent one will set the LSN to the @@ -1490,7 +1500,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) */ if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file || __wt_spin_trylock(session, &log->log_sync_lock) != 0) { - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait( + session, log->log_sync_cond, 10000, NULL); continue; } locked = true; @@ -1655,10 +1666,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); if (logcount == 0) - /* - * Return it is not supported if none don't exist. - */ - return (ENOTSUP); + WT_RET_MSG(session, ENOTSUP, "no log files found"); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); @@ -1674,6 +1682,10 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, &log_fh, WT_LOG_FILENAME, start_lsn.l.file, WT_LOG_OPEN_VERIFY)); WT_ERR(__wt_filesize(session, log_fh, &log_size)); rd_lsn = start_lsn; + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Recovering log %" PRIu32 " through %" PRIu32, + rd_lsn.l.file, end_lsn.l.file); WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf)); WT_ERR(__wt_scr_alloc(session, 0, &decryptitem)); @@ -1722,6 +1734,11 @@ advance: WT_ERR(__log_openfile(session, &log_fh, WT_LOG_FILENAME, rd_lsn.l.file, WT_LOG_OPEN_VERIFY)); + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Recovering log %" PRIu32 + " through %" PRIu32, + rd_lsn.l.file, end_lsn.l.file); WT_ERR(__wt_filesize(session, log_fh, &log_size)); eol = false; continue; @@ -2120,7 +2137,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_STAT_CONN_INCR(session, log_writes); - __wt_log_slot_join(session, rdup_len, flags, &myslot); + /* + * The only time joining a slot should ever return an error is if it + * detects a panic. + */ + WT_ERR(__wt_log_slot_join(session, rdup_len, flags, &myslot)); /* * If the addition of this record crosses the buffer boundary, * switch in a new slot. @@ -2154,7 +2175,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, * XXX I've seen times when conditions are NULL. */ if (conn->log_cond != NULL) { - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); __wt_yield(); } else WT_ERR(__wt_log_force_write(session, 1, NULL)); @@ -2162,13 +2183,19 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, if (LF_ISSET(WT_LOG_FLUSH)) { /* Wait for our writes to reach the OS */ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) - __wt_cond_wait(session, log->log_write_cond, 10000); + myslot.slot->slot_error == 0) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); + __wt_cond_wait( + session, log->log_write_cond, 10000, NULL); + } } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) - __wt_cond_wait(session, log->log_sync_cond, 10000); + myslot.slot->slot_error == 0) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); + __wt_cond_wait( + session, log->log_sync_cond, 10000, NULL); + } } /* @@ -2193,12 +2220,12 @@ err: /* * If one of the sync flags is set, assert the proper LSN has moved to - * match. + * match on success. */ - WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) || + WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) || __wt_log_cmp(&log->write_lsn, &lsn) >= 0); - WT_ASSERT(session, - !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); + WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) || + __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); return (ret); } diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index a29a34e5652..b4655ff6c1a 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -8,6 +8,49 @@ #include "wt_internal.h" +#ifdef HAVE_DIAGNOSTIC +/* + * __log_slot_dump -- + * Dump the entire slot state. + */ +static void +__log_slot_dump(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int earliest, i; + + conn = S2C(session); + log = conn->log; + earliest = 0; + for (i = 0; i < WT_SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + if (__wt_log_cmp(&slot->slot_release_lsn, + &log->slot_pool[earliest].slot_release_lsn) < 0) + earliest = i; + __wt_errx(session, "Slot %d:", i); + __wt_errx(session, " State: %" PRIx64 " Flags: %" PRIx32, + slot->slot_state, slot->flags); + __wt_errx(session, " Start LSN: %" PRIu32 "/%" PRIu32, + slot->slot_start_lsn.l.file, slot->slot_start_lsn.l.offset); + __wt_errx(session, " End LSN: %" PRIu32 "/%" PRIu32, + slot->slot_end_lsn.l.file, slot->slot_end_lsn.l.offset); + __wt_errx(session, " Release LSN: %" PRIu32 "/%" PRIu32, + slot->slot_release_lsn.l.file, + slot->slot_release_lsn.l.offset); + __wt_errx(session, " Offset: start: %" PRIuMAX + " last:%" PRIuMAX, (uintmax_t)slot->slot_start_offset, + (uintmax_t)slot->slot_last_offset); + __wt_errx(session, " Unbuffered: %" PRId64 + " error: %" PRId32, slot->slot_unbuffered, + slot->slot_error); + } + __wt_errx(session, "Earliest slot: %d", earliest); + +} +#endif + /* * __wt_log_slot_activate -- * Initialize a slot to become active. @@ -21,7 +64,6 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) conn = S2C(session); log = conn->log; - slot->slot_state = 0; /* * !!! slot_release_lsn must be set outside this function because * this function may be called after a log file switch and the @@ -30,12 +72,19 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * set for closing the file handle on a log file switch. The flags * are reset when the slot is freed. See log_slot_free. */ + slot->slot_unbuffered = 0; slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn; slot->slot_start_offset = log->alloc_lsn.l.offset; slot->slot_last_offset = log->alloc_lsn.l.offset; slot->slot_fh = log->log_fh; slot->slot_error = 0; - slot->slot_unbuffered = 0; + WT_DIAGNOSTIC_YIELD; + /* + * Set the slot state last. Other threads may have a stale pointer + * to this slot and could try to alter the state and other fields once + * they see the state cleared. + */ + WT_PUBLISH(slot->slot_state, 0); } /* @@ -50,6 +99,10 @@ __log_slot_close( WT_CONNECTION_IMPL *conn; WT_LOG *log; int64_t end_offset, new_state, old_state; +#ifdef HAVE_DIAGNOSTIC + struct timespec begin, now; + int count; +#endif WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); WT_ASSERT(session, releasep != NULL); @@ -101,9 +154,33 @@ retry: * that value. If the state is unbuffered, wait for the unbuffered * size to be set. */ - while (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state) && - slot->slot_unbuffered == 0) - __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + count = 0; + __wt_epoch(session, &begin); +#endif + if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) { + while (slot->slot_unbuffered == 0) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); + __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + ++count; + if (count > WT_MILLION) { + __wt_epoch(session, &now); + if (WT_TIMEDIFF_SEC(now, begin) > 10) { + __wt_errx(session, "SLOT_CLOSE: Slot %" + PRIu32 " Timeout unbuffered, state 0x%" + PRIx64 " unbuffered %" PRIu64, + (uint32_t)(slot - &log->slot_pool[0]), + slot->slot_state, + slot->slot_unbuffered); + __log_slot_dump(session); + __wt_abort(session); + } + count = 0; + } +#endif + } + } end_offset = WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered; @@ -143,6 +220,7 @@ __log_slot_switch_internal( if (slot != log->active_slot) return (0); + WT_RET(WT_SESSION_CHECK_PANIC(session)); /* * We may come through here multiple times if we were able to close * a slot but could not set up a new one. If we closed it already, @@ -218,7 +296,11 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOGSLOT *slot; - int32_t i; + int32_t i, pool_i; +#ifdef HAVE_DIAGNOSTIC + struct timespec begin, now; + int count; +#endif WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); conn = S2C(session); @@ -232,16 +314,22 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_LOG_SLOT_OPEN(slot->slot_state)) return (0); +#ifdef HAVE_DIAGNOSTIC + count = 0; + __wt_epoch(session, &begin); +#endif /* * Keep trying until we can find a free slot. */ for (;;) { /* - * For now just restart at 0. We could use log->pool_index - * if that is inefficient. + * Rotate among the slots to lessen collisions. */ - for (i = 0; i < WT_SLOT_POOL; i++) { - slot = &log->slot_pool[i]; + for (i = 0, pool_i = log->pool_index; i < WT_SLOT_POOL; + i++, pool_i++) { + if (pool_i >= WT_SLOT_POOL) + pool_i = 0; + slot = &log->slot_pool[pool_i]; if (slot->slot_state == WT_LOG_SLOT_FREE) { /* * Acquire our starting position in the @@ -256,14 +344,28 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_STAT_CONN_INCR(session, log_slot_transitions); log->active_slot = slot; + log->pool_index = pool_i; return (0); } } /* * If we didn't find any free slots signal the worker thread. */ - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + ++count; + if (count > WT_MILLION) { + __wt_epoch(session, &now); + if (WT_TIMEDIFF_SEC(now, begin) > 10) { + __wt_errx(session, + "SLOT_NEW: Timeout free slot"); + __log_slot_dump(session); + __wt_abort(session); + } + count = 0; + } +#endif } /* NOTREACHED */ } @@ -311,10 +413,13 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) /* * We cannot initialize the release LSN in the activate function * because that function can be called after a log file switch. + * The release LSN is usually the same as the slot_start_lsn except + * around a log file switch. */ slot->slot_release_lsn = log->alloc_lsn; __wt_log_slot_activate(session, slot); log->active_slot = slot; + log->pool_index = 0; if (0) { err: while (--i >= 0) @@ -361,7 +466,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session) * __wt_log_slot_join -- * Join a consolidated logging slot. */ -void +int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) { @@ -370,53 +475,63 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_LOGSLOT *slot; int64_t flag_state, new_state, old_state, released; int32_t join_offset, new_join; -#ifdef HAVE_DIAGNOSTIC - bool unbuf_force; -#endif + bool unbuffered, yld; conn = S2C(session); log = conn->log; WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + WT_ASSERT(session, mysize != 0); /* * There should almost always be a slot open. */ + unbuffered = false; #ifdef HAVE_DIAGNOSTIC - unbuf_force = (++log->write_calls % WT_THOUSAND) == 0; + yld = (++log->write_calls % 7) == 0; + if ((log->write_calls % WT_THOUSAND) == 0 || + mysize > WT_LOG_SLOT_BUF_MAX) { +#else + yld = false; + if (mysize > WT_LOG_SLOT_BUF_MAX) { #endif + unbuffered = true; + F_SET(myslot, WT_MYSLOT_UNBUFFERED); + } for (;;) { WT_BARRIER(); + WT_RET(WT_SESSION_CHECK_PANIC(session)); slot = log->active_slot; old_state = slot->slot_state; - /* - * Try to join our size into the existing size and - * atomically write it back into the state. - */ - flag_state = WT_LOG_SLOT_FLAGS(old_state); - released = WT_LOG_SLOT_RELEASED(old_state); - join_offset = WT_LOG_SLOT_JOINED(old_state); -#ifdef HAVE_DIAGNOSTIC - if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) { -#else - if (mysize > WT_LOG_SLOT_BUF_MAX) { -#endif - new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; - F_SET(myslot, WT_MYSLOT_UNBUFFERED); - myslot->slot = slot; - } else - new_join = join_offset + (int32_t)mysize; - new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( - (int64_t)new_join, (int64_t)released, (int64_t)flag_state); - - /* - * Check if the slot is open for joining and we are able to - * swap in our size into the state. - */ - if (WT_LOG_SLOT_OPEN(old_state) && - __wt_atomic_casiv64( - &slot->slot_state, old_state, new_state)) - break; + if (WT_LOG_SLOT_OPEN(old_state)) { + /* + * Try to join our size into the existing size and + * atomically write it back into the state. + */ + flag_state = WT_LOG_SLOT_FLAGS(old_state); + released = WT_LOG_SLOT_RELEASED(old_state); + join_offset = WT_LOG_SLOT_JOINED(old_state); + if (unbuffered) + new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; + else + new_join = join_offset + (int32_t)mysize; + new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( + (int64_t)new_join, (int64_t)released, + (int64_t)flag_state); + + /* + * Braces used due to potential empty body warning. + */ + if (yld) { + WT_DIAGNOSTIC_YIELD; + } + /* + * Attempt to swap our size into the state. + */ + if (__wt_atomic_casiv64( + &slot->slot_state, old_state, new_state)) + break; + } /* * The slot is no longer open or we lost the race to * update it. Yield and try again. @@ -428,8 +543,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, * We joined this slot. Fill in our information to return to * the caller. */ - if (mysize != 0) - WT_STAT_CONN_INCR(session, log_slot_joins); + WT_STAT_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FLUSH)) @@ -444,6 +558,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, myslot->slot = slot; myslot->offset = join_offset; myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize); + return (0); } /* @@ -468,6 +583,7 @@ __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) * was written rather than the beginning record of the slot. */ while ((cur_offset = slot->slot_last_offset) < my_start) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); /* * Set our offset if we are larger. */ diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 839648b97d7..60afbc99ade 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -304,7 +304,7 @@ __clsm_leave(WT_CURSOR_LSM *clsm) * byte, if the application uses two leading DC4 byte for some reason, we'll do * a wasted data copy each time a new value is inserted into the object. */ -static const WT_ITEM __tombstone = { "\x14\x14", 2, 0, NULL, 0 }; +static const WT_ITEM __tombstone = { "\x14\x14", 2, NULL, 0, 0 }; /* * __clsm_deleted -- @@ -1692,8 +1692,8 @@ __wt_clsm_open(WT_SESSION_IMPL *session, bulk = cval.val != 0; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree)); + ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree); + /* * Check whether the exclusive open for a bulk load succeeded, and * if it did ensure that it's safe to bulk load into the tree. diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index cbd83a5cd30..6dc06146179 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -387,8 +387,8 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) __wt_sleep(0, 10000); if (TAILQ_EMPTY(&conn->lsmqh)) continue; - __wt_spin_lock(session, &conn->dhandle_lock); - F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readlock(session, &conn->dhandle_lock); + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); dhandle_locked = true; TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { if (!lsm_tree->active) @@ -448,14 +448,14 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) session, WT_LSM_WORK_MERGE, 0, lsm_tree)); } } - __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readunlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); dhandle_locked = false; } err: if (dhandle_locked) { - __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readunlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); } return (ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c index 150de968722..21e8991be94 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c @@ -33,9 +33,7 @@ __curstat_lsm_init( "checkpoint=" WT_CHECKPOINT, NULL, NULL }; locked = false; - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); WT_ERR(__wt_scr_alloc(session, 0, &uribuf)); /* Propagate all, fast and/or clear to the cursors we open. */ diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 38d87dd852b..a9275976023 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -38,7 +38,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) /* We may be destroying an lsm_tree before it was added. */ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { WT_ASSERT(session, final || - F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q); } @@ -321,9 +321,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, metadata = NULL; /* If the tree can be opened, it already exists. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - if (ret == 0) { + if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } @@ -339,7 +337,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_tree_open(session, uri, true, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); @@ -404,6 +402,9 @@ __lsm_tree_find(WT_SESSION_IMPL *session, } *treep = lsm_tree; + + WT_ASSERT(session, lsm_tree->excl_session == + (exclusive ? session : NULL)); return (0); } @@ -456,7 +457,8 @@ __lsm_tree_open(WT_SESSION_IMPL *session, conn = S2C(session); lsm_tree = NULL; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); /* Start the LSM manager thread if it isn't running. */ if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1)) @@ -469,7 +471,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, /* Try to open the tree. */ WT_RET(__wt_calloc_one(session, &lsm_tree)); - WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree")); + __wt_rwlock_init(session, &lsm_tree->rwlock); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); @@ -520,14 +522,21 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session, { WT_DECL_RET; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - - ret = __lsm_tree_find(session, uri, exclusive, treep); + /* + * Dropping and re-acquiring the lock is safe here, since the tree open + * call checks to see if another thread beat it to opening the tree + * before proceeding. + */ + if (exclusive) + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_tree_find(session, uri, exclusive, treep)); + else + WT_WITH_HANDLE_LIST_READ_LOCK(session, + ret = __lsm_tree_find(session, uri, exclusive, treep)); if (ret == WT_NOTFOUND) - ret = __lsm_tree_open(session, uri, exclusive, treep); + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_tree_open(session, uri, exclusive, treep)); - WT_ASSERT(session, ret != 0 || - (*treep)->excl_session == (exclusive ? session : NULL)); return (ret); } @@ -857,9 +866,7 @@ __wt_lsm_tree_alter( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -899,9 +906,7 @@ __wt_lsm_tree_drop( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_ASSERT(session, !lsm_tree->active); /* Prevent any new opens. */ @@ -934,7 +939,7 @@ __wt_lsm_tree_drop( WT_ASSERT(session, !lsm_tree->active); err: if (locked) __wt_lsm_tree_writeunlock(session, lsm_tree); - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); return (ret); @@ -960,9 +965,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -1007,7 +1010,7 @@ err: if (locked) * Discard this LSM tree structure. The first operation on the renamed * tree will create a new one. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); return (ret); @@ -1032,9 +1035,7 @@ __wt_lsm_tree_truncate( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -1068,7 +1069,7 @@ err: if (locked) * the last good version of the metadata will be used, resulting * in a valid (not truncated) tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); } @@ -1082,7 +1083,7 @@ err: if (locked) void __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { - __wt_readlock(session, lsm_tree->rwlock); + __wt_readlock(session, &lsm_tree->rwlock); /* * Diagnostic: avoid deadlocks with the schema lock: if we need it for @@ -1100,7 +1101,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); - __wt_readunlock(session, lsm_tree->rwlock); + __wt_readunlock(session, &lsm_tree->rwlock); } /* @@ -1110,7 +1111,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) void __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { - __wt_writelock(session, lsm_tree->rwlock); + __wt_writelock(session, &lsm_tree->rwlock); /* * Diagnostic: avoid deadlocks with the schema lock: if we need it for @@ -1128,7 +1129,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); - __wt_writeunlock(session, lsm_tree->rwlock); + __wt_writeunlock(session, &lsm_tree->rwlock); } /* @@ -1157,9 +1158,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) /* Tell __wt_schema_worker not to look inside the LSM tree. */ *skipp = true; - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree)); if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) WT_ERR_MSG(session, EINVAL, @@ -1356,9 +1355,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, locked = false; exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); /* * We mark that we're busy using the tree to coordinate diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index d9c185a3f58..4349acf7b55 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -276,7 +276,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; @@ -517,7 +517,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * * This will fail with EBUSY if the file is still in use. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); WT_RET(ret); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_worker.c b/src/third_party/wiredtiger/src/lsm/lsm_worker.c index b0d0758775d..ffa00c0a5e7 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_worker.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_worker.c @@ -154,7 +154,7 @@ __lsm_worker(void *arg) /* Don't busy wait if there was any work to do. */ if (!progress) { - __wt_cond_wait(session, cookie->work_cond, 10000); + __wt_cond_wait(session, cookie->work_cond, 10000, NULL); continue; } } diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c index be8b1abda31..a5ee78f9e3e 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c @@ -13,8 +13,7 @@ * Allocate and initialize a condition variable. */ int -__wt_cond_alloc(WT_SESSION_IMPL *session, - const char *name, bool is_signalled, WT_CONDVAR **condp) +__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) { WT_CONDVAR *cond; WT_DECL_RET; @@ -27,7 +26,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, WT_ERR(pthread_cond_init(&cond->cond, NULL)); cond->name = name; - cond->waiters = is_signalled ? -1 : 0; + cond->waiters = 0; *condp = cond; return (0); @@ -42,8 +41,8 @@ err: __wt_free(session, cond); * out period expires, let the caller know. */ void -__wt_cond_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) +__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { struct timespec ts; WT_DECL_RET; @@ -62,6 +61,23 @@ __wt_cond_wait_signal( WT_ERR(pthread_mutex_lock(&cond->mtx)); locked = true; + /* + * It's possible to race with threads waking us up. That's not a problem + * if there are multiple wakeups because the next wakeup will get us, or + * if we're only pausing for a short period. It's a problem if there's + * only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any + * future wakeup call), optionally check if we're OK to keep running. + * This won't ensure our caller won't just loop and call us again, but + * at least it's not our fault. + * + * Assert we're not waiting longer than a second if not checking the + * run status. + */ + WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); + if (run_func != NULL && !run_func(session)) + goto skipping; + if (usecs > 0) { __wt_epoch(session, &ts); ts.tv_sec += (time_t) @@ -81,7 +97,7 @@ __wt_cond_wait_signal( ret == ETIME || #endif ret == ETIMEDOUT) { - *signalled = false; +skipping: *signalled = false; ret = 0; } diff --git a/src/third_party/wiredtiger/src/os_posix/os_yield.c b/src/third_party/wiredtiger/src/os_posix/os_yield.c index 37d05bc1854..f7c43aae746 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_yield.c +++ b/src/third_party/wiredtiger/src/os_posix/os_yield.c @@ -16,5 +16,13 @@ void __wt_yield(void) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { + /* + * Yielding the processor isn't documented as a memory barrier, and it's + * a reasonable expectation to have. There's no reason not to explicitly + * include a barrier since we're giving up the CPU, and ensures callers + * aren't ever surprised. + */ + WT_FULL_BARRIER(); + sched_yield(); } diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c index 79c62ccd7f2..0001c6c2322 100644 --- a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c @@ -13,8 +13,7 @@ * Allocate and initialize a condition variable. */ int -__wt_cond_alloc(WT_SESSION_IMPL *session, - const char *name, bool is_signalled, WT_CONDVAR **condp) +__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) { WT_CONDVAR *cond; @@ -26,7 +25,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, InitializeConditionVariable(&cond->cond); cond->name = name; - cond->waiters = is_signalled ? -1 : 0; + cond->waiters = 0; *condp = cond; return (0); @@ -38,8 +37,8 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, * out period expires, let the caller know. */ void -__wt_cond_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) +__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { BOOL sleepret; DWORD milliseconds, windows_error; @@ -59,8 +58,26 @@ __wt_cond_wait_signal( EnterCriticalSection(&cond->mtx); locked = true; + /* + * It's possible to race with threads waking us up. That's not a problem + * if there are multiple wakeups because the next wakeup will get us, or + * if we're only pausing for a short period. It's a problem if there's + * only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any + * future wakeup call), optionally check if we're OK to keep running. + * This won't ensure our caller won't just loop and call us again, but + * at least it's not our fault. + * + * Assert we're not waiting longer than a second if not checking the + * run status. + */ + WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); + + if (run_func != NULL && !run_func(session)) + goto skipping; + if (usecs > 0) { - milliseconds64 = usecs / 1000; + milliseconds64 = usecs / WT_THOUSAND; /* * Check for 32-bit unsigned integer overflow @@ -90,7 +107,7 @@ __wt_cond_wait_signal( if (sleepret == 0) { windows_error = __wt_getlasterror(); if (windows_error == ERROR_TIMEOUT) { - *signalled = false; +skipping: *signalled = false; sleepret = 1; } } @@ -117,17 +134,17 @@ void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) { WT_DECL_RET; - bool locked; - - locked = false; __wt_verbose(session, WT_VERB_MUTEX, "signal %s", cond->name); /* - * Our callers are often setting flags to cause a thread to exit. Add - * a barrier to ensure the flags are seen by the threads. + * Our callers often set flags to cause a thread to exit. Add a barrier + * to ensure exit flags are seen by the sleeping threads, otherwise we + * can wake up a thread, it immediately goes back to sleep, and we'll + * hang. Use a full barrier (we may not write before waiting on thread + * join). */ - WT_WRITE_BARRIER(); + WT_FULL_BARRIER(); /* * Fast path if we are in (or can enter), a state where the next waiter diff --git a/src/third_party/wiredtiger/src/os_win/os_yield.c b/src/third_party/wiredtiger/src/os_win/os_yield.c index aab1559e072..038f2efe162 100644 --- a/src/third_party/wiredtiger/src/os_win/os_yield.c +++ b/src/third_party/wiredtiger/src/os_win/os_yield.c @@ -15,5 +15,13 @@ void __wt_yield(void) { + /* + * Yielding the processor isn't documented as a memory barrier, and it's + * a reasonable expectation to have. There's no reason not to explicitly + * include a barrier since we're giving up the CPU, and ensures callers + * aren't ever surprised. + */ + WT_FULL_BARRIER(); + SwitchToThread(); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_track.c b/src/third_party/wiredtiger/src/reconcile/rec_track.c index 3795b6e5ae8..5bf425b1b21 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_track.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_track.c @@ -875,9 +875,9 @@ __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__ovfl_reuse_wrapup(session, page)); if (track->ovfl_txnc[0] != NULL) { - __wt_writelock(session, S2BT(session)->ovfl_lock); + __wt_writelock(session, &S2BT(session)->ovfl_lock); ret = __ovfl_txnc_wrapup(session, page); - __wt_writeunlock(session, S2BT(session)->ovfl_lock); + __wt_writeunlock(session, &S2BT(session)->ovfl_lock); } return (ret); } @@ -903,9 +903,9 @@ __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__ovfl_reuse_wrapup_err(session, page)); if (track->ovfl_txnc[0] != NULL) { - __wt_writelock(session, S2BT(session)->ovfl_lock); + __wt_writelock(session, &S2BT(session)->ovfl_lock); ret = __ovfl_txnc_wrapup(session, page); - __wt_writeunlock(session, S2BT(session)->ovfl_lock); + __wt_writeunlock(session, &S2BT(session)->ovfl_lock); } return (ret); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index e82f449a50d..a667a288187 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -4069,7 +4069,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) /* Copy the original, disk-image bytes into place. */ memcpy(r->first_free, page->pg_fix_bitf, - __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt)); + __bitstr_size((size_t)page->entries * btree->bitcnt)); /* Update any changes to the original on-page data items. */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { @@ -4081,9 +4081,8 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) } /* Calculate the number of entries per page remainder. */ - entry = page->pg_fix_entries; - nrecs = WT_FIX_BYTES_TO_ENTRIES( - btree, r->space_avail) - page->pg_fix_entries; + entry = page->entries; + nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail) - page->entries; r->recno += entry; /* Walk any append list. */ @@ -4206,7 +4205,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session, session, r, page, pageref->ref_recno, btree->maxleafpage)); /* We may not be taking all of the entries on the original page. */ - page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take; + page_take = salvage->take == 0 ? page->entries : salvage->take; page_start = salvage->skip == 0 ? 0 : salvage->skip; /* Calculate the number of entries per page. */ diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c index c1a4f257648..49801e4e5f9 100644 --- a/src/third_party/wiredtiger/src/schema/schema_drop.c +++ b/src/third_party/wiredtiger/src/schema/schema_drop.c @@ -30,7 +30,7 @@ __drop_file( WT_RET(__wt_schema_backup_check(session, filename)); /* Close all btree handles associated with this file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, force)); WT_RET(ret); diff --git a/src/third_party/wiredtiger/src/schema/schema_list.c b/src/third_party/wiredtiger/src/schema/schema_list.c index ea7374b7554..74ef5135a4a 100644 --- a/src/third_party/wiredtiger/src/schema/schema_list.c +++ b/src/third_party/wiredtiger/src/schema/schema_list.c @@ -25,7 +25,7 @@ __schema_add_table(WT_SESSION_IMPL *session, /* Make sure the metadata is open before getting other locks. */ WT_RET(__wt_metadata_cursor(session, NULL)); - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_READ_LOCK(session, ret = __wt_schema_open_table( session, name, namelen, ok_incomplete, &table)); WT_RET(ret); diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c index f512482c162..a374f4c2831 100644 --- a/src/third_party/wiredtiger/src/schema/schema_rename.c +++ b/src/third_party/wiredtiger/src/schema/schema_rename.c @@ -33,7 +33,7 @@ __rename_file( WT_RET(__wt_schema_backup_check(session, filename)); WT_RET(__wt_schema_backup_check(session, newfile)); /* Close any btree handles in the file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, false)); WT_ERR(ret); diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c index 433224a868e..9de4b916a79 100644 --- a/src/third_party/wiredtiger/src/schema/schema_util.c +++ b/src/third_party/wiredtiger/src/schema/schema_util.c @@ -26,7 +26,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) conn = S2C(session); if (!conn->hot_backup) return (0); - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); /* * There is a window at the end of a backup where the list has been * cleared from the connection but the flag is still set. It is safe @@ -34,7 +34,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) */ if (!conn->hot_backup || (backup_list = conn->hot_backup_list) == NULL) { - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); return (0); } for (i = 0; backup_list[i] != NULL; ++i) { @@ -43,7 +43,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) break; } } - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); return (ret); } diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c index fb7f8cec074..e5f71b5d56f 100644 --- a/src/third_party/wiredtiger/src/schema/schema_worker.c +++ b/src/third_party/wiredtiger/src/schema/schema_worker.c @@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, * any open file handles, including checkpoints. */ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all( session, uri, false)); WT_ERR(ret); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index fe1bf821d3b..d282c5d0c32 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -162,7 +162,7 @@ __session_alter(WT_SESSION *wt_session, const char *uri, const char *config) cfg[1] = NULL; WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_alter(session, uri, cfg)))); err: if (ret != 0) @@ -234,9 +234,6 @@ __session_close(WT_SESSION *wt_session, const char *config) /* Release common session resources. */ WT_TRET(__wt_session_release_resources(session)); - /* Destroy the thread's mutex. */ - WT_TRET(__wt_cond_destroy(session, &session->cond)); - /* The API lock protects opening and closing of sessions. */ __wt_spin_lock(session, &conn->api_lock); @@ -521,7 +518,7 @@ __wt_session_create( WT_DECL_RET; WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_create(session, uri, config))); return (ret); } @@ -769,7 +766,7 @@ __session_rename(WT_SESSION *wt_session, WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_rename(session, uri, newuri, cfg)))); err: if (ret != 0) @@ -858,21 +855,22 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config) if (lock_wait) WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, ret = + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_drop(session, uri, cfg)))); else WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret, - WT_WITH_TABLE_LOCK_NOWAIT(session, ret, ret = + WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, + ret = __wt_schema_drop(session, uri, cfg)))); } else { if (lock_wait) WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_drop(session, uri, cfg))); else WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret, - WT_WITH_TABLE_LOCK_NOWAIT(session, ret, + WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, ret = __wt_schema_drop(session, uri, cfg))); } @@ -1489,6 +1487,20 @@ err: API_END_RET(session, ret); } /* + * __transaction_sync_run_chk -- + * Check to decide if the transaction sync call should continue running. + */ +static bool +__transaction_sync_run_chk(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + return (FLD_ISSET(conn->flags, WT_CONN_LOG_SERVER_RUN)); +} + +/* * __session_transaction_sync -- * WT_SESSION->transaction_sync method. */ @@ -1502,7 +1514,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) WT_SESSION_IMPL *session; WT_TXN *txn; struct timespec now, start; - uint64_t timeout_ms, waited_ms; + uint64_t remaining_usec, timeout_ms, waited_ms; bool forever; session = (WT_SESSION_IMPL *)wt_session; @@ -1555,22 +1567,20 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) __wt_epoch(session, &start); /* * Keep checking the LSNs until we find it is stable or we reach - * our timeout. + * our timeout, or there's some other reason to quit. */ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { + if (!__transaction_sync_run_chk(session)) + WT_ERR(ETIMEDOUT); + __wt_cond_signal(session, conn->log_file_cond); __wt_epoch(session, &now); waited_ms = WT_TIMEDIFF_MS(now, start); - if (forever || waited_ms < timeout_ms) - /* - * Note, we will wait an increasing amount of time - * each iteration, likely doubling. Also note that - * the function timeout value is in usecs (we are - * computing the wait time in msecs and passing that - * in, unchanged, as the usecs to wait). - */ - __wt_cond_wait(session, log->log_sync_cond, waited_ms); - else + if (forever || waited_ms < timeout_ms) { + remaining_usec = (timeout_ms - waited_ms) * WT_THOUSAND; + __wt_cond_wait(session, log->log_sync_cond, + remaining_usec, __transaction_sync_run_chk); + } else WT_ERR(ETIMEDOUT); } @@ -1686,7 +1696,7 @@ __session_snapshot(WT_SESSION *wt_session, const char *config) WT_ERR(__wt_txn_named_snapshot_config( session, cfg, &has_create, &has_drop)); - __wt_writelock(session, txn_global->nsnap_rwlock); + __wt_writelock(session, &txn_global->nsnap_rwlock); /* Drop any snapshots to be removed first. */ if (has_drop) @@ -1696,7 +1706,7 @@ __session_snapshot(WT_SESSION *wt_session, const char *config) if (has_create) WT_ERR(__wt_txn_named_snapshot_begin(session, cfg)); -err: __wt_writeunlock(session, txn_global->nsnap_rwlock); +err: __wt_writeunlock(session, &txn_global->nsnap_rwlock); API_END_RET_NOTFOUND_MAP(session, ret); } @@ -1825,8 +1835,6 @@ __open_session(WT_CONNECTION_IMPL *conn, session_ret->name = NULL; session_ret->id = i; - WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond)); - if (WT_SESSION_FIRST_USE(session_ret)) __wt_random_init(&session_ret->rnd); diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c index 732dc797b6d..ee9bddbfc19 100644 --- a/src/third_party/wiredtiger/src/session/session_dhandle.c +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -44,8 +44,7 @@ __session_discard_dhandle( TAILQ_REMOVE(&session->dhandles, dhandle_cache, q); TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq); - (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1); - + WT_DHANDLE_RELEASE(dhandle_cache->dhandle); __wt_overwrite_and_free(session, dhandle_cache); } @@ -181,17 +180,17 @@ __wt_session_lock_dhandle( */ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && (!want_exclusive || lock_busy)) { - __wt_readlock(session, dhandle->rwlock); + __wt_readlock(session, &dhandle->rwlock); if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { *is_deadp = 1; - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); return (0); } is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN); if (is_open && !want_exclusive) return (0); - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); } else is_open = false; @@ -201,10 +200,11 @@ __wt_session_lock_dhandle( * with another thread that successfully opens the file, we * don't want to block waiting to get exclusive access. */ - if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) { + if ((ret = + __wt_try_writelock(session, &dhandle->rwlock)) == 0) { if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { *is_deadp = 1; - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); return (0); } @@ -215,7 +215,7 @@ __wt_session_lock_dhandle( if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && !want_exclusive) { lock_busy = false; - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); continue; } @@ -286,9 +286,9 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) if (locked) { if (write_locked) { F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); } else - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); } session->dhandle = NULL; @@ -411,17 +411,27 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) /* * __session_find_shared_dhandle -- * Search for a data handle in the connection and add it to a session's - * cache. Since the data handle isn't locked, this must be called holding - * the handle list lock, and we must increment the handle's reference - * count before releasing it. + * cache. We must increment the handle's reference count while holding + * the handle list lock. */ static int __session_find_shared_dhandle( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { - WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint)); - (void)__wt_atomic_add32(&session->dhandle->session_ref, 1); - return (0); + WT_DECL_RET; + + WT_WITH_HANDLE_LIST_READ_LOCK(session, + if ((ret = __wt_conn_dhandle_find(session, uri, checkpoint)) == 0) + WT_DHANDLE_ACQUIRE(session->dhandle)); + + if (ret != WT_NOTFOUND) + return (ret); + + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + if ((ret = __wt_conn_dhandle_alloc(session, uri, checkpoint)) == 0) + WT_DHANDLE_ACQUIRE(session->dhandle)); + + return (ret); } /* @@ -449,16 +459,16 @@ __session_get_dhandle( * We didn't find a match in the session cache, search the shared * handle list and cache the handle we find. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __session_find_shared_dhandle(session, uri, checkpoint)); - WT_RET(ret); + WT_RET(__session_find_shared_dhandle(session, uri, checkpoint)); /* * Fixup the reference count on failure (we incremented the reference * count while holding the handle-list lock). */ - if ((ret = __session_add_dhandle(session)) != 0) - (void)__wt_atomic_sub32(&session->dhandle->session_ref, 1); + if ((ret = __session_add_dhandle(session)) != 0) { + WT_DHANDLE_RELEASE(session->dhandle); + session->dhandle = NULL; + } return (ret); } @@ -504,17 +514,15 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, * reopen handles in the meantime. A combination of the schema * and handle list locks are used to enforce this. */ - if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { + if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { dhandle->excl_session = NULL; dhandle->excl_ref = 0; F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); WT_WITH_SCHEMA_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_session_get_btree( - session, uri, checkpoint, cfg, flags))); + ret = __wt_session_get_btree( + session, uri, checkpoint, cfg, flags)); return (ret); } @@ -531,7 +539,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, dhandle->excl_session = NULL; dhandle->excl_ref = 0; F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); WT_RET(ret); } diff --git a/src/third_party/wiredtiger/src/support/cond_auto.c b/src/third_party/wiredtiger/src/support/cond_auto.c index a3ae67f5baa..600e5eab0ff 100644 --- a/src/third_party/wiredtiger/src/support/cond_auto.c +++ b/src/third_party/wiredtiger/src/support/cond_auto.c @@ -1,29 +1,9 @@ /*- - * Public Domain 2014-2016 MongoDB, Inc. - * Public Domain 2008-2014 WiredTiger, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. * - * This is free and unencumbered software released into the public domain. - * - * Anyone is free to copy, modify, publish, use, compile, sell, or - * distribute this software, either in source code form or as a compiled - * binary, for any purpose, commercial or non-commercial, and by any - * means. - * - * In jurisdictions that recognize copyright laws, the author or authors - * of this software dedicate any and all copyright interest in the - * software to the public domain. We make this dedication for the benefit - * of the public at large and to the detriment of our heirs and - * successors. We intend this dedication to be an overt act of - * relinquishment in perpetuity of all present and future rights to this - * software under copyright law. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. + * See the file LICENSE for redistribution information. */ #include "wt_internal.h" @@ -38,13 +18,12 @@ * Allocate and initialize an automatically adjusting condition variable. */ int -__wt_cond_auto_alloc( - WT_SESSION_IMPL *session, const char *name, - bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) +__wt_cond_auto_alloc(WT_SESSION_IMPL *session, + const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) { WT_CONDVAR *cond; - WT_RET(__wt_cond_alloc(session, name, is_signalled, condp)); + WT_RET(__wt_cond_alloc(session, name, condp)); cond = *condp; cond->min_wait = min; @@ -55,33 +34,19 @@ __wt_cond_auto_alloc( } /* - * __wt_cond_auto_signal -- - * Signal a condition variable. - */ -void -__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) -{ - - WT_ASSERT(session, cond->min_wait != 0); - __wt_cond_signal(session, cond); -} - -/* * __wt_cond_auto_wait_signal -- * Wait on a mutex, optionally timing out. If we get it before the time * out period expires, let the caller know. - * TODO: Can this version of the API be removed, now that we have the - * auto adjusting condition variables? */ void -__wt_cond_auto_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) +__wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { uint64_t delta; /* * Catch cases where this function is called with a condition variable - * that was initialized non-auto. + * that wasn't initialized to do automatic adjustments. */ WT_ASSERT(session, cond->min_wait != 0); @@ -94,7 +59,8 @@ __wt_cond_auto_wait_signal( cond->max_wait, cond->prev_wait + delta); } - __wt_cond_wait_signal(session, cond, cond->prev_wait, signalled); + __wt_cond_wait_signal( + session, cond, cond->prev_wait, run_func, signalled); if (progress || *signalled) WT_STAT_CONN_INCR(session, cond_auto_wait_reset); @@ -108,24 +74,10 @@ __wt_cond_auto_wait_signal( * out period expires, let the caller know. */ void -__wt_cond_auto_wait( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) +__wt_cond_auto_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) { - bool signalled; - - /* - * Call the signal version so the wait period is reset if the - * condition is woken explicitly. - */ - __wt_cond_auto_wait_signal(session, cond, progress, &signalled); -} + bool notused; -/* - * __wt_cond_auto_destroy -- - * Destroy a condition variable. - */ -int -__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) -{ - return (__wt_cond_destroy(session, condp)); + __wt_cond_auto_wait_signal(session, cond, progress, run_func, ¬used); } diff --git a/src/third_party/wiredtiger/src/support/mtx_rw.c b/src/third_party/wiredtiger/src/support/mtx_rw.c index ea18f556257..35ad5da23f2 100644 --- a/src/third_party/wiredtiger/src/support/mtx_rw.c +++ b/src/third_party/wiredtiger/src/support/mtx_rw.c @@ -115,23 +115,27 @@ #include "wt_internal.h" /* - * __wt_rwlock_alloc -- - * Allocate and initialize a read/write lock. + * __wt_rwlock_init -- + * Initialize a read/write lock. */ -int -__wt_rwlock_alloc( - WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name) +void +__wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - WT_RWLOCK *rwlock; - - __wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name); + WT_UNUSED(session); - WT_RET(__wt_calloc_one(session, &rwlock)); + l->u = 0; +} - rwlock->name = name; +/* + * __wt_rwlock_destroy -- + * Destroy a read/write lock. + */ +void +__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l) +{ + WT_UNUSED(session); - *rwlockp = rwlock; - return (0); + l->u = 0; } /* @@ -139,13 +143,12 @@ __wt_rwlock_alloc( * Try to get a shared lock, fail immediately if unavailable. */ int -__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new, old; + WT_RWLOCK new, old; WT_STAT_CONN_INCR(session, rwlock_read); - l = &rwlock->rwlock; new = old = *l; /* @@ -172,19 +175,15 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * exclusive. */ void -__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - - l = &rwlock->rwlock; - /* * Try to get the lock in a single operation if it is available to * readers. This avoids the situation where multiple readers arrive * concurrently and have to line up in order to enter the lock. For * read-heavy workloads it can make a significant difference. */ - while (__wt_try_readlock(session, rwlock) != 0) { + while (__wt_try_readlock(session, l) != 0) { if (l->s.writers_active > 0) __wt_yield(); else @@ -197,9 +196,8 @@ __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Get a shared lock. */ void -__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; uint16_t ticket; int pause_cnt; @@ -207,8 +205,6 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_DIAGNOSTIC_YIELD; - l = &rwlock->rwlock; - /* * Possibly wrap: if we have more than 64K lockers waiting, the ticket * value will wrap and two lockers will simultaneously be granted the @@ -246,14 +242,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Release a shared lock. */ void -__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - WT_UNUSED(session); - l = &rwlock->rwlock; - /* * Increment the writers value (other readers are doing the same, make * sure we don't race). @@ -266,13 +258,12 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Try to get an exclusive lock, fail immediately if unavailable. */ int -__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new, old; + WT_RWLOCK new, old; WT_STAT_CONN_INCR(session, rwlock_write); - l = &rwlock->rwlock; old = new = *l; /* @@ -296,16 +287,13 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Wait to get an exclusive lock. */ void -__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; uint16_t ticket; int pause_cnt; WT_STAT_CONN_INCR(session, rwlock_write); - l = &rwlock->rwlock; - /* * Possibly wrap: if we have more than 64K lockers waiting, the ticket * value will wrap and two lockers will simultaneously be granted the @@ -338,13 +326,12 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Release an exclusive lock. */ void -__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new; + WT_RWLOCK new; WT_UNUSED(session); - l = &rwlock->rwlock; (void)__wt_atomic_sub16(&l->s.writers_active, 1); /* @@ -368,40 +355,16 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_DIAGNOSTIC_YIELD; } -/* - * __wt_rwlock_destroy -- - * Destroy a read/write lock. - */ -void -__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp) -{ - WT_RWLOCK *rwlock; - - rwlock = *rwlockp; /* Clear our caller's reference. */ - if (rwlock == NULL) - return; - *rwlockp = NULL; - - __wt_verbose( - session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name); - - __wt_free(session, rwlock); -} - #ifdef HAVE_DIAGNOSTIC /* * __wt_rwlock_islocked -- * Return if a read/write lock is currently locked for reading or writing. */ bool -__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - WT_UNUSED(session); - l = &rwlock->rwlock; - return (l->s.writers != l->s.next || l->s.readers != l->s.next); } #endif diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c index a5b229b9abc..4fae43edc8e 100644 --- a/src/third_party/wiredtiger/src/support/rand.c +++ b/src/third_party/wiredtiger/src/support/rand.c @@ -120,3 +120,15 @@ __wt_random(WT_RAND_STATE volatile * rnd_state) return ((z << 16) + (w & 65535)); } + +/* + * __wt_random64 -- + * Return a 64-bit pseudo-random number. + */ +uint64_t +__wt_random64(WT_RAND_STATE volatile * rnd_state) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + return (((uint64_t)__wt_random(rnd_state) << 32) + + __wt_random(rnd_state)); +} diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index a9c0b24ef29..fd38e1b79ee 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -64,6 +64,7 @@ static const char * const __stats_dsrc_desc[] = { "cache: pages requested from the cache", "cache: pages written from cache", "cache: pages written requiring in-memory restoration", + "cache: tracked dirty bytes in the cache", "cache: unmodified pages evicted", "cache_walk: Average difference between current eviction generation when the page was last considered", "cache_walk: Average on-disk page image size seen", @@ -225,6 +226,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_pages_requested = 0; stats->cache_write = 0; stats->cache_write_restore = 0; + /* not clearing cache_bytes_dirty */ stats->cache_eviction_clean = 0; /* not clearing cache_state_gen_avg_gap */ /* not clearing cache_state_avg_written_size */ @@ -372,6 +374,7 @@ __wt_stat_dsrc_aggregate_single( to->cache_pages_requested += from->cache_pages_requested; to->cache_write += from->cache_write; to->cache_write_restore += from->cache_write_restore; + to->cache_bytes_dirty += from->cache_bytes_dirty; to->cache_eviction_clean += from->cache_eviction_clean; to->cache_state_gen_avg_gap += from->cache_state_gen_avg_gap; to->cache_state_avg_written_size += @@ -535,6 +538,7 @@ __wt_stat_dsrc_aggregate( WT_STAT_READ(from, cache_pages_requested); to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); + to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); to->cache_state_gen_avg_gap += WT_STAT_READ(from, cache_state_gen_avg_gap); @@ -673,7 +677,11 @@ static const char * const __stats_connection_desc[] = { "cache: eviction server unable to reach eviction goal", "cache: eviction state", "cache: eviction walks abandoned", + "cache: eviction worker thread active", + "cache: eviction worker thread created", "cache: eviction worker thread evicting pages", + "cache: eviction worker thread removed", + "cache: eviction worker thread stable number", "cache: failed eviction of pages that exceeded the in-memory maximum", "cache: files with active eviction walks", "cache: files with new eviction walks started", @@ -751,9 +759,7 @@ static const char * const __stats_connection_desc[] = { "lock: checkpoint lock acquisitions", "lock: checkpoint lock application thread wait time (usecs)", "lock: checkpoint lock internal thread wait time (usecs)", - "lock: handle-list lock acquisitions", - "lock: handle-list lock application thread wait time (usecs)", - "lock: handle-list lock internal thread wait time (usecs)", + "lock: handle-list lock eviction thread wait time (usecs)", "lock: metadata lock acquisitions", "lock: metadata lock application thread wait time (usecs)", "lock: metadata lock internal thread wait time (usecs)", @@ -954,7 +960,11 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_slow = 0; /* not clearing cache_eviction_state */ stats->cache_eviction_walks_abandoned = 0; + /* not clearing cache_eviction_active_workers */ + stats->cache_eviction_worker_created = 0; stats->cache_eviction_worker_evicting = 0; + stats->cache_eviction_worker_removed = 0; + /* not clearing cache_eviction_stable_state_workers */ stats->cache_eviction_force_fail = 0; /* not clearing cache_eviction_walks_active */ stats->cache_eviction_walks_started = 0; @@ -1032,9 +1042,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->lock_checkpoint_count = 0; stats->lock_checkpoint_wait_application = 0; stats->lock_checkpoint_wait_internal = 0; - stats->lock_handle_list_count = 0; - stats->lock_handle_list_wait_application = 0; - stats->lock_handle_list_wait_internal = 0; + stats->lock_handle_list_wait_eviction = 0; stats->lock_metadata_count = 0; stats->lock_metadata_wait_application = 0; stats->lock_metadata_wait_internal = 0; @@ -1228,8 +1236,16 @@ __wt_stat_connection_aggregate( to->cache_eviction_state += WT_STAT_READ(from, cache_eviction_state); to->cache_eviction_walks_abandoned += WT_STAT_READ(from, cache_eviction_walks_abandoned); + to->cache_eviction_active_workers += + WT_STAT_READ(from, cache_eviction_active_workers); + to->cache_eviction_worker_created += + WT_STAT_READ(from, cache_eviction_worker_created); to->cache_eviction_worker_evicting += WT_STAT_READ(from, cache_eviction_worker_evicting); + to->cache_eviction_worker_removed += + WT_STAT_READ(from, cache_eviction_worker_removed); + to->cache_eviction_stable_state_workers += + WT_STAT_READ(from, cache_eviction_stable_state_workers); to->cache_eviction_force_fail += WT_STAT_READ(from, cache_eviction_force_fail); to->cache_eviction_walks_active += @@ -1331,12 +1347,8 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, lock_checkpoint_wait_application); to->lock_checkpoint_wait_internal += WT_STAT_READ(from, lock_checkpoint_wait_internal); - to->lock_handle_list_count += - WT_STAT_READ(from, lock_handle_list_count); - to->lock_handle_list_wait_application += - WT_STAT_READ(from, lock_handle_list_wait_application); - to->lock_handle_list_wait_internal += - WT_STAT_READ(from, lock_handle_list_wait_internal); + to->lock_handle_list_wait_eviction += + WT_STAT_READ(from, lock_handle_list_wait_eviction); to->lock_metadata_count += WT_STAT_READ(from, lock_metadata_count); to->lock_metadata_wait_application += WT_STAT_READ(from, lock_metadata_wait_application); diff --git a/src/third_party/wiredtiger/src/support/thread_group.c b/src/third_party/wiredtiger/src/support/thread_group.c index a866d2d01c5..2b4b7ad4e61 100644 --- a/src/third_party/wiredtiger/src/support/thread_group.c +++ b/src/third_party/wiredtiger/src/support/thread_group.c @@ -50,8 +50,7 @@ __thread_group_grow( { WT_THREAD *thread; - WT_ASSERT(session, - __wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); /* * Any bounds checking is done by the caller so we know that @@ -72,20 +71,19 @@ __thread_group_grow( /* * __thread_group_shrink -- - * Decrease the number of running threads in the group, and free any + * Decrease the number of running threads in the group. Optionally free any * memory associated with slots larger than the new count. */ static int __thread_group_shrink(WT_SESSION_IMPL *session, - WT_THREAD_GROUP *group, uint32_t new_count) + WT_THREAD_GROUP *group, uint32_t new_count, bool free_thread) { WT_DECL_RET; WT_SESSION *wt_session; WT_THREAD *thread; uint32_t current_slot; - WT_ASSERT(session, - __wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); for (current_slot = group->alloc; current_slot > new_count; ) { /* @@ -107,14 +105,15 @@ __thread_group_shrink(WT_SESSION_IMPL *session, WT_TRET(__wt_thread_join(session, thread->tid)); thread->tid = 0; } - - if (thread->session != NULL) { - wt_session = (WT_SESSION *)thread->session; - WT_TRET(wt_session->close(wt_session, NULL)); - thread->session = NULL; + if (free_thread) { + if (thread->session != NULL) { + wt_session = (WT_SESSION *)thread->session; + WT_TRET(wt_session->close(wt_session, NULL)); + thread->session = NULL; + } + __wt_free(session, thread); + group->threads[current_slot] = NULL; } - __wt_free(session, thread); - group->threads[current_slot] = NULL; } /* Update the thread group state to match our changes */ @@ -142,16 +141,19 @@ __thread_group_resize( WT_ASSERT(session, group->current_threads <= group->alloc && - __wt_rwlock_islocked(session, group->lock)); + __wt_rwlock_islocked(session, &group->lock)); if (new_min == group->min && new_max == group->max) return (0); + if (new_min > new_max) + return (EINVAL); + /* - * Coll shrink to reduce the number of thread structures and running + * Call shrink to reduce the number of thread structures and running * threads if required by the change in group size. */ - WT_RET(__thread_group_shrink(session, group, new_max)); + WT_RET(__thread_group_shrink(session, group, new_max, true)); /* * Only reallocate the thread array if it is the largest ever, since @@ -227,9 +229,9 @@ __wt_thread_group_resize( " from max: %" PRIu32 " -> %" PRIu32, (void *)group, group->min, new_min, group->max, new_max); - __wt_writelock(session, group->lock); + __wt_writelock(session, &group->lock); WT_TRET(__thread_group_resize(session, group, new_min, new_max, flags)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); return (ret); } @@ -255,17 +257,17 @@ __wt_thread_group_create( __wt_verbose(session, WT_VERB_THREAD_GROUP, "Creating thread group: %p", (void *)group); - WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group")); + __wt_rwlock_init(session, &group->lock); WT_ERR(__wt_cond_alloc( - session, "Thread group cond", false, &group->wait_cond)); + session, "thread group cond", &group->wait_cond)); cond_alloced = true; - __wt_writelock(session, group->lock); + __wt_writelock(session, &group->lock); group->run_func = run_func; group->name = name; WT_TRET(__thread_group_resize(session, group, min, max, flags)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); /* Cleanup on error to avoid leaking resources */ err: if (ret != 0) { @@ -288,10 +290,10 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) __wt_verbose(session, WT_VERB_THREAD_GROUP, "Destroying thread group: %p", (void *)group); - WT_ASSERT(session, __wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); /* Shut down all threads and free associated resources. */ - WT_TRET(__thread_group_shrink(session, group, 0)); + WT_TRET(__thread_group_shrink(session, group, 0, true)); __wt_free(session, group->threads); @@ -322,15 +324,42 @@ __wt_thread_group_start_one( return (0); if (wait) - __wt_writelock(session, group->lock); - else if (__wt_try_writelock(session, group->lock) != 0) - return (0); + __wt_writelock(session, &group->lock); + else + WT_RET(__wt_try_writelock(session, &group->lock)); /* Recheck the bounds now that we hold the lock */ if (group->current_threads < group->max) WT_TRET(__thread_group_grow( session, group, group->current_threads + 1)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); + + return (ret); +} + +/* + * __wt_thread_group_stop_one -- + * Stop one thread if possible. + */ +int +__wt_thread_group_stop_one( + WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) +{ + WT_DECL_RET; + + if (group->current_threads <= group->min) + return (0); + + if (wait) + __wt_writelock(session, &group->lock); + else + WT_RET(__wt_try_writelock(session, &group->lock)); + + /* Recheck the bounds now that we hold the lock */ + if (group->current_threads > group->min) + WT_TRET(__thread_group_shrink( + session, group, group->current_threads - 1, false)); + __wt_writeunlock(session, &group->lock); return (ret); } diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 26a0ed679e2..e5e59c2b901 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -126,7 +126,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) n = 0; /* We're going to scan the table: wait for the lock. */ - __wt_readlock_spin(session, txn_global->scan_rwlock); + __wt_readlock_spin(session, &txn_global->scan_rwlock); current_id = pinned_id = txn_global->current; prev_oldest_id = txn_global->oldest_id; @@ -180,7 +180,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->pinned_id = pinned_id; -done: __wt_readunlock(session, txn_global->scan_rwlock); +done: __wt_readunlock(session, &txn_global->scan_rwlock); __txn_sort_snapshot(session, n, current_id); } @@ -293,13 +293,13 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) /* First do a read-only scan. */ if (wait) - __wt_readlock_spin(session, txn_global->scan_rwlock); + __wt_readlock_spin(session, &txn_global->scan_rwlock); else if ((ret = - __wt_try_readlock(session, txn_global->scan_rwlock)) != 0) + __wt_try_readlock(session, &txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); __txn_oldest_scan(session, &oldest_id, &last_running, &metadata_pinned, &oldest_session); - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); /* * If the state hasn't changed (or hasn't moved far enough for @@ -314,9 +314,9 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) /* It looks like an update is necessary, wait for exclusive access. */ if (wait) - __wt_writelock(session, txn_global->scan_rwlock); + __wt_writelock(session, &txn_global->scan_rwlock); else if ((ret = - __wt_try_writelock(session, txn_global->scan_rwlock)) != 0) + __wt_try_writelock(session, &txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); /* @@ -375,7 +375,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) #endif } -done: __wt_writeunlock(session, txn_global->scan_rwlock); +done: __wt_writeunlock(session, &txn_global->scan_rwlock); return (ret); } @@ -768,10 +768,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &txn_global->id_lock, "transaction id lock")); - WT_RET(__wt_rwlock_alloc(session, - &txn_global->scan_rwlock, "transaction scan lock")); - WT_RET(__wt_rwlock_alloc(session, - &txn_global->nsnap_rwlock, "named snapshot lock")); + __wt_rwlock_init(session, &txn_global->scan_rwlock); + __wt_rwlock_init(session, &txn_global->nsnap_rwlock); txn_global->nsnap_oldest_id = WT_TXN_NONE; TAILQ_INIT(&txn_global->nsnaph); @@ -805,3 +803,98 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock); __wt_free(session, txn_global->states); } + +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) +/* + * __wt_verbose_dump_txn -- + * Output diagnostic information about the global transaction state. + */ +int +__wt_verbose_dump_txn(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + WT_TXN *txn; + WT_TXN_STATE *s; + const char *iso_tag; + uint64_t id; + uint32_t i, session_cnt; + + conn = S2C(session); + txn_global = &conn->txn_global; + + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "transaction state dump")); + + WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current)); + WT_RET(__wt_msg(session, + "last running ID: %" PRIu64, txn_global->last_running)); + WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id)); + WT_RET(__wt_msg(session, + "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); + + WT_RET(__wt_msg(session, "checkpoint running? %s", + txn_global->checkpoint_running ? "yes" : "no")); + WT_RET(__wt_msg(session, + "checkpoint generation: %" PRIu64, txn_global->checkpoint_gen)); + WT_RET(__wt_msg(session, + "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_pinned)); + WT_RET(__wt_msg(session, + "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txnid)); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); + + WT_RET(__wt_msg(session, "Transaction state of active sessions:")); + + /* + * Walk each session transaction state and dump information. Accessing + * the content of session handles is not thread safe, so some + * information may change while traversing if other threads are active + * at the same time, which is OK since this is diagnostic code. + */ + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* Skip sessions with no active transaction */ + if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) + continue; + + txn = &conn->sessions[i].txn; + iso_tag = "INVALID"; + switch (txn->isolation) { + case WT_ISO_READ_COMMITTED: + iso_tag = "WT_ISO_READ_COMMITTED"; + break; + case WT_ISO_READ_UNCOMMITTED: + iso_tag = "WT_ISO_READ_UNCOMMITTED"; + break; + case WT_ISO_SNAPSHOT: + iso_tag = "WT_ISO_SNAPSHOT"; + break; + } + + WT_RET(__wt_msg(session, + "ID: %6" PRIu64 + ", mod count: %u" + ", pinned ID: %" PRIu64 + ", snap min: %" PRIu64 + ", snap max: %" PRIu64 + ", metadata pinned ID: %" PRIu64 + ", flags: 0x%08" PRIx32 + ", name: %s" + ", isolation: %s", + id, + txn->mod_count, + s->pinned_id, + txn->snap_min, + txn->snap_max, + s->metadata_pinned, + txn->flags, + conn->sessions[i].name == NULL ? + "EMPTY" : conn->sessions[i].name, + iso_tag)); + } + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + + return (0); +} +#endif diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 399d9187d82..3261c8089f4 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -525,6 +525,17 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, } /* + * __checkpoint_fail_reset -- + * Reset fields when a failure occurs. + */ +static void +__checkpoint_fail_reset(WT_SESSION_IMPL *session) +{ + S2BT(session)->modified = true; + S2BT(session)->ckpt = NULL; +} + +/* * __txn_checkpoint -- * Checkpoint a database or a list of objects in the database. */ @@ -543,7 +554,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) void *saved_meta_next; u_int i; uint64_t fsync_duration_usecs; - bool full, idle, logging, tracking; + bool failed, full, idle, logging, tracking; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; @@ -639,10 +650,9 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __checkpoint_apply_all( - session, cfg, __wt_checkpoint_get_handles, NULL)))); + WT_WITH_TABLE_READ_LOCK(session, + ret = __checkpoint_apply_all( + session, cfg, __wt_checkpoint_get_handles, NULL))); WT_ERR(ret); /* @@ -679,7 +689,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * This allows ordinary visibility checks to move forward because * checkpoints often take a long time and only write to the metadata. */ - __wt_writelock(session, txn_global->scan_rwlock); + __wt_writelock(session, &txn_global->scan_rwlock); txn_global->checkpoint_txnid = txn->id; txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min); @@ -700,7 +710,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE; - __wt_writeunlock(session, txn_global->scan_rwlock); + __wt_writeunlock(session, &txn_global->scan_rwlock); /* * Unblock updates -- we can figure out that any updates to clean pages @@ -825,12 +835,13 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. */ - if (ret != 0 && !conn->modified) + failed = ret != 0; + if (failed) conn->modified = true; session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) - WT_TRET(__wt_meta_track_off(session, false, ret != 0)); + WT_TRET(__wt_meta_track_off(session, false, failed)); cache->eviction_scrub_limit = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); @@ -863,6 +874,13 @@ err: /* for (i = 0; i < session->ckpt_handle_next; ++i) { if (session->ckpt_handle[i] == NULL) continue; + /* + * If the operation failed, mark all trees dirty so they are + * included if a future checkpoint can succeed. + */ + if (failed) + WT_WITH_DHANDLE(session, session->ckpt_handle[i], + __checkpoint_fail_reset(session)); WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); } @@ -1159,7 +1177,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, * Hold the lock until we're done (blocking hot backups from starting), * we don't want to race with a future hot backup. */ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); hot_backup_locked = true; if (conn->hot_backup) WT_CKPT_FOREACH(ckptbase, ckpt) { @@ -1233,7 +1251,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, WT_ASSERT(session, !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); WT_ASSERT(session, btree->ckpt == NULL); btree->ckpt = ckptbase; @@ -1241,7 +1259,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, return (0); err: if (hot_backup_locked) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); __wt_meta_ckptlist_free(session, ckptbase); __wt_free(session, name_alloc); @@ -1341,7 +1359,6 @@ __checkpoint_tree( WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_LSN ckptlsn; - int was_modified; bool fake_ckpt; WT_UNUSED(cfg); @@ -1352,7 +1369,6 @@ __checkpoint_tree( conn = S2C(session); dhandle = session->dhandle; fake_ckpt = false; - was_modified = btree->modified; /* * Set the checkpoint LSN to the maximum LSN so that if logging is @@ -1483,10 +1499,9 @@ err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ - if (ret != 0 && !btree->modified && was_modified) { + if (ret != 0) { btree->modified = true; - if (!S2C(session)->modified) - S2C(session)->modified = true; + S2C(session)->modified = true; } __wt_meta_ckptlist_free(session, ckptbase); diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index 5f4704b40c4..2931dc1ce82 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -269,7 +269,7 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, WT_ITEM ckpt_snapshot_unused; uint32_t ckpt_file, ckpt_offset; u_int ckpt_nsnapshot_unused; - const char *fmt = WT_UNCHECKED_STRING(IIIU); + const char *fmt = WT_UNCHECKED_STRING(IIIu); if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &ckpt_file, &ckpt_offset, @@ -297,7 +297,7 @@ __wt_txn_checkpoint_log( uint8_t *end, *p; size_t recsize; uint32_t i, rectype = WT_LOGREC_CHECKPOINT; - const char *fmt = WT_UNCHECKED_STRING(IIIIU); + const char *fmt = WT_UNCHECKED_STRING(IIIIu); txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; @@ -368,14 +368,16 @@ __wt_txn_checkpoint_log( /* * If this full checkpoint completed successfully and there is - * no hot backup in progress, tell the logging subsystem the - * checkpoint LSN so that it can archive. Do not update the - * logging checkpoint LSN if this is during a clean connection - * close, only during a full checkpoint. A clean close may not - * update any metadata LSN and we do not want to archive in - * that case. + * no hot backup in progress and this is not recovery, tell + * the logging subsystem the checkpoint LSN so that it can + * archive. Do not update the logging checkpoint LSN if this + * is during a clean connection close, only during a full + * checkpoint. A clean close may not update any metadata LSN + * and we do not want to archive in that case. */ - if (!S2C(session)->hot_backup && txn->full_ckpt) + if (!S2C(session)->hot_backup && + !F_ISSET(S2C(session), WT_CONN_RECOVERING) && + txn->full_ckpt) __wt_log_ckpt(session, ckpt_lsn); /* FALLTHROUGH */ diff --git a/src/third_party/wiredtiger/src/txn/txn_nsnap.c b/src/third_party/wiredtiger/src/txn/txn_nsnap.c index 65ec1a6662f..659570dbcd9 100644 --- a/src/third_party/wiredtiger/src/txn/txn_nsnap.c +++ b/src/third_party/wiredtiger/src/txn/txn_nsnap.c @@ -211,9 +211,9 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) if (TAILQ_EMPTY(&txn_global->nsnaph)) { WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE && !__wt_txn_visible_all(session, nsnap_new->pinned_id)); - __wt_readlock(session, txn_global->scan_rwlock); + __wt_readlock(session, &txn_global->scan_rwlock); txn_global->nsnap_oldest_id = nsnap_new->pinned_id; - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); } TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); WT_STAT_CONN_INCR(session, txn_snapshots_created); @@ -297,16 +297,16 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) if (session->ncursors > 0) WT_RET(__wt_session_copy_values(session)); - __wt_readlock(session, txn_global->nsnap_rwlock); + __wt_readlock(session, &txn_global->nsnap_rwlock); TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { /* * Acquire the scan lock so the oldest ID can't move * forward without seeing our pinned ID. */ - __wt_readlock(session, txn_global->scan_rwlock); + __wt_readlock(session, &txn_global->scan_rwlock); txn_state->pinned_id = nsnap->pinned_id; - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); WT_ASSERT(session, !__wt_txn_visible_all( session, txn_state->pinned_id) && @@ -327,7 +327,7 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) F_SET(txn, WT_TXN_HAS_SNAPSHOT); break; } - __wt_readunlock(session, txn_global->nsnap_rwlock); + __wt_readunlock(session, &txn_global->nsnap_rwlock); if (nsnap == NULL) WT_RET_MSG(session, EINVAL, diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index a6390dcbd06..2d8a77a69e6 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -501,7 +501,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = false; - __wt_verbose(session, WT_VERB_RECOVERY, + __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, "Main recovery loop: starting at %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); diff --git a/src/third_party/wiredtiger/src/utilities/util.h b/src/third_party/wiredtiger/src/utilities/util.h index 2658d877b63..93a96d44219 100644 --- a/src/third_party/wiredtiger/src/utilities/util.h +++ b/src/third_party/wiredtiger/src/utilities/util.h @@ -40,7 +40,6 @@ int util_flush(WT_SESSION *, const char *); int util_list(WT_SESSION *, int, char *[]); int util_load(WT_SESSION *, int, char *[]); int util_loadtext(WT_SESSION *, int, char *[]); -char *util_name(WT_SESSION *, const char *, const char *); int util_printlog(WT_SESSION *, int, char *[]); int util_read(WT_SESSION *, int, char *[]); int util_read_line(WT_SESSION *, ULINE *, bool, bool *); @@ -49,6 +48,8 @@ int util_rename(WT_SESSION *, int, char *[]); int util_salvage(WT_SESSION *, int, char *[]); int util_stat(WT_SESSION *, int, char *[]); int util_str2recno(WT_SESSION *, const char *p, uint64_t *recnop); +int util_truncate(WT_SESSION *, int, char *[]); int util_upgrade(WT_SESSION *, int, char *[]); +char *util_uri(WT_SESSION *, const char *, const char *); int util_verify(WT_SESSION *, int, char *[]); int util_write(WT_SESSION *, int, char *[]); diff --git a/src/third_party/wiredtiger/src/utilities/util_alter.c b/src/third_party/wiredtiger/src/utilities/util_alter.c index d228c15cd48..ef01a1ed826 100644 --- a/src/third_party/wiredtiger/src/utilities/util_alter.c +++ b/src/third_party/wiredtiger/src/utilities/util_alter.c @@ -34,9 +34,12 @@ util_alter(WT_SESSION *session, int argc, char *argv[]) for (configp = argv; configp != NULL && *configp != NULL; configp += 2) if ((ret = session->alter( - session, configp[0], configp[1])) != 0) - break; - return (ret); + session, configp[0], configp[1])) != 0) { + (void)util_err(session, ret, + "session.alter: %s, %s", configp[0], configp[1]); + return (1); + } + return (0); } static int diff --git a/src/third_party/wiredtiger/src/utilities/util_compact.c b/src/third_party/wiredtiger/src/utilities/util_compact.c index c114eb207fa..e469b4dce6e 100644 --- a/src/third_party/wiredtiger/src/utilities/util_compact.c +++ b/src/third_party/wiredtiger/src/utilities/util_compact.c @@ -30,21 +30,13 @@ util_compact(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->compact(session, uri, NULL)) != 0) { - fprintf(stderr, "%s: compact(%s): %s\n", - progname, uri, session->strerror(session, ret)); - goto err; - } - - if (0) { -err: ret = 1; - } + if ((ret = session->compact(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.compact: %s", uri); free(uri); - return (ret); } diff --git a/src/third_party/wiredtiger/src/utilities/util_create.c b/src/third_party/wiredtiger/src/utilities/util_create.c index 4e609736f2d..7c22a67792b 100644 --- a/src/third_party/wiredtiger/src/utilities/util_create.c +++ b/src/third_party/wiredtiger/src/utilities/util_create.c @@ -15,9 +15,9 @@ util_create(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - const char *config, *uri; + char *config, *uri; - config = NULL; + config = uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF) switch (ch) { case 'c': /* command-line configuration */ @@ -35,12 +35,14 @@ util_create(WT_SESSION *session, int argc, char *argv[]) if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); if ((ret = session->create(session, uri, config)) != 0) - return (util_err(session, ret, "%s: session.create", uri)); - return (0); + (void)util_err(session, ret, "session.create: %s", uri); + + free(uri); + return (ret); } static int diff --git a/src/third_party/wiredtiger/src/utilities/util_drop.c b/src/third_party/wiredtiger/src/utilities/util_drop.c index ba41445dfb6..456005d445d 100644 --- a/src/third_party/wiredtiger/src/utilities/util_drop.c +++ b/src/third_party/wiredtiger/src/utilities/util_drop.c @@ -15,8 +15,9 @@ util_drop(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,12 +31,13 @@ util_drop(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - ret = session->drop(session, name, "force"); + if ((ret = session->drop(session, uri, "force")) != 0) + (void)util_err(session, ret, "session.drop: %s", uri); - free(name); + free(uri); return (ret); } diff --git a/src/third_party/wiredtiger/src/utilities/util_dump.c b/src/third_party/wiredtiger/src/utilities/util_dump.c index 7dde13ee837..cded40a8b45 100644 --- a/src/third_party/wiredtiger/src/utilities/util_dump.c +++ b/src/third_party/wiredtiger/src/utilities/util_dump.c @@ -6,10 +6,14 @@ * See the file LICENSE for redistribution information. */ +#include <assert.h> #include "util.h" #include "util_dump.h" -static int dump_config(WT_SESSION *, const char *, bool, bool); +#define STRING_MATCH_CONFIG(s, item) \ + (strncmp(s, (item).str, (item).len) == 0 && (s)[(item).len] == '\0') + +static int dump_config(WT_SESSION *, const char *, WT_CURSOR *, bool, bool); static int dump_json_begin(WT_SESSION *); static int dump_json_end(WT_SESSION *); static int dump_json_separator(WT_SESSION *); @@ -17,7 +21,8 @@ static int dump_json_table_end(WT_SESSION *); static int dump_prefix(WT_SESSION *, bool, bool); static int dump_record(WT_CURSOR *, bool, bool); static int dump_suffix(WT_SESSION *, bool); -static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *, bool); +static int dump_table_config( + WT_SESSION *, WT_CURSOR *, WT_CURSOR *, const char *, bool); static int dump_table_parts_config( WT_SESSION *, WT_CURSOR *, const char *, const char *, bool); static int dup_json_string(const char *, char **); @@ -32,10 +37,11 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) size_t len; int ch, i; bool hex, json, reverse; - char *checkpoint, *config, *name; + char *checkpoint, *config, *p, *simpleuri, *uri; hex = json = reverse = false; - checkpoint = config = name = NULL; + checkpoint = config = simpleuri = uri = NULL; + cursor = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF) switch (ch) { case 'c': @@ -75,21 +81,19 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) return (usage()); if (json && - ((ret = dump_json_begin(session)) != 0 || - (ret = dump_prefix(session, hex, json)) != 0)) + (dump_json_begin(session) != 0 || + dump_prefix(session, hex, json) != 0)) goto err; for (i = 0; i < argc; i++) { if (json && i > 0) - if ((ret = dump_json_separator(session)) != 0) + if (dump_json_separator(session) != 0) goto err; - free(name); - name = NULL; - - if ((name = util_name(session, argv[i], "table")) == NULL) - goto err; + free(uri); + free(simpleuri); + uri = simpleuri = NULL; - if (dump_config(session, name, hex, json) != 0) + if ((uri = util_uri(session, argv[i], "table")) == NULL) goto err; len = @@ -109,18 +113,34 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) (void)strcat(config, json ? "dump=json" : (hex ? "dump=hex" : "dump=print")); if ((ret = session->open_cursor( - session, name, NULL, config, &cursor)) != 0) { + session, uri, NULL, config, &cursor)) != 0) { fprintf(stderr, "%s: cursor open(%s) failed: %s\n", - progname, name, session->strerror(session, ret)); + progname, uri, session->strerror(session, ret)); + goto err; + } + + if ((simpleuri = strdup(uri)) == NULL) { + (void)util_err(session, errno, NULL); goto err; } + if ((p = strchr(simpleuri, '(')) != NULL) + *p = '\0'; + if (dump_config(session, simpleuri, cursor, hex, json) != 0) + goto err; - if ((ret = dump_record(cursor, reverse, json)) != 0) + if (dump_record(cursor, reverse, json) != 0) goto err; - if (json && (ret = dump_json_table_end(session)) != 0) + if (json && dump_json_table_end(session) != 0) goto err; + + ret = cursor->close(cursor); + cursor = NULL; + if (ret != 0) { + (void)util_err(session, ret, NULL); + goto err; + } } - if (json && ((ret = dump_json_end(session)) != 0)) + if (json && dump_json_end(session) != 0) goto err; if (0) { @@ -128,8 +148,12 @@ err: ret = 1; } free(config); - free(name); - + free(uri); + free(simpleuri); + if (cursor != NULL && (ret = cursor->close(cursor)) != 0) { + (void)util_err(session, ret, NULL); + ret = 1; + } return (ret); } @@ -138,15 +162,16 @@ err: ret = 1; * Dump the config for the uri. */ static int -dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) +dump_config(WT_SESSION *session, const char *uri, WT_CURSOR *cursor, bool hex, + bool json) { - WT_CURSOR *cursor; + WT_CURSOR *mcursor; WT_DECL_RET; int tret; /* Open a metadata cursor. */ if ((ret = session->open_cursor( - session, "metadata:create", NULL, NULL, &cursor)) != 0) { + session, "metadata:create", NULL, NULL, &mcursor)) != 0) { fprintf(stderr, "%s: %s: session.open_cursor: %s\n", progname, "metadata:create", session->strerror(session, ret)); return (1); @@ -156,10 +181,11 @@ dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) * want to output a header if the user entered the wrong name. This is * where we find out a table doesn't exist, use a simple error message. */ - cursor->set_key(cursor, uri); - if ((ret = cursor->search(cursor)) == 0) { + mcursor->set_key(mcursor, uri); + if ((ret = mcursor->search(mcursor)) == 0) { if ((!json && dump_prefix(session, hex, json) != 0) || - dump_table_config(session, cursor, uri, json) != 0 || + dump_table_config(session, mcursor, cursor, + uri, json) != 0 || dump_suffix(session, json) != 0) ret = 1; } else if (ret == WT_NOTFOUND) @@ -167,8 +193,8 @@ dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) else ret = util_err(session, ret, "%s", uri); - if ((tret = cursor->close(cursor)) != 0) { - tret = util_cerr(cursor, "close", tret); + if ((tret = mcursor->close(mcursor)) != 0) { + tret = util_cerr(mcursor, "close", tret); if (ret == 0) ret = tret; } @@ -225,16 +251,125 @@ dump_json_table_end(WT_SESSION *session) } /* + * dump_add_config + * Add a formatted config string to an output buffer. + */ +static int +dump_add_config(WT_SESSION *session, char **bufp, size_t *leftp, + const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5))) +{ + int n; + va_list ap; + + va_start(ap, fmt); + n = vsnprintf(*bufp, *leftp, fmt, ap); + va_end(ap); + if (n < 0) + return (util_err(session, EINVAL, NULL)); + *bufp += n; + *leftp -= (size_t)n; + return (0); +} + +/* + * dump_projection -- + * Create a new config containing projection information. + */ +static int +dump_projection(WT_SESSION *session, const char *config, WT_CURSOR *cursor, + char **newconfigp) +{ + WT_DECL_RET; + WT_CONFIG_ITEM key, value; + WT_CONFIG_PARSER *parser; + WT_EXTENSION_API *wt_api; + size_t len, vallen; + int nkeys; + char *newconfig; + const char *keyformat, *p; + + len = strlen(config) + strlen(cursor->value_format) + + strlen(cursor->uri) + 20; + if ((newconfig = malloc(len)) == NULL) + return util_err(session, errno, NULL); + *newconfigp = newconfig; + wt_api = session->connection->get_extension_api(session->connection); + if ((ret = wt_api->config_parser_open(wt_api, session, config, + strlen(config), &parser)) != 0) + return (util_err( + session, ret, "WT_EXTENSION_API.config_parser_open")); + keyformat = cursor->key_format; + for (nkeys = 0; *keyformat; keyformat++) + if (!__wt_isdigit((u_char)*keyformat)) + nkeys++; + + /* + * Copy the configuration, replacing some fields to match the + * projection. + */ + while ((ret = parser->next(parser, &key, &value)) == 0) { + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s=", (int)key.len, key.str)); + if (STRING_MATCH_CONFIG("value_format", key)) + WT_RET(dump_add_config(session, &newconfig, &len, + "%s", cursor->value_format)); + else if (STRING_MATCH_CONFIG("columns", key)) { + /* copy names of keys */ + p = value.str; + vallen = value.len; + while (vallen > 0) { + if ((*p == ',' || *p == ')') && --nkeys == 0) + break; + p++; + vallen--; + } + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s", (int)(p - value.str), value.str)); + + /* copy names of projected values */ + p = strchr(cursor->uri, '('); + assert(p != NULL); + assert(p[strlen(p) - 1] == ')'); + p++; + if (*p != ')') + WT_RET(dump_add_config(session, &newconfig, + &len, "%s", ",")); + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s),", (int)(strlen(p) - 1), p)); + } else if (value.type == WT_CONFIG_ITEM_STRING && + value.len != 0) + WT_RET(dump_add_config(session, &newconfig, &len, + "\"%.*s\",", (int)value.len, value.str)); + else + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s,", (int)value.len, value.str)); + } + if (ret != WT_NOTFOUND) + return (util_err(session, ret, "WT_CONFIG_PARSER.next")); + + assert(len > 0); + if ((ret = parser->close(parser)) != 0) + return (util_err( + session, ret, "WT_CONFIG_PARSER.close")); + + return (0); +} + +/* * dump_table_config -- * Dump the config for a table. */ static int dump_table_config( - WT_SESSION *session, WT_CURSOR *cursor, const char *uri, bool json) + WT_SESSION *session, WT_CURSOR *mcursor, WT_CURSOR *cursor, + const char *uri, bool json) { WT_DECL_RET; + char *proj_config; const char *name, *v; + proj_config = NULL; /* Get the table name. */ if ((name = strchr(uri, ':')) == NULL) { fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri); @@ -246,20 +381,25 @@ dump_table_config( * Dump out the config information: first, dump the uri entry itself, * it overrides all subsequent configurations. */ - cursor->set_key(cursor, uri); - if ((ret = cursor->search(cursor)) != 0) - return (util_cerr(cursor, "search", ret)); - if ((ret = cursor->get_value(cursor, &v)) != 0) - return (util_cerr(cursor, "get_value", ret)); - - WT_RET(print_config(session, uri, v, json, true)); + mcursor->set_key(mcursor, uri); + if ((ret = mcursor->search(mcursor)) != 0) + return (util_cerr(mcursor, "search", ret)); + if ((ret = mcursor->get_value(mcursor, &v)) != 0) + return (util_cerr(mcursor, "get_value", ret)); + + if (strchr(cursor->uri, '(') != NULL) { + WT_ERR(dump_projection(session, v, cursor, &proj_config)); + v = proj_config; + } + WT_ERR(print_config(session, uri, v, json, true)); - WT_RET(dump_table_parts_config( - session, cursor, name, "colgroup:", json)); - WT_RET(dump_table_parts_config( - session, cursor, name, "index:", json)); + WT_ERR(dump_table_parts_config( + session, mcursor, name, "colgroup:", json)); + WT_ERR(dump_table_parts_config( + session, mcursor, name, "index:", json)); - return (0); +err: free(proj_config); + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c index e91dbfce05b..f19ba4d1f97 100644 --- a/src/third_party/wiredtiger/src/utilities/util_list.c +++ b/src/third_party/wiredtiger/src/utilities/util_list.c @@ -19,10 +19,10 @@ util_list(WT_SESSION *session, int argc, char *argv[]) WT_DECL_RET; int ch; bool cflag, vflag; - char *name; + char *uri; cflag = vflag = false; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF) switch (ch) { case 'c': @@ -42,17 +42,16 @@ util_list(WT_SESSION *session, int argc, char *argv[]) case 0: break; case 1: - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); break; default: return (usage()); } - ret = list_print(session, name, cflag, vflag); - - free(name); + ret = list_print(session, uri, cflag, vflag); + free(uri); return (ret); } @@ -99,7 +98,7 @@ list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize) * List the high-level objects in the database. */ static int -list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) +list_print(WT_SESSION *session, const char *uri, bool cflag, bool vflag) { WT_CURSOR *cursor; WT_DECL_RET; @@ -120,7 +119,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) ret, "%s: WT_SESSION.open_cursor", WT_METADATA_URI)); } - found = name == NULL; + found = uri == NULL; while ((ret = cursor->next(cursor)) == 0) { /* Get the key. */ if ((ret = cursor->get_key(cursor, &key)) != 0) @@ -129,8 +128,8 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) /* * If a name is specified, only show objects that match. */ - if (name != NULL) { - if (!WT_PREFIX_MATCH(key, name)) + if (uri != NULL) { + if (!WT_PREFIX_MATCH(key, uri)) continue; found = true; } @@ -161,7 +160,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) if (ret != WT_NOTFOUND) return (util_cerr(cursor, "next", ret)); if (!found) { - fprintf(stderr, "%s: %s: not found\n", progname, name); + fprintf(stderr, "%s: %s: not found\n", progname, uri); return (1); } diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c index ac18df80851..ca77643eb49 100644 --- a/src/third_party/wiredtiger/src/utilities/util_load.c +++ b/src/third_party/wiredtiger/src/utilities/util_load.c @@ -126,7 +126,7 @@ load_dump(WT_SESSION *session) append ? ",append" : "", no_overwrite ? ",overwrite=false" : ""); if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { - ret = util_err(session, ret, "%s: session.open", uri); + ret = util_err(session, ret, "%s: session.open_cursor", uri); goto err; } diff --git a/src/third_party/wiredtiger/src/utilities/util_load_json.c b/src/third_party/wiredtiger/src/utilities/util_load_json.c index 020a4ed9ba9..1189d49a483 100644 --- a/src/third_party/wiredtiger/src/utilities/util_load_json.c +++ b/src/third_party/wiredtiger/src/utilities/util_load_json.c @@ -242,7 +242,7 @@ json_data(WT_SESSION *session, LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : ""); if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { - ret = util_err(session, ret, "%s: session.open", uri); + ret = util_err(session, ret, "%s: session.open_cursor", uri); goto err; } keyformat = cursor->key_format; diff --git a/src/third_party/wiredtiger/src/utilities/util_loadtext.c b/src/third_party/wiredtiger/src/utilities/util_loadtext.c index f9c5b6e9a1f..7602d43f8c9 100644 --- a/src/third_party/wiredtiger/src/utilities/util_loadtext.c +++ b/src/third_party/wiredtiger/src/utilities/util_loadtext.c @@ -15,9 +15,11 @@ static int usage(void); int util_loadtext(WT_SESSION *session, int argc, char *argv[]) { + WT_DECL_RET; int ch; - const char *uri; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF) switch (ch) { case 'f': /* input file */ @@ -35,10 +37,13 @@ util_loadtext(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. */ if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - return (text(session, uri)); + ret = text(session, uri); + + free(uri); + return (ret); } /* @@ -61,7 +66,7 @@ text(WT_SESSION *session, const char *uri) */ if ((ret = session->open_cursor( session, uri, NULL, "append,overwrite", &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + return (util_err(session, ret, "%s: session.open_cursor", uri)); /* * We're about to load strings, make sure the formats match. diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c index 1da56adf137..7157f0d90fe 100644 --- a/src/third_party/wiredtiger/src/utilities/util_main.c +++ b/src/third_party/wiredtiger/src/utilities/util_main.c @@ -175,6 +175,10 @@ main(int argc, char *argv[]) config = "statistics=(all)"; } break; + case 't' : + if (strcmp(command, "truncate") == 0) + func = util_truncate; + break; case 'u': if (strcmp(command, "upgrade") == 0) func = util_upgrade; @@ -272,6 +276,7 @@ usage(void) "\t" "rename\t rename an object\n" "\t" "salvage\t salvage a file\n" "\t" "stat\t display statistics for an object\n" + "\t" "truncate truncate an object, removing all content\n" "\t" "upgrade\t upgrade an object\n" "\t" "verify\t verify an object\n" "\t" "write\t write values to an object\n"); @@ -280,11 +285,11 @@ usage(void) } /* - * util_name -- + * util_uri -- * Build a name. */ char * -util_name(WT_SESSION *session, const char *s, const char *type) +util_uri(WT_SESSION *session, const char *s, const char *type) { size_t len; char *name; diff --git a/src/third_party/wiredtiger/src/utilities/util_printlog.c b/src/third_party/wiredtiger/src/utilities/util_printlog.c index e7fa2134934..5f3ed43905b 100644 --- a/src/third_party/wiredtiger/src/utilities/util_printlog.c +++ b/src/third_party/wiredtiger/src/utilities/util_printlog.c @@ -14,8 +14,8 @@ int util_printlog(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; - int ch; uint32_t flags; + int ch; flags = 0; while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF) @@ -41,17 +41,9 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) if (argc != 0) return (usage()); - ret = __wt_txn_printlog(session, flags); - - if (ret != 0) { - fprintf(stderr, "%s: printlog failed: %s\n", - progname, session->strerror(session, ret)); - goto err; - } + if ((ret = __wt_txn_printlog(session, flags)) != 0) + (void)util_err(session, ret, "printlog"); - if (0) { -err: ret = 1; - } return (ret); } diff --git a/src/third_party/wiredtiger/src/utilities/util_read.c b/src/third_party/wiredtiger/src/utilities/util_read.c index 2e766377aa9..393949b6a1c 100644 --- a/src/third_party/wiredtiger/src/utilities/util_read.c +++ b/src/third_party/wiredtiger/src/utilities/util_read.c @@ -18,8 +18,9 @@ util_read(WT_SESSION *session, int argc, char *argv[]) uint64_t recno; int ch; bool rkey, rval; - const char *uri, *value; + char *uri, *value; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -32,13 +33,19 @@ util_read(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are a uri followed by a list of keys. */ if (argc < 2) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - /* Open the object. */ - if ((ret = session->open_cursor( - session, uri, NULL, NULL, &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + /* + * Open the object; free allocated memory immediately to simplify + * future error handling. + */ + if ((ret = + session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) + (void)util_err(session, ret, "%s: session.open_cursor", uri); + free(uri); + if (ret != 0) + return (ret); /* * A simple search only makes sense if the key format is a string or a diff --git a/src/third_party/wiredtiger/src/utilities/util_rebalance.c b/src/third_party/wiredtiger/src/utilities/util_rebalance.c index 45f161487e5..c188ea17d22 100644 --- a/src/third_party/wiredtiger/src/utilities/util_rebalance.c +++ b/src/third_party/wiredtiger/src/utilities/util_rebalance.c @@ -15,9 +15,9 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,25 +30,21 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->rebalance(session, name, NULL)) != 0) { - fprintf(stderr, "%s: rebalance(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->rebalance(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.rebalance: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/third_party/wiredtiger/src/utilities/util_rename.c b/src/third_party/wiredtiger/src/utilities/util_rename.c index aee299c6e63..bb2d40cd103 100644 --- a/src/third_party/wiredtiger/src/utilities/util_rename.c +++ b/src/third_party/wiredtiger/src/utilities/util_rename.c @@ -30,22 +30,15 @@ util_rename(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are the object uri and new name. */ if (argc != 2) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); newuri = argv[1]; - if ((ret = session->rename(session, uri, newuri, NULL)) != 0) { - fprintf(stderr, "%s: rename %s to %s: %s\n", - progname, uri, newuri, session->strerror(session, ret)); - goto err; - } - - if (0) { -err: ret = 1; - } + if ((ret = session->rename(session, uri, newuri, NULL)) != 0) + (void)util_err( + session, ret, "session.rename: %s, %s", uri, newuri); free(uri); - return (ret); } diff --git a/src/third_party/wiredtiger/src/utilities/util_salvage.c b/src/third_party/wiredtiger/src/utilities/util_salvage.c index 679d1074457..6cc2278b846 100644 --- a/src/third_party/wiredtiger/src/utilities/util_salvage.c +++ b/src/third_party/wiredtiger/src/utilities/util_salvage.c @@ -16,10 +16,10 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) WT_DECL_RET; int ch; const char *force; - char *name; + char *uri; force = NULL; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF) switch (ch) { case 'F': @@ -35,25 +35,21 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the file name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "file")) == NULL) + if ((uri = util_uri(session, *argv, "file")) == NULL) return (1); - if ((ret = session->salvage(session, name, force)) != 0) { - fprintf(stderr, "%s: salvage(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->salvage(session, uri, force)) != 0) + (void)util_err(session, ret, "session.salvage: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/third_party/wiredtiger/src/utilities/util_stat.c b/src/third_party/wiredtiger/src/utilities/util_stat.c index 4376f559ceb..1b75d9ea8bf 100644 --- a/src/third_party/wiredtiger/src/utilities/util_stat.c +++ b/src/third_party/wiredtiger/src/utilities/util_stat.c @@ -55,7 +55,7 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) objname = (char *)""; break; case 1: - if ((objname = util_name(session, *argv, "table")) == NULL) + if ((objname = util_uri(session, *argv, "table")) == NULL) return (1); objname_free = true; break; @@ -82,8 +82,8 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) (ret = cursor->next(cursor)) == 0 && (ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0) if (printf("%s=%s\n", desc, pval) < 0) { - ret = errno; - break; + (void)util_err(session, errno, "printf"); + goto err; } if (ret == WT_NOTFOUND) ret = 0; diff --git a/src/third_party/wiredtiger/src/utilities/util_truncate.c b/src/third_party/wiredtiger/src/utilities/util_truncate.c new file mode 100644 index 00000000000..35de02345c8 --- /dev/null +++ b/src/third_party/wiredtiger/src/utilities/util_truncate.c @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_truncate(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *uri; + + uri = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the uri. */ + if (argc != 1) + return (usage()); + if ((uri = util_uri(session, *argv, "table")) == NULL) + return (1); + + if ((ret = session->truncate(session, uri, NULL, NULL, NULL)) != 0) + (void)util_err(session, ret, "session.truncate: %s", uri); + + free(uri); + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "truncate uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/third_party/wiredtiger/src/utilities/util_upgrade.c b/src/third_party/wiredtiger/src/utilities/util_upgrade.c index 63b23f28c16..f89bd46e133 100644 --- a/src/third_party/wiredtiger/src/utilities/util_upgrade.c +++ b/src/third_party/wiredtiger/src/utilities/util_upgrade.c @@ -15,9 +15,9 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,25 +30,21 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->upgrade(session, name, NULL)) != 0) { - fprintf(stderr, "%s: upgrade(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->upgrade(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.upgrade: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/third_party/wiredtiger/src/utilities/util_verify.c b/src/third_party/wiredtiger/src/utilities/util_verify.c index 82bdd780cd3..d0587fcfc8c 100644 --- a/src/third_party/wiredtiger/src/utilities/util_verify.c +++ b/src/third_party/wiredtiger/src/utilities/util_verify.c @@ -17,10 +17,10 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) size_t size; int ch; bool dump_address, dump_blocks, dump_layout, dump_pages; - char *config, *dump_offsets, *name; + char *config, *dump_offsets, *uri; dump_address = dump_blocks = dump_layout = dump_pages = false; - config = dump_offsets = name = NULL; + config = dump_offsets = uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "d:")) != EOF) switch (ch) { case 'd': @@ -55,7 +55,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); /* Build the configuration string as necessary. */ @@ -69,7 +69,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) strlen("dump_offsets[],") + (dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20; if ((config = malloc(size)) == NULL) { - (void)util_err(session, errno, NULL); + ret = util_err(session, errno, NULL); goto err; } snprintf(config, size, @@ -82,23 +82,19 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) dump_offsets != NULL ? "]," : "", dump_pages ? "dump_pages," : ""); } - if ((ret = session->verify(session, name, config)) != 0) { - fprintf(stderr, "%s: verify(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->verify(session, uri, config)) != 0) + (void)util_err(session, ret, "session.verify: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(config); - free(name); - +err: free(config); + free(uri); return (ret); } diff --git a/src/third_party/wiredtiger/src/utilities/util_write.c b/src/third_party/wiredtiger/src/utilities/util_write.c index 7d9bce02b36..b931fad064d 100644 --- a/src/third_party/wiredtiger/src/utilities/util_write.c +++ b/src/third_party/wiredtiger/src/utilities/util_write.c @@ -18,10 +18,10 @@ util_write(WT_SESSION *session, int argc, char *argv[]) uint64_t recno; int ch; bool append, overwrite, rkey; - const char *uri; - char config[100]; + char *uri, config[100]; append = overwrite = false; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF) switch (ch) { case 'a': @@ -47,15 +47,21 @@ util_write(WT_SESSION *session, int argc, char *argv[]) } else if (argc < 3 || ((argc - 1) % 2 != 0)) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - /* Open the object. */ + /* + * Open the object; free allocated memory immediately to simplify + * future error handling. + */ (void)snprintf(config, sizeof(config), "%s,%s", append ? "append=true" : "", overwrite ? "overwrite=true" : ""); - if ((ret = session->open_cursor( - session, uri, NULL, config, &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + if ((ret = + session->open_cursor(session, uri, NULL, config, &cursor)) != 0) + (void)util_err(session, ret, "%s: session.open_cursor", uri); + free(uri); + if (ret != 0) + return (ret); /* * A simple search only makes sense if the key format is a string or a diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am index a96492c1e71..e2b72532703 100644 --- a/src/third_party/wiredtiger/test/csuite/Makefile.am +++ b/src/third_party/wiredtiger/test/csuite/Makefile.am @@ -37,9 +37,21 @@ noinst_PROGRAMS += test_wt2834_join_bloom_fix test_wt2853_perf_SOURCES = wt2853_perf/main.c noinst_PROGRAMS += test_wt2853_perf +test_wt2909_checkpoint_integrity_SOURCES = wt2909_checkpoint_integrity/main.c +noinst_PROGRAMS += test_wt2909_checkpoint_integrity + test_wt2999_join_extractor_SOURCES = wt2999_join_extractor/main.c noinst_PROGRAMS += test_wt2999_join_extractor +test_wt3120_filesys_SOURCES = wt3120_filesys/main.c +noinst_PROGRAMS += test_wt3120_filesys + +test_wt3135_search_near_collator_SOURCES = wt3135_search_near_collator/main.c +noinst_PROGRAMS += test_wt3135_search_near_collator + +test_wt3184_dup_index_collator_SOURCES = wt3184_dup_index_collator/main.c +noinst_PROGRAMS += test_wt3184_dup_index_collator + # Run this during a "make check" smoke test. TESTS = $(noinst_PROGRAMS) LOG_COMPILER = $(TEST_WRAPPER) diff --git a/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c b/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c new file mode 100644 index 00000000000..ddf249fb406 --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c @@ -0,0 +1,666 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/wait.h> + +/* + * JIRA ticket reference: WT-2909 + * Test case description: + * + * This test attempts to check the integrity of checkpoints by injecting + * failures (by means of a custom file system) and then trying to recover. To + * insulate the top level program from various crashes that may occur when + * injecting failures, the "populate" code runs in another process, and is + * expected to sometimes fail. Then the top level program runs recovery (with + * the normal file system) and checks the results. Any failure at the top level + * indicates a checkpoint integrity problem. + * + * Each subtest uses the same kind of schema and data, the only variance is + * when the faults are injected. At the moment, this test only injects during + * checkpoints, and only injects write failures. It varies in the number of + * successful writes that occur before an injected failure (during a checkpoint + * operation), this can be indicated with "-o N". When N is not specified, the + * test attempts to find the optimal range of N for testing. Clearly when N is + * large, then the checkpoint may be successfully written, and the data + * represented by the checkpoint will be fully present. When N is small, + * nothing of interest is written and no data is present. To find the sweet + * spot where interesting failures occur, the test does a binary search to find + * the approximate N that divides the "small" and "large" cases. This is not + * strictly deterministic, a given N may give different results on different + * runs. But approximate optimal N can be determined, allowing a series of + * additional tests clustered around this N. + * + * The data is stored in two tables, one having indices. Both tables have + * the same keys and are updated with the same key in a single transaction. + * + * Failure mode: + * If one table is out of step with the other, that is detected as a failure at + * the top level. If an index is missing values (or has extra values), that is + * likewise a failure at the top level. If the tables or the home directory + * cannot be opened, that is a top level error. The tables must be present + * as an initial checkpoint is done without any injected fault. + */ + +/* + * This program does not run on Windows. The non-portable aspects at minimum + * are fork/exec the use of environment variables (used by fail_fs), and file + * name and build locations of dynamically loaded libraries. + */ +#define BIG_SIZE (1024 * 10) +#define BIG_CONTENTS "<Big String Contents>" +#define MAX_ARGS 20 +#define MAX_OP_RANGE 1000 +#define STDERR_FILE "stderr.txt" +#define STDOUT_FILE "stdout.txt" +#define TESTS_PER_OP_VALUE 3 +#define VERBOSE_PRINT 10000 + +static int check_results(TEST_OPTS *, uint64_t *); +static void check_values(WT_CURSOR *, int, int, int, char *); +static int create_big_string(char **); +static void cursor_count_items(WT_CURSOR *, uint64_t *); +static void disable_failures(void); +static void enable_failures(uint64_t, uint64_t); +static void generate_key(uint64_t, int *); +static void generate_value(uint32_t, uint64_t, char *, int *, int *, int *, + char **); +static void run_check_subtest(TEST_OPTS *, const char *, uint64_t, bool, + uint64_t *); +static void run_check_subtest_range(TEST_OPTS *, const char *, bool); +static int run_process(TEST_OPTS *, const char *, char *[], int *); +static int subtest_main(int, char *[], bool); +static void subtest_populate(TEST_OPTS *, bool); +int main(int, char *[]); + +extern int __wt_optind; + +#define WT_FAIL_FS_LIB "../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so" + +/* + * check_results -- + * Check all the tables and verify the results. + */ +static int +check_results(TEST_OPTS *opts, uint64_t *foundp) +{ + WT_CURSOR *maincur, *maincur2, *v0cur, *v1cur, *v2cur; + WT_SESSION *session; + uint64_t count, idxcount, nrecords; + uint32_t rndint; + int key, key_got, ret, v0, v1, v2; + char *bigref, *big; + + testutil_check(create_big_string(&bigref)); + nrecords = opts->nrecords; + testutil_check(wiredtiger_open(opts->home, NULL, + "create,log=(enabled)", &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(session->open_cursor(session, "table:subtest", NULL, + NULL, &maincur)); + testutil_check(session->open_cursor(session, "table:subtest2", NULL, + NULL, &maincur2)); + testutil_check(session->open_cursor(session, "index:subtest:v0", NULL, + NULL, &v0cur)); + testutil_check(session->open_cursor(session, "index:subtest:v1", NULL, + NULL, &v1cur)); + testutil_check(session->open_cursor(session, "index:subtest:v2", NULL, + NULL, &v2cur)); + + count = 0; + while ((ret = maincur->next(maincur)) == 0) { + testutil_check(maincur2->next(maincur2)); + testutil_check(maincur2->get_key(maincur2, &key_got)); + testutil_check(maincur2->get_value(maincur2, &rndint)); + + generate_key(count, &key); + generate_value(rndint, count, bigref, &v0, &v1, &v2, &big); + testutil_assert(key == key_got); + + /* Check the key/values in main table. */ + testutil_check(maincur->get_key(maincur, &key_got)); + testutil_assert(key == key_got); + check_values(maincur, v0, v1, v2, big); + + /* Check the values in the indices. */ + v0cur->set_key(v0cur, v0); + testutil_check(v0cur->search(v0cur)); + check_values(v0cur, v0, v1, v2, big); + v1cur->set_key(v1cur, v1); + testutil_check(v1cur->search(v1cur)); + check_values(v1cur, v0, v1, v2, big); + v2cur->set_key(v2cur, v2); + testutil_check(v2cur->search(v2cur)); + check_values(v2cur, v0, v1, v2, big); + + count++; + if (count % VERBOSE_PRINT == 0 && opts->verbose) + printf("checked %" PRIu64 "/%" PRIu64 "\n", count, + nrecords); + } + if (count % VERBOSE_PRINT != 0 && opts->verbose) + printf("checked %" PRIu64 "/%" PRIu64 "\n", count, nrecords); + + /* + * Always expect at least one entry, as populate does a + * checkpoint after the first insert. + */ + testutil_assert(count > 0); + testutil_assert(ret == WT_NOTFOUND); + testutil_assert(maincur2->next(maincur2) == WT_NOTFOUND); + cursor_count_items(v0cur, &idxcount); + testutil_assert(count == idxcount); + cursor_count_items(v1cur, &idxcount); + testutil_assert(count == idxcount); + cursor_count_items(v2cur, &idxcount); + testutil_assert(count == idxcount); + + testutil_check(opts->conn->close(opts->conn, NULL)); + opts->conn = NULL; + + free(bigref); + *foundp = count; + return (0); +} + +/* + * check_values -- + * Check that the values in the cursor match the given values. + */ +static void +check_values(WT_CURSOR *cursor, int v0, int v1, int v2, char *big) +{ + int v0_got, v1_got, v2_got; + char *big_got; + + testutil_check(cursor->get_value(cursor, &v0_got, &v1_got, &v2_got, + &big_got)); + testutil_assert(v0 == v0_got); + testutil_assert(v1 == v1_got); + testutil_assert(v2 == v2_got); + testutil_assert(strcmp(big, big_got) == 0); +} + +/* + * create_big_string -- + * Create and fill the "reference" big array. + */ +static int create_big_string(char **bigp) +{ + size_t i, mod; + char *big; + + if ((big = malloc(BIG_SIZE + 1)) == NULL) + return (ENOMEM); + mod = strlen(BIG_CONTENTS); + for (i = 0; i < BIG_SIZE; i++) { + big[i] = BIG_CONTENTS[i % mod]; + } + big[BIG_SIZE] = '\0'; + *bigp = big; + return (0); +} + +/* + * cursor_count_items -- + * Count the number of items in the table by traversing + * through the cursor. + */ +static void +cursor_count_items(WT_CURSOR *cursor, uint64_t *countp) +{ + int ret; + + *countp = 0; + + testutil_check(cursor->reset(cursor)); + while ((ret = cursor->next(cursor)) == 0) + (*countp)++; + testutil_assert(ret == WT_NOTFOUND); +} + +/* + * disable_failures -- + * Disable failures in the fail file system. + */ +static void +disable_failures(void) +{ + testutil_check(setenv("WT_FAIL_FS_ENABLE", "0", 1)); +} + +/* + * enable_failures -- + * Enable failures in the fail file system. + */ +static void +enable_failures(uint64_t allow_writes, uint64_t allow_reads) +{ + char value[100]; + + testutil_check(setenv("WT_FAIL_FS_ENABLE", "1", 1)); + snprintf(value, sizeof(value), "%" PRIu64, allow_writes); + testutil_check(setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1)); + snprintf(value, sizeof(value), "%" PRIu64, allow_reads); + testutil_check(setenv("WT_FAIL_FS_READ_ALLOW", value, 1)); +} + +/* + * generate_key -- + * Generate a key used by the "subtest" and "subtest2" tables. + */ +static void +generate_key(uint64_t i, int *keyp) +{ + *keyp = (int)i; +} + +/* + * generate_value -- + * Generate values for the "subtest" table. + */ +static void +generate_value(uint32_t rndint, uint64_t i, char *bigref, + int *v0p, int *v1p, int *v2p, char **bigp) +{ + *v0p = (int)(i * 7); + *v1p = (int)(i * 10007); + *v2p = (int)(i * 100000007); + *bigp = &bigref[rndint % BIG_SIZE]; +} + +/* + * run_check_subtest -- + * Run the subtest with the given parameters and check the results. + */ +static void +run_check_subtest(TEST_OPTS *opts, const char *debugger, uint64_t nops, + bool close_test, uint64_t *nresultsp) +{ + int estatus, narg; + char rarg[20], sarg[20], *subtest_args[MAX_ARGS]; + + narg = 0; + if (debugger != NULL) { + subtest_args[narg++] = (char *)debugger; + subtest_args[narg++] = (char *)"--"; + } + + subtest_args[narg++] = (char *)opts->progname; + /* "subtest" must appear before arguments */ + if (close_test) + subtest_args[narg++] = (char *)"subtest_close"; + else + subtest_args[narg++] = (char *)"subtest"; + subtest_args[narg++] = (char *)"-h"; + subtest_args[narg++] = opts->home; + subtest_args[narg++] = (char *)"-v"; /* subtest is always verbose */ + subtest_args[narg++] = (char *)"-p"; + subtest_args[narg++] = (char *)"-o"; + snprintf(sarg, sizeof(sarg), "%" PRIu64, nops); + subtest_args[narg++] = sarg; /* number of operations */ + subtest_args[narg++] = (char *)"-n"; + snprintf(rarg, sizeof(rarg), "%" PRIu64, opts->nrecords); + subtest_args[narg++] = rarg; /* number of records */ + subtest_args[narg++] = NULL; + testutil_assert(narg <= MAX_ARGS); + if (opts->verbose) + printf("running a separate process with %" PRIu64 + " operations until fail...\n", nops); + testutil_clean_work_dir(opts->home); + testutil_check(run_process( + opts, debugger != NULL ? debugger : opts->progname, + subtest_args, &estatus)); + if (opts->verbose) + printf("process exited %d\n", estatus); + + /* + * Verify results in parent process. + */ + testutil_check(check_results(opts, nresultsp)); +} + +/* + * run_check_subtest_range -- + * + * Run successive tests via binary search that determines the approximate + * crossover point between when data is recoverable or not. Once that is + * determined, run the subtest in a range near that crossover point. + * + * The theory is that running at the crossover point will tend to trigger + * "interesting" failures at the borderline when the checkpoint is about to, + * or has, succeeded. If any of those failures creates a WT home directory + * that cannot be recovered, the top level test will fail. + */ +static void +run_check_subtest_range(TEST_OPTS *opts, const char *debugger, bool close_test) +{ + uint64_t cutoff, high, low, mid, nops, nresults; + int i; + bool got_failure, got_success; + + if (opts->verbose) + printf("Determining best range of operations until failure, " + "with close_test %s.\n", + (close_test ? "enabled" : "disabled")); + + run_check_subtest(opts, debugger, 1, close_test, &cutoff); + low = 0; + high = MAX_OP_RANGE; + mid = (low + high) / 2; + while (mid != low) { + run_check_subtest(opts, debugger, mid, close_test, + &nresults); + if (nresults > cutoff) + high = mid; + else + low = mid; + mid = (low + high) / 2; + } + /* + * mid is the number of ops that is the crossover point. + * Run some tests near that point to try to trigger weird + * failures. If mid is too low or too high, it indicates + * there is a fundamental problem with the test. + */ + testutil_assert(mid > 1 && mid < MAX_OP_RANGE - 1); + if (opts->verbose) + printf("Retesting around %" PRIu64 " operations.\n", + mid); + + got_failure = false; + got_success = false; + for (nops = mid - 10; nops < mid + 10; nops++) { + for (i = 0; i < TESTS_PER_OP_VALUE; i++) { + run_check_subtest(opts, debugger, nops, + close_test, &nresults); + if (nresults > cutoff) + got_failure = true; + else + got_success = true; + } + } + /* + * Check that it really ran with a crossover point. + */ + testutil_assert(got_failure); + testutil_assert(got_success); +} + +/* + * run_process -- + * Run a program with arguments, wait until it completes. + */ +static int +run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status) +{ + int pid; + char **arg; + + if (opts->verbose) { + printf("running: "); + for (arg = argv; *arg != NULL; arg++) + printf("%s ", *arg); + printf("\n"); + } + if ((pid = fork()) == 0) { + (void)execv(prog, argv); + testutil_die(errno, "%s", prog); + } else if (pid < 0) + return (errno); + + (void)waitpid(pid, status, 0); + return (0); +} + +/* + * subtest_main -- + * The main program for the subtest + */ +static int +subtest_main(int argc, char *argv[], bool close_test) +{ + TEST_OPTS *opts, _opts; + WT_SESSION *session; + char config[1024], filename[1024]; + struct rlimit rlim; + + if (testutil_disable_long_tests()) + return (0); + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + memset(&rlim, 0, sizeof(rlim)); + + /* No core files during fault injection tests. */ + testutil_check(setrlimit(RLIMIT_CORE, &rlim)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + /* Redirect stderr, stdout. */ + sprintf(filename, "%s/%s", opts->home, STDERR_FILE); + testutil_assert(freopen(filename, "a", stderr) != NULL); + sprintf(filename, "%s/%s", opts->home, STDOUT_FILE); + testutil_assert(freopen(filename, "a", stdout) != NULL); + snprintf(config, sizeof(config), + "create,cache_size=250M,log=(enabled)," + "transaction_sync=(enabled,method=none),extensions=(" + WT_FAIL_FS_LIB + "=(early_load,config={environment=true,verbose=true})]"); + + testutil_check(wiredtiger_open(opts->home, NULL, config, &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(session->create(session, "table:subtest", + "key_format=i,value_format=iiiS," + "columns=(id,v0,v1,v2,big)")); + + testutil_check(session->create(session, "table:subtest2", + "key_format=i,value_format=i")); + + testutil_check(session->create(session, "index:subtest:v0", + "columns=(v0)")); + testutil_check(session->create(session, "index:subtest:v1", + "columns=(v1)")); + testutil_check(session->create(session, "index:subtest:v2", + "columns=(v2)")); + + testutil_check(session->close(session, NULL)); + + subtest_populate(opts, close_test); + + testutil_cleanup(opts); + + return (0); +} + +/* + * This macro is used as a substitute for testutil_check, except that it is + * aware of when a failure may be expected due to the effects of the fail_fs. + * This macro is used only in subtest_populate(), it uses local variables. + */ +#define CHECK(expr) { \ + int _ret; \ + _ret = expr; \ + if (_ret != 0) { \ + if (!failmode || \ + (_ret != WT_RUN_RECOVERY && _ret != EIO)) { \ + fprintf(stderr, " BAD RETURN %d for \"%s\"\n", \ + _ret, #expr); \ + testutil_check(_ret); \ + } else \ + failed = true; \ + } \ +} + +/* + * subtest_populate -- + * Populate the tables. + */ +static void +subtest_populate(TEST_OPTS *opts, bool close_test) +{ + WT_CURSOR *maincur, *maincur2; + WT_RAND_STATE rnd; + WT_SESSION *session; + uint64_t i, nrecords; + uint32_t rndint; + int key, v0, v1, v2; + char *big, *bigref; + bool failed, failmode; + + failmode = failed = false; + __wt_random_init_seed(NULL, &rnd); + CHECK(create_big_string(&bigref)); + nrecords = opts->nrecords; + + CHECK(opts->conn->open_session( + opts->conn, NULL, NULL, &session)); + + CHECK(session->open_cursor(session, "table:subtest", NULL, + NULL, &maincur)); + + CHECK(session->open_cursor(session, "table:subtest2", NULL, + NULL, &maincur2)); + + for (i = 0; i < nrecords && !failed; i++) { + rndint = __wt_random(&rnd); + generate_key(i, &key); + generate_value(rndint, i, bigref, &v0, &v1, &v2, &big); + CHECK(session->begin_transaction(session, NULL)); + maincur->set_key(maincur, key); + maincur->set_value(maincur, v0, v1, v2, big); + CHECK(maincur->insert(maincur)); + + maincur2->set_key(maincur2, key); + maincur2->set_value(maincur2, rndint); + CHECK(maincur2->insert(maincur2)); + CHECK(session->commit_transaction(session, NULL)); + + if (i == 0) + /* + * Force an initial checkpoint, that helps to + * distinguish a clear failure from just not running + * long enough. + */ + CHECK(session->checkpoint(session, NULL)); + + if ((i + 1) % VERBOSE_PRINT == 0 && opts->verbose) + printf(" %" PRIu64 "/%" PRIu64 "\n", + (i + 1), nrecords); + /* Attempt to isolate the failures to checkpointing. */ + if (i == (nrecords/100)) { + enable_failures(opts->nops, 1000000); + failmode = true; /* CHECK should expect failures. */ + CHECK(session->checkpoint(session, NULL)); + failmode = false; + disable_failures(); + if (failed && opts->verbose) + printf("checkpoint failed (expected).\n"); + } + } + + /* + * Closing handles after an extreme fail is likely to cause + * cascading failures (or crashes), so recommended practice is + * to immediately exit. We're interested in testing both with + * and without the recommended practice. + */ + if (failed) { + if (!close_test) { + fprintf(stderr, "exit early.\n"); + exit(0); + } else + fprintf(stderr, "closing after failure.\n"); + } + + free(bigref); + CHECK(maincur->close(maincur)); + CHECK(maincur2->close(maincur2)); + CHECK(session->close(session, NULL)); +} + +/* + * main -- + * The main program for the test. When invoked with "subtest" + * argument, run the subtest. Otherwise, run a separate process + * for each needed subtest, and check the results. + */ +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + uint64_t nresults; + const char *debugger; + + if (testutil_disable_long_tests()) + return (0); + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + debugger = NULL; + + testutil_check(testutil_parse_opts(argc, argv, opts)); + argc -= __wt_optind; + argv += __wt_optind; + if (opts->nrecords == 0) + opts->nrecords = 50000; + + while (argc > 0) { + if (strcmp(argv[0], "subtest") == 0) + return (subtest_main(argc, argv, false)); + else if (strcmp(argv[0], "subtest_close") == 0) + return (subtest_main(argc, argv, true)); + else if (strcmp(argv[0], "gdb") == 0) + debugger = "/usr/bin/gdb"; + else + testutil_assert(false); + argc--; + argv++; + } + if (opts->verbose) { + printf("Number of operations until failure: %" PRIu64 + " (change with -o N)\n", opts->nops); + printf("Number of records: %" PRIu64 + " (change with -n N)\n", opts->nrecords); + } + if (opts->nops == 0) { + run_check_subtest_range(opts, debugger, false); + run_check_subtest_range(opts, debugger, true); + } else + run_check_subtest(opts, debugger, opts->nops, + opts->nrecords, &nresults); + + testutil_clean_work_dir(opts->home); + testutil_cleanup(opts); + + return (0); +} diff --git a/src/third_party/wiredtiger/test/csuite/wt3120_filesys/main.c b/src/third_party/wiredtiger/test/csuite/wt3120_filesys/main.c new file mode 100644 index 00000000000..09dce624066 --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/wt3120_filesys/main.c @@ -0,0 +1,99 @@ +/*- + * Public Domain 2014-2017 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3120 + * Test case description: A simple file system extension built into + * a shared library. + * Failure mode: Loading the file system and closing the connection + * is enough to evoke the failure. This test does slightly more + * than that. + */ + +#define WT_FAIL_FS_LIB "../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so" + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_CURSOR *cursor; + WT_SESSION *session; + char *kstr, *vstr; + char buf[1024]; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + snprintf(buf, sizeof(buf), + "create,extensions=(" WT_FAIL_FS_LIB "=(early_load=true))"); + testutil_check(wiredtiger_open(opts->home, NULL, buf, &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check(session->create(session, opts->uri, + "key_format=S,value_format=S")); + + testutil_check(session->open_cursor(session, opts->uri, NULL, NULL, + &cursor)); + cursor->set_key(cursor, "a"); + cursor->set_value(cursor, "0"); + testutil_check(cursor->insert(cursor)); + cursor->set_key(cursor, "b"); + cursor->set_value(cursor, "1"); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + testutil_check(session->close(session, NULL)); + + /* Force to disk and re-open. */ + testutil_check(opts->conn->close(opts->conn, NULL)); + testutil_check(wiredtiger_open(opts->home, NULL, NULL, &opts->conn)); + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check(session->open_cursor(session, opts->uri, NULL, NULL, + &cursor)); + testutil_check(cursor->next(cursor)); + testutil_check(cursor->get_key(cursor, &kstr)); + testutil_check(cursor->get_value(cursor, &vstr)); + testutil_assert(strcmp(kstr, "a") == 0); + testutil_assert(strcmp(vstr, "0") == 0); + testutil_check(cursor->next(cursor)); + testutil_check(cursor->get_key(cursor, &kstr)); + testutil_check(cursor->get_value(cursor, &vstr)); + testutil_assert(strcmp(kstr, "b") == 0); + testutil_assert(strcmp(vstr, "1") == 0); + testutil_assert(cursor->next(cursor) == WT_NOTFOUND); + testutil_check(cursor->close(cursor)); + testutil_check(session->close(session, NULL)); + printf("Success\n"); + + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} diff --git a/src/third_party/wiredtiger/test/csuite/wt3135_search_near_collator/main.c b/src/third_party/wiredtiger/test/csuite/wt3135_search_near_collator/main.c new file mode 100644 index 00000000000..8783034a7d8 --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/wt3135_search_near_collator/main.c @@ -0,0 +1,360 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3135 + * Test case description: Each set of data is ordered and contains + * five elements (0-4). We insert elements 1 and 3, and then do + * search_near and search for each element. For each set of data, we perform + * these tests first using a custom collator, and second using a custom collator + * and extractor. In each case there are index keys having variable length. + * Failure mode: In the reported test case, the custom compare routine is + * given a truncated key to compare, and the unpack functions return errors + * because the truncation appeared in the middle of a key. + */ + +#define TEST_ENTRY_COUNT 5 +typedef const char *TEST_SET[TEST_ENTRY_COUNT]; +static TEST_SET test_sets[] = { + { "0", "01", "012", "0123", "01234" }, + { "A", "B", "C", "D", "E" }, + { "5", "54", "543", "5432", "54321" }, + { "54321", "5433", "544", "55", "6" } +}; +#define TEST_SET_COUNT (sizeof(test_sets) / sizeof(test_sets[0])) + +static bool +item_str_equal(WT_ITEM *item, const char *str) +{ + return (item->size == strlen(str) + 1 && strncmp((char *)item->data, + str, item->size) == 0); +} + +static int +compare_int(int64_t a, int64_t b) +{ + return (a < b ? -1 : (a > b ? 1 : 0)); +} + +static int +index_compare_primary(WT_PACK_STREAM *s1, WT_PACK_STREAM *s2, int *cmp) +{ + int64_t pkey1, pkey2; + int rc1, rc2; + + rc1 = wiredtiger_unpack_int(s1, &pkey1); + rc2 = wiredtiger_unpack_int(s2, &pkey2); + + if (rc1 == 0 && rc2 == 0) + *cmp = compare_int(pkey1, pkey2); + else if (rc1 != 0 && rc2 != 0) + *cmp = 0; + else if (rc1 != 0) + *cmp = -1; + else + *cmp = 1; + return (0); +} + +static int +index_compare_S(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_PACK_STREAM *s1, *s2; + const char *skey1, *skey2; + + (void)collator; + + testutil_check(wiredtiger_unpack_start(session, "Si", key1->data, + key1->size, &s1)); + testutil_check(wiredtiger_unpack_start(session, "Si", key2->data, + key2->size, &s2)); + + testutil_check(wiredtiger_unpack_str(s1, &skey1)); + testutil_check(wiredtiger_unpack_str(s2, &skey2)); + + if ((*cmp = strcmp(skey1, skey2)) == 0) + testutil_check(index_compare_primary(s1, s2, cmp)); + + testutil_check(wiredtiger_pack_close(s1, NULL)); + testutil_check(wiredtiger_pack_close(s2, NULL)); + + return (0); +} + +static int +index_compare_u(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_ITEM skey1, skey2; + WT_PACK_STREAM *s1, *s2; + + (void)collator; + + testutil_check(wiredtiger_unpack_start(session, "ui", key1->data, + key1->size, &s1)); + testutil_check(wiredtiger_unpack_start(session, "ui", key2->data, + key2->size, &s2)); + + testutil_check(wiredtiger_unpack_item(s1, &skey1)); + testutil_check(wiredtiger_unpack_item(s2, &skey2)); + + if ((*cmp = strcmp(skey1.data, skey2.data)) == 0) + testutil_check(index_compare_primary(s1, s2, cmp)); + + testutil_check(wiredtiger_pack_close(s1, NULL)); + testutil_check(wiredtiger_pack_close(s2, NULL)); + + return (0); +} + +static int +index_extractor_u(WT_EXTRACTOR *extractor, WT_SESSION *session, + const WT_ITEM *key, const WT_ITEM *value, WT_CURSOR *result_cursor) +{ + (void)extractor; + (void)session; + (void)key; + + result_cursor->set_key(result_cursor, value); + return result_cursor->insert(result_cursor); +} + +static WT_COLLATOR collator_S = { index_compare_S, NULL, NULL }; +static WT_COLLATOR collator_u = { index_compare_u, NULL, NULL }; +static WT_EXTRACTOR extractor_u = { index_extractor_u, NULL, NULL }; + +/* + * Check search() and search_near() using the test string indicated + * by test_index. + */ +static void +search_using_str(WT_CURSOR *cursor, TEST_SET test_set, int test_index) +{ + int exact, ret; + const char *result; + const char *str_01, *str_0123, *test_str; + + testutil_assert(test_index >= 0 && test_index <= 4); + str_01 = test_set[1]; + str_0123 = test_set[3]; + test_str = test_set[test_index]; + + cursor->set_key(cursor, test_str); + testutil_check(cursor->search_near(cursor, &exact)); + testutil_check(cursor->get_key(cursor, &result)); + + if (test_index == 0) + testutil_assert(strcmp(result, str_01) == 0 && exact > 0); + else if (test_index == 1) + testutil_assert(strcmp(result, str_01) == 0 && exact == 0); + else if (test_index == 2) + testutil_assert((strcmp(result, str_0123) == 0 && exact > 0) || + (strcmp(result, str_01) == 0 && exact < 0)); + else if (test_index == 3) + testutil_assert(strcmp(result, str_0123) == 0 && exact == 0); + else if (test_index == 4) + testutil_assert(strcmp(result, str_0123) == 0 && exact < 0); + + cursor->set_key(cursor, test_str); + ret = cursor->search(cursor); + + if (test_index == 0 || test_index == 2 || test_index == 4) + testutil_assert(ret == WT_NOTFOUND); + else if (test_index == 1 || test_index == 3) + testutil_assert(ret == 0); +} + +/* + * Check search() and search_near() using the test string indicated + * by test_index against a table containing a variable sized item. + */ +static void +search_using_item(WT_CURSOR *cursor, TEST_SET test_set, int test_index) +{ + WT_ITEM item; + size_t testlen; + int exact, ret; + const char *str_01, *str_0123, *test_str; + + testutil_assert(test_index >= 0 && test_index <= 4); + str_01 = test_set[1]; + str_0123 = test_set[3]; + test_str = test_set[test_index]; + + testlen = strlen(test_str) + 1; + item.data = test_str; + item.size = testlen; + cursor->set_key(cursor, &item); + testutil_check(cursor->search_near(cursor, &exact)); + testutil_check(cursor->get_key(cursor, &item)); + + if (test_index == 0) + testutil_assert(item_str_equal(&item, str_01) && exact > 0); + else if (test_index == 1) + testutil_assert(item_str_equal(&item, str_01) && exact == 0); + else if (test_index == 2) + testutil_assert((item_str_equal(&item, str_0123) && exact > 0) + || (item_str_equal(&item, str_01) && exact < 0)); + else if (test_index == 3) + testutil_assert(item_str_equal(&item, str_0123) && exact == 0); + else if (test_index == 4) + testutil_assert(item_str_equal(&item, str_0123) && exact < 0); + + item.data = test_str; + item.size = testlen; + cursor->set_key(cursor, &item); + ret = cursor->search(cursor); + + if (test_index == 0 || test_index == 2 || test_index == 4) + testutil_assert(ret == WT_NOTFOUND); + else if (test_index == 1 || test_index == 3) + testutil_assert(ret == 0); +} + +/* + * For each set of data, perform tests. + */ +static void +test_one_set(WT_SESSION *session, TEST_SET set) +{ + WT_CURSOR *cursor; + WT_ITEM item; + int32_t i; + + /* + * Part 1: Using a custom collator, insert some elements + * and verify results from search_near. + */ + + testutil_check(session->create(session, + "table:main", "key_format=i,value_format=S,columns=(k,v)")); + testutil_check(session->create(session, + "index:main:def_collator", "columns=(v)")); + testutil_check(session->create(session, + "index:main:custom_collator", + "columns=(v),collator=collator_S")); + + /* Insert only elements #1 and #3. */ + testutil_check(session->open_cursor(session, + "table:main", NULL, NULL, &cursor)); + cursor->set_key(cursor, 0); + cursor->set_value(cursor, set[1]); + testutil_check(cursor->insert(cursor)); + cursor->set_key(cursor, 1); + cursor->set_value(cursor, set[3]); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + + /* Check all elements in def_collator index. */ + testutil_check(session->open_cursor(session, + "index:main:def_collator", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_str(cursor, set, i); + testutil_check(cursor->close(cursor)); + + /* Check all elements in custom_collator index */ + testutil_check(session->open_cursor(session, + "index:main:custom_collator", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_str(cursor, set, i); + testutil_check(cursor->close(cursor)); + + /* + * Part 2: perform the same checks using a custom collator and + * extractor. + */ + testutil_check(session->create(session, + "table:main2", "key_format=i,value_format=u,columns=(k,v)")); + + testutil_check(session->create(session, "index:main2:idx_w_coll", + "key_format=u,collator=collator_u,extractor=extractor_u")); + + testutil_check(session->open_cursor(session, + "table:main2", NULL, NULL, &cursor)); + + memset(&item, 0, sizeof(item)); + item.size = strlen(set[1]) + 1; + item.data = set[1]; + cursor->set_key(cursor, 1); + cursor->set_value(cursor, &item); + testutil_check(cursor->insert(cursor)); + + item.size = strlen(set[3]) + 1; + item.data = set[3]; + cursor->set_key(cursor, 3); + cursor->set_value(cursor, &item); + testutil_check(cursor->insert(cursor)); + + testutil_check(cursor->close(cursor)); + + testutil_check(session->open_cursor(session, + "index:main2:idx_w_coll", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_item(cursor, set, i); + testutil_check(cursor->close(cursor)); + + testutil_check(session->drop(session, "table:main", NULL)); + testutil_check(session->drop(session, "table:main2", NULL)); +} + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_SESSION *session; + size_t i; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + testutil_check(wiredtiger_open(opts->home, NULL, "create", + &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + /* Add any collators and extractors used by tests */ + testutil_check(opts->conn->add_collator(opts->conn, "collator_S", + &collator_S, NULL)); + testutil_check(opts->conn->add_collator(opts->conn, "collator_u", + &collator_u, NULL)); + testutil_check(opts->conn->add_extractor(opts->conn, "extractor_u", + &extractor_u, NULL)); + + for (i = 0; i < TEST_SET_COUNT; i++) { + printf("test set %" WT_SIZET_FMT "\n", i); + test_one_set(session, test_sets[i]); + } + + testutil_check(session->close(session, NULL)); + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} diff --git a/src/third_party/wiredtiger/test/csuite/wt3184_dup_index_collator/main.c b/src/third_party/wiredtiger/test/csuite/wt3184_dup_index_collator/main.c new file mode 100644 index 00000000000..bcefd2f1a3b --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/wt3184_dup_index_collator/main.c @@ -0,0 +1,168 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3184 + * Test case description: Each set of data is ordered and contains + * five elements (0-4). We insert elements 1 and 3, and then do + * search_near and search for each element. For each set of data, we perform + * these tests first using a custom collator, and second using a custom collator + * and extractor. In each case there are index keys having variable length. + * Failure mode: In the reported test case, the custom compare routine is + * given a truncated key to compare, and the unpack functions return errors + * because the truncation appeared in the middle of a key. + */ + +static int +compare_int(int32_t a, int32_t b) +{ + return (a < b ? -1 : (a > b ? 1 : 0)); +} + +static int32_t +item_to_int(WT_ITEM *item) +{ + testutil_assert(item->size == sizeof(int32_t)); + return (*(int32_t *)item->data); +} + +static int +compare_int_items(WT_ITEM *itema, WT_ITEM *itemb) +{ + testutil_assert(itema->size == sizeof(int32_t)); + testutil_assert(itemb->size == sizeof(int32_t)); + return (compare_int(item_to_int(itema), item_to_int(itemb))); +} + +static void +print_int_item(const char *str, const WT_ITEM *item) +{ + if (item->size > 0) { + testutil_assert(item->size == sizeof(int32_t)); + printf("%s%" PRId32, str, *(int32_t *)item->data); + } else + printf("%s<empty>", str); +} + +static int +index_compare(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_ITEM ikey1, pkey1, ikey2, pkey2; + + (void)collator; + testutil_check(wiredtiger_struct_unpack(session, + key1->data, key1->size, "uu", &ikey1, &pkey1)); + testutil_check(wiredtiger_struct_unpack(session, + key2->data, key2->size, "uu", &ikey2, &pkey2)); + + print_int_item("index_compare: index key1 = ", &ikey1); + print_int_item(", primary key1 = ", &pkey1); + print_int_item(", index key2 = ", &ikey2); + print_int_item(", primary key2 = ", &pkey2); + printf("\n"); + + if ((*cmp = compare_int_items(&ikey1, &ikey2)) != 0) + return (0); + + if (pkey1.size != 0 && pkey2.size != 0) + *cmp = compare_int_items(&pkey1, &pkey2); + else if (pkey1.size != 0) + *cmp = 1; + else if (pkey2.size != 0) + *cmp = -1; + else + *cmp = 0; + + return (0); +} + +static WT_COLLATOR index_coll = { index_compare, NULL, NULL }; + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_CURSOR *cursor, *cursor1; + WT_ITEM got, k, v; + WT_SESSION *session; + int32_t ki, vi; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + testutil_check(wiredtiger_open(opts->home, NULL, "create", + &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(opts->conn->add_collator(opts->conn, "index_coll", + &index_coll, NULL)); + + testutil_check(session->create(session, + "table:main", "key_format=u,value_format=u,columns=(k,v)")); + testutil_check(session->create(session, + "index:main:index", "columns=(v),collator=index_coll")); + + printf("adding new record\n"); + testutil_check(session->open_cursor(session, "table:main", NULL, NULL, + &cursor)); + + ki = 13; + vi = 17; + + k.data = &ki; k.size = sizeof(ki); + v.data = &vi; v.size = sizeof(vi); + + cursor->set_key(cursor, &k); + cursor->set_value(cursor, &v); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + + printf("positioning index cursor\n"); + + testutil_check(session->open_cursor(session, "index:main:index", NULL, + NULL, &cursor)); + cursor->set_key(cursor, &v); + testutil_check(cursor->search(cursor)); + + printf("duplicating cursor\n"); + testutil_check(session->open_cursor(session, NULL, cursor, NULL, + &cursor1)); + cursor->get_value(cursor, &got); + testutil_assert(item_to_int(&got) == 17); + cursor1->get_value(cursor1, &got); + testutil_assert(item_to_int(&got) == 17); + + testutil_check(session->close(session, NULL)); + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index cf922b5db04..50430fe073e 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -44,6 +44,7 @@ static void config_map_compression(const char *, u_int *); static void config_map_encryption(const char *, u_int *); static void config_map_file_type(const char *, u_int *); static void config_map_isolation(const char *, u_int *); +static void config_pct(void); static void config_reset(void); /* @@ -159,31 +160,19 @@ config_setup(void) config_encryption(); config_isolation(); config_lrt(); + config_pct(); /* - * Periodically, set the delete percentage to 0 so salvage gets run, - * as long as the delete percentage isn't nailed down. - * Don't do it on the first run, all our smoke tests would hit it. - */ - if (!g.replay && g.run_cnt % 10 == 9 && !config_is_perm("delete_pct")) - config_single("delete_pct=0", 0); - - /* - * If this is an LSM run, set the cache size and crank up the insert - * percentage. + * If this is an LSM run, ensure cache size sanity. + * Ensure there is at least 1MB of cache per thread. */ - if (DATASOURCE("lsm")) { - if (!config_is_perm("cache")) + if (!config_is_perm("cache")) { + if (DATASOURCE("lsm")) g.c_cache = 30 * g.c_chunk_size; - - if (!config_is_perm("insert_pct")) - g.c_insert_pct = mmrand(NULL, 50, 85); + if (g.c_cache < g.c_threads) + g.c_cache = g.c_threads; } - /* Ensure there is at least 1MB of cache per thread. */ - if (!config_is_perm("cache") && g.c_cache < g.c_threads) - g.c_cache = g.c_threads; - /* Give in-memory configuration a final review. */ config_in_memory_check(); @@ -482,6 +471,83 @@ config_lrt(void) } /* + * config_pct -- + * Configure operation percentages. + */ +static void +config_pct(void) +{ + static struct { + const char *name; /* Operation */ + uint32_t *vp; /* Value store */ + u_int order; /* Order of assignment */ + } list[] = { +#define CONFIG_DELETE_ENTRY 0 + { "delete_pct", &g.c_delete_pct, 0 }, + { "insert_pct", &g.c_insert_pct, 0 }, + { "read_pct", &g.c_read_pct, 0 }, + { "write_pct", &g.c_write_pct, 0 }, + }; + u_int i, max_order, max_slot, n, pct; + + /* + * Walk the list of operations, checking for an illegal configuration + * and creating a random order in the list. + */ + pct = 0; + for (i = 0; i < WT_ELEMENTS(list); ++i) + if (config_is_perm(list[i].name)) + pct += *list[i].vp; + else + list[i].order = mmrand(NULL, 1, 1000); + if (pct > 100) + testutil_die(EINVAL, + "operation percentages total to more than 100%%"); + + /* + * If the delete percentage isn't nailed down, periodically set it to + * 0 so salvage gets run. Don't do it on the first run, all our smoke + * tests would hit it. + */ + if (!config_is_perm("delete_pct") && !g.replay && g.run_cnt % 10 == 9) { + list[CONFIG_DELETE_ENTRY].order = 0; + *list[CONFIG_DELETE_ENTRY].vp = 0; + } + + /* + * Walk the list, allocating random numbers of operations in a random + * order. + * + * If the "order" field is non-zero, we need to create a value for this + * operation. Find the largest order field in the array; if one non-zero + * order field is found, it's the last entry and gets the remainder of + * the operations. + */ + for (pct = 100 - pct;;) { + for (i = n = + max_order = max_slot = 0; i < WT_ELEMENTS(list); ++i) { + if (list[i].order != 0) + ++n; + if (list[i].order > max_order) { + max_order = list[i].order; + max_slot = i; + } + } + if (n == 0) + break; + if (n == 1) { + *list[max_slot].vp = pct; + break; + } + *list[max_slot].vp = mmrand(NULL, 0, pct); + list[max_slot].order = 0; + pct -= *list[max_slot].vp; + } + testutil_assert(g.c_delete_pct + + g.c_insert_pct + g.c_read_pct + g.c_write_pct == 100); +} + +/* * config_error -- * Display configuration information on error. */ diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index e4f7af2e1b2..e3e1e73a786 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -131,7 +131,7 @@ static CONFIG c[] = { { "delete_pct", "percent operations that are deletes", - 0x0, 0, 45, 90, &g.c_delete_pct, NULL }, + C_IGNORE, 0, 0, 100, &g.c_delete_pct, NULL }, { "dictionary", "if values are dictionary compressed", /* 20% */ @@ -171,7 +171,7 @@ static CONFIG c[] = { { "insert_pct", "percent operations that are inserts", - 0x0, 0, 45, 90, &g.c_insert_pct, NULL }, + C_IGNORE, 0, 0, 100, &g.c_insert_pct, NULL }, { "internal_key_truncation", "if internal keys are truncated", /* 95% */ @@ -254,6 +254,14 @@ static CONFIG c[] = { "quiet run (same as -q)", C_IGNORE|C_BOOL, 0, 0, 0, &g.c_quiet, NULL }, + { "read_pct", + "percent operations that are reads", + C_IGNORE, 0, 0, 100, &g.c_read_pct, NULL }, + + { "rebalance", + "rebalance testing", /* 100% */ + C_BOOL, 100, 1, 0, &g.c_rebalance, NULL }, + { "repeat_data_pct", "percent duplicate values in row- or var-length column-stores", 0x0, 0, 90, 90, &g.c_repeat_data_pct, NULL }, @@ -270,10 +278,6 @@ static CONFIG c[] = { "the number of runs", C_IGNORE, 0, UINT_MAX, UINT_MAX, &g.c_runs, NULL }, - { "rebalance", - "rebalance testing", /* 100% */ - C_BOOL, 100, 1, 0, &g.c_rebalance, NULL }, - { "salvage", "salvage testing", /* 100% */ C_BOOL, 100, 1, 0, &g.c_salvage, NULL }, @@ -320,7 +324,7 @@ static CONFIG c[] = { { "write_pct", "percent operations that are writes", - 0x0, 0, 90, 90, &g.c_write_pct, NULL }, + C_IGNORE, 0, 0, 100, &g.c_write_pct, NULL }, { NULL, NULL, 0x0, 0, 0, 0, NULL, NULL } }; diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index c1f4875dbb2..6bb44410acc 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -192,6 +192,7 @@ typedef struct { uint32_t c_reverse; uint32_t c_rows; uint32_t c_runs; + uint32_t c_read_pct; uint32_t c_rebalance; uint32_t c_salvage; uint32_t c_split_pct; diff --git a/src/third_party/wiredtiger/test/recovery/random-abort.c b/src/third_party/wiredtiger/test/recovery/random-abort.c index c407361c7eb..660ef0cca67 100644 --- a/src/third_party/wiredtiger/test/recovery/random-abort.c +++ b/src/third_party/wiredtiger/test/recovery/random-abort.c @@ -31,9 +31,13 @@ #include <sys/wait.h> #include <signal.h> -static char home[512]; /* Program working dir */ +static char home[1024]; /* Program working dir */ static const char *progname; /* Program name */ +/* + * These two names for the URI and file system must be maintained in tandem. + */ static const char * const uri = "table:main"; +static const char * const fs_main = "main.wt"; static bool inmem; #define MAX_TH 12 @@ -211,6 +215,7 @@ extern char *__wt_optarg; int main(int argc, char *argv[]) { + struct stat sb; FILE *fp; WT_CONNECTION *conn; WT_CURSOR *cursor; @@ -222,7 +227,7 @@ main(int argc, char *argv[]) pid_t pid; bool fatal, rand_th, rand_time, verify_only; const char *working_dir; - char fname[64], kname[64]; + char fname[64], kname[64], statname[1024]; if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) progname = argv[0]; @@ -263,7 +268,7 @@ main(int argc, char *argv[]) if (argc != 0) usage(); - testutil_work_dir_from_path(home, 512, working_dir); + testutil_work_dir_from_path(home, sizeof(home), working_dir); /* * If the user wants to verify they need to tell us how many threads * there were so we can find the old record files. @@ -305,8 +310,15 @@ main(int argc, char *argv[]) /* parent */ /* * Sleep for the configured amount of time before killing - * the child. + * the child. Start the timeout from the time we notice that + * the table has been created. That allows the test to run + * correctly on really slow machines. Verify the process ID + * still exists in case the child aborts for some reason we + * don't stay in this loop forever. */ + snprintf(statname, sizeof(statname), "%s/%s", home, fs_main); + while (stat(statname, &sb) != 0 && kill(pid, 0) == 0) + sleep(1); sleep(timeout); /* @@ -340,11 +352,8 @@ main(int argc, char *argv[]) for (i = 0; i < nth; ++i) { middle = 0; snprintf(fname, sizeof(fname), RECORDS_FILE, i); - if ((fp = fopen(fname, "r")) == NULL) { - fprintf(stderr, - "Failed to open %s. i %" PRIu32 "\n", fname, i); - testutil_die(errno, "fopen"); - } + if ((fp = fopen(fname, "r")) == NULL) + testutil_die(errno, "fopen: %s", fname); /* * For every key in the saved file, verify that the key exists diff --git a/src/third_party/wiredtiger/test/recovery/truncated-log.c b/src/third_party/wiredtiger/test/recovery/truncated-log.c index c265263d44c..6a142b8e710 100644 --- a/src/third_party/wiredtiger/test/recovery/truncated-log.c +++ b/src/third_party/wiredtiger/test/recovery/truncated-log.c @@ -35,7 +35,7 @@ #define snprintf _snprintf #endif -static char home[512]; /* Program working dir */ +static char home[1024]; /* Program working dir */ static const char *progname; /* Program name */ static const char * const uri = "table:main"; @@ -290,7 +290,7 @@ main(int argc, char *argv[]) if (argc != 0) usage(); - testutil_work_dir_from_path(home, 512, working_dir); + testutil_work_dir_from_path(home, sizeof(home), working_dir); testutil_make_work_dir(home); /* diff --git a/src/third_party/wiredtiger/test/suite/run.py b/src/third_party/wiredtiger/test/suite/run.py index ba6d9f78503..97c58bfdccf 100644 --- a/src/third_party/wiredtiger/test/suite/run.py +++ b/src/third_party/wiredtiger/test/suite/run.py @@ -324,7 +324,8 @@ if __name__ == '__main__': # All global variables should be set before any test classes are loaded. # That way, verbose printing can be done at the class definition level. wttest.WiredTigerTestCase.globalSetup(preserve, timestamp, gdbSub, - verbose, dirarg, longtest) + verbose, wt_builddir, dirarg, + longtest) # Without any tests listed as arguments, do discovery if len(testargs) == 0: diff --git a/src/third_party/wiredtiger/test/suite/test_async01.py b/src/third_party/wiredtiger/test/suite/test_async01.py index cbb3dad8de6..158c16a9381 100644 --- a/src/third_party/wiredtiger/test/suite/test_async01.py +++ b/src/third_party/wiredtiger/test/suite/test_async01.py @@ -132,7 +132,7 @@ class test_async01(wttest.WiredTigerTestCase, suite_subprocess): ]) # Enable async for this test. - def conn_config(self, dir): + def conn_config(self): return 'async=(enabled=true,ops_max=%s,' % self.async_ops + \ 'threads=%s)' % self.async_threads diff --git a/src/third_party/wiredtiger/test/suite/test_async02.py b/src/third_party/wiredtiger/test/suite/test_async02.py index 50652da6dfd..28435fe85b2 100644 --- a/src/third_party/wiredtiger/test/suite/test_async02.py +++ b/src/third_party/wiredtiger/test/suite/test_async02.py @@ -129,7 +129,7 @@ class test_async02(wttest.WiredTigerTestCase, suite_subprocess): ]) # Enable async for this test. - def conn_config(self, dir): + def conn_config(self): return 'async=(enabled=true,ops_max=%s,' % self.async_ops + \ 'threads=%s)' % self.async_threads diff --git a/src/third_party/wiredtiger/test/suite/test_backup03.py b/src/third_party/wiredtiger/test/suite/test_backup03.py index 73d05f0b0a1..c1ed3cc9e1a 100644 --- a/src/third_party/wiredtiger/test/suite/test_backup03.py +++ b/src/third_party/wiredtiger/test/suite/test_backup03.py @@ -74,7 +74,7 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess): ('backup_9', dict(big=3,list=[])), # Backup everything ] - scenarios = make_scenarios(list) + scenarios = make_scenarios(list, prune=3, prunelong=1000) # Create a large cache, otherwise this test runs quite slowly. conn_config = 'cache_size=1G' diff --git a/src/third_party/wiredtiger/test/suite/test_backup04.py b/src/third_party/wiredtiger/test/suite/test_backup04.py index 919649fed57..be52a5e1e97 100644 --- a/src/third_party/wiredtiger/test/suite/test_backup04.py +++ b/src/third_party/wiredtiger/test/suite/test_backup04.py @@ -60,7 +60,7 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess): ]) # Create a large cache, otherwise this test runs quite slowly. - def conn_config(self, dir): + def conn_config(self): return 'cache_size=1G,log=(archive=false,enabled,file_max=%s)' % \ self.logmax diff --git a/src/third_party/wiredtiger/test/suite/test_bug011.py b/src/third_party/wiredtiger/test/suite/test_bug011.py index 969aaeb5b39..5e0721b93f1 100644 --- a/src/third_party/wiredtiger/test/suite/test_bug011.py +++ b/src/third_party/wiredtiger/test/suite/test_bug011.py @@ -43,7 +43,7 @@ class test_bug011(wttest.WiredTigerTestCase): nrows = 10000 nops = 10000 # Add connection configuration for this test. - def conn_config(self, dir): + def conn_config(self): return 'cache_size=1GB' @wttest.longtest("Eviction copes with lots of files") diff --git a/src/third_party/wiredtiger/test/suite/test_collator.py b/src/third_party/wiredtiger/test/suite/test_collator.py index 3fae4ff47cb..7ce135c8976 100644 --- a/src/third_party/wiredtiger/test/suite/test_collator.py +++ b/src/third_party/wiredtiger/test/suite/test_collator.py @@ -48,34 +48,10 @@ class test_collator(wttest.WiredTigerTestCase): nentries = 100 nindices = 4 - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor'), - ('collators', 'revint', 'revint_collator')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') + extlist.extension('collators', 'revint') def create_indices(self): # Create self.nindices index files, each with a column from the CSV diff --git a/src/third_party/wiredtiger/test/suite/test_compress01.py b/src/third_party/wiredtiger/test/suite/test_compress01.py index 606f7b63235..ef1064d294e 100644 --- a/src/third_party/wiredtiger/test/suite/test_compress01.py +++ b/src/third_party/wiredtiger/test/suite/test_compress01.py @@ -51,22 +51,10 @@ class test_compress01(wttest.WiredTigerTestCase): nrecords = 10000 bigvalue = "abcdefghij" * 1000 - # Load the compression extension, compression is enabled elsewhere. - def conn_config(self, dir): - return self.extensionArg(self.compress) - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, name): - if name == None: - return '' - - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext/compressors') - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('compression extension "' + extfile + '" not built') - return ',extensions=["' + extfile + '"]' + # Load the compression extension, skip the test if missing + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('compressors', self.compress) # Create a table, add keys with both big and small values, then verify them. def test_compress(self): diff --git a/src/third_party/wiredtiger/test/suite/test_config03.py b/src/third_party/wiredtiger/test/suite/test_config03.py index 6699f7d2650..89038d71319 100644 --- a/src/third_party/wiredtiger/test/suite/test_config03.py +++ b/src/third_party/wiredtiger/test/suite/test_config03.py @@ -71,7 +71,7 @@ class test_config03(test_base03.test_base03): cache_size_scenarios, create_scenarios, error_prefix_scenarios, eviction_target_scenarios, eviction_trigger_scenarios, multiprocess_scenarios, session_max_scenarios, - transactional_scenarios, verbose_scenarios, prune=1000) + transactional_scenarios, verbose_scenarios, prune=100, prunelong=1000) #wttest.WiredTigerTestCase.printVerbose(2, 'test_config03: running ' + \ # str(len(scenarios)) + ' of ' + \ diff --git a/src/third_party/wiredtiger/test/suite/test_cursor07.py b/src/third_party/wiredtiger/test/suite/test_cursor07.py index d6078183fc1..19db718fd11 100644 --- a/src/third_party/wiredtiger/test/suite/test_cursor07.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor07.py @@ -49,7 +49,7 @@ class test_cursor07(wttest.WiredTigerTestCase, suite_subprocess): ('reopen', dict(reopen=True)) ]) # Enable logging for this test. - def conn_config(self, dir): + def conn_config(self): return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ 'transaction_sync="(method=dsync,enabled)"' diff --git a/src/third_party/wiredtiger/test/suite/test_cursor08.py b/src/third_party/wiredtiger/test/suite/test_cursor08.py index 3f8f50defa7..cc76f528aa9 100644 --- a/src/third_party/wiredtiger/test/suite/test_cursor08.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor08.py @@ -54,24 +54,14 @@ class test_cursor08(wttest.WiredTigerTestCase, suite_subprocess): ] scenarios = make_scenarios(reopens, compress) # Load the compression extension, and enable it for logging. - def conn_config(self, dir): + def conn_config(self): return 'log=(archive=false,enabled,file_max=%s,' % self.logmax + \ 'compressor=%s),' % self.compress + \ - 'transaction_sync="(method=dsync,enabled)",' + \ - self.extensionArg(self.compress) + 'transaction_sync="(method=dsync,enabled)"' - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, name): - if name == None or name == 'none': - return '' - - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext/compressors') - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('compression extension "' + extfile + '" not built') - return ',extensions=["' + extfile + '"]' + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('compressors', self.compress) def test_log_cursor(self): # print "Creating %s with config '%s'" % (self.uri, self.create_params) diff --git a/src/third_party/wiredtiger/test/suite/test_cursor_random.py b/src/third_party/wiredtiger/test/suite/test_cursor_random.py index 3bda6dc9946..ee0f85a29ee 100644 --- a/src/third_party/wiredtiger/test/suite/test_cursor_random.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor_random.py @@ -71,6 +71,15 @@ class test_cursor_random(wttest.WiredTigerTestCase): self.assertEquals(cursor.reset(), 0) cursor.close() + # Check that next_random fails with an empty tree, repeatedly. + def test_cursor_random_empty(self): + uri = self.type + self.session.create(uri, 'key_format=S,value_format=S') + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,5): + self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND) + cursor.close + # Check that next_random works with a single value, repeatedly. def test_cursor_random_single_record(self): uri = self.type @@ -127,6 +136,46 @@ class test_cursor_random(wttest.WiredTigerTestCase): def test_cursor_random_multiple_page_records(self): self.cursor_random_multiple_page_records(0) + # Check that next_random fails in the presence of a set of values, some of + # which are deleted. + def test_cursor_random_deleted_partial(self): + uri = self.type + ds = self.dataset(self, uri, 10000, + config='allocation_size=512,leaf_page_max=512') + ds.populate() + + # Close the connection so everything is forced to disk. + self.reopen_conn() + + start = self.session.open_cursor(uri, None) + start.set_key(ds.key(10)) + end = self.session.open_cursor(uri, None) + end.set_key(ds.key(10000-10)) + self.session.truncate(None, start, end, None) + self.assertEqual(start.close(), 0) + self.assertEqual(end.close(), 0) + + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,10): + self.assertEqual(cursor.next(), 0) + + # Check that next_random fails in the presence of a set of values, all of + # which are deleted. + def test_cursor_random_deleted_all(self): + uri = self.type + ds = self.dataset(self, uri, 10000, + config='allocation_size=512,leaf_page_max=512') + ds.populate() + + # Close the connection so everything is forced to disk. + self.reopen_conn() + + self.session.truncate(uri, None, None, None) + + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,10): + self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND) + # Check that opening a random cursor on column-store returns not-supported. class test_cursor_random_column(wttest.WiredTigerTestCase): scenarios = make_scenarios([ diff --git a/src/third_party/wiredtiger/test/suite/test_dump.py b/src/third_party/wiredtiger/test/suite/test_dump.py index f6a83c32489..3127c7aef00 100644 --- a/src/third_party/wiredtiger/test/suite/test_dump.py +++ b/src/third_party/wiredtiger/test/suite/test_dump.py @@ -32,7 +32,7 @@ import wiredtiger, wttest from suite_subprocess import suite_subprocess from wtscenario import make_scenarios from wtdataset import SimpleDataSet, SimpleIndexDataSet, SimpleLSMDataSet, \ - ComplexDataSet, ComplexLSMDataSet + ComplexDataSet, ComplexLSMDataSet, ProjectionDataSet, ProjectionIndexDataSet # test_dump.py # Utilities: wt dump @@ -62,6 +62,10 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): ('table-simple-lsm', dict(uri='table:', dataset=SimpleLSMDataSet)), ('table-complex', dict(uri='table:', dataset=ComplexDataSet)), ('table-complex-lsm', dict(uri='table:', dataset=ComplexLSMDataSet)), + ('table-simple-proj', dict(uri='table:', + dataset=ProjectionDataSet, projection=True)), + ('table-index-proj', dict(uri='table:', + dataset=ProjectionIndexDataSet, projection=True)), ] scenarios = make_scenarios(types, keyfmt, dumpfmt) @@ -158,5 +162,53 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): pop = self.dataset(self, uri2, self.nentries, key_format=self.keyfmt) pop.check() +# test_dump_projection +# Utilities: wt dump +# Test the dump utility with projections +class test_dump_projection(wttest.WiredTigerTestCase, suite_subprocess): + dir = 'dump.dir' # Backup directory name + + name = 'test_dump' + nentries = 2500 + uri = 'table:' + + # Dump, re-load and do a content comparison. + def test_dump(self): + + # Create the object. + uri = self.uri + self.name + pop = ProjectionDataSet(self, uri, self.nentries, key_format='S') + pop.populate() + + # Check some cases with invalid projections. + self.runWt(['dump', '-x', uri + '('], \ + outfilename='bad1.out', errfilename='err1.out', failure=True) + self.check_non_empty_file('err1.out') + self.runWt(['dump', '-x', uri + '(xx)'], \ + outfilename='bad2.out', errfilename='err2.out', failure=True) + self.check_non_empty_file('err2.out') + self.runWt(['dump', '-x', uri + pop.projection[:-1]], \ + outfilename='bad3.out', errfilename='err3.out', failure=True) + self.check_non_empty_file('err3.out') + + # Dump the object with a valid projection. + self.runWt(['dump', '-x', uri + pop.projection], outfilename='dump.out') + + # Re-load the object in a new home. + os.mkdir(self.dir) + self.runWt(['-h', self.dir, 'load', '-f', 'dump.out']) + + # Check the database contents. + self.runWt(['list'], outfilename='list.out') + self.runWt(['-h', self.dir, 'list'], outfilename='list.out.new') + s1 = set(open('list.out').read().split()) + s2 = set(open('list.out.new').read().split()) + self.assertEqual(not s1.symmetric_difference(s2), True) + + # Check the object's contents. + self.reopen_conn(self.dir) + pop_reload = ProjectionDataSet(self, uri, self.nentries, key_format='S') + pop_reload.check() + if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_encrypt01.py b/src/third_party/wiredtiger/test/suite/test_encrypt01.py index 746c9d13e96..317bed93246 100644 --- a/src/third_party/wiredtiger/test/suite/test_encrypt01.py +++ b/src/third_party/wiredtiger/test/suite/test_encrypt01.py @@ -66,41 +66,20 @@ class test_encrypt01(wttest.WiredTigerTestCase): nrecords = 5000 bigvalue = "abcdefghij" * 1001 # len(bigvalue) = 10010 - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('encryptors', self.sys_encrypt) + extlist.extension('encryptors', self.file_encrypt) + extlist.extension('compressors', self.block_compress) + extlist.extension('compressors', self.log_compress) + + def conn_config(self): encarg = 'encryption=(name={0}{1}),'.format( self.sys_encrypt, self.sys_encrypt_args) comparg = '' if self.log_compress != None: comparg='log=(compressor={0}),'.format(self.log_compress) - extarg = self.extensionArg([('encryptors', self.sys_encrypt), - ('encryptors', self.file_encrypt), - ('compressors', self.block_compress), - ('compressors', self.log_compress)]) - conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}{3}'.format( - self.shortid(), encarg, comparg, extarg)) - self.pr(`conn`) - return conn - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + return encarg + comparg # Create a table, add keys with both big and small values, then verify them. def test_encrypt(self): diff --git a/src/third_party/wiredtiger/test/suite/test_encrypt02.py b/src/third_party/wiredtiger/test/suite/test_encrypt02.py index 648686274c4..d950be067e2 100644 --- a/src/third_party/wiredtiger/test/suite/test_encrypt02.py +++ b/src/third_party/wiredtiger/test/suite/test_encrypt02.py @@ -39,51 +39,27 @@ from wtscenario import make_scenarios class test_encrypt02(wttest.WiredTigerTestCase, suite_subprocess): uri = 'file:test_encrypt02' encrypt_type = [ - ('noarg', dict( encrypt='rotn', encrypt_args='name=rotn', - secret_arg=None)), - ('keyid', dict( encrypt='rotn', encrypt_args='name=rotn,keyid=11', - secret_arg=None)), - ('pass', dict( encrypt='rotn', encrypt_args='name=rotn', - secret_arg='ABC')), - ('keyid-pass', dict( encrypt='rotn', encrypt_args='name=rotn,keyid=11', - secret_arg='ABC')), + ('noarg', dict( encrypt_args='name=rotn', secret_arg=None)), + ('keyid', dict( encrypt_args='name=rotn,keyid=11', secret_arg=None)), + ('pass', dict( encrypt_args='name=rotn', secret_arg='ABC')), + ('keyid-pass', dict( + encrypt_args='name=rotn,keyid=11', secret_arg='ABC')), ] scenarios = make_scenarios(encrypt_type) + def conn_extensions(self, extlist): + # Load the compression extension, skip the test if missing + extlist.skip_if_missing = True + extlist.extension('encryptors', 'rotn') + nrecords = 5000 bigvalue = "abcdefghij" * 1001 # len(bigvalue) = 10010 - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): + def conn_config(self): secretarg = '' if self.secret_arg != None: secretarg = ',secretkey=' + self.secret_arg - encarg = 'encryption=({0}{1})'.format(self.encrypt_args, secretarg) - extarg = self.extensionArg([('encryptors', self.encrypt)]) - connarg = 'create,error_prefix="{0}: ",{1},{2}'.format( - self.shortid(), encarg, extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + return 'encryption=({0}{1})'.format(self.encrypt_args, secretarg) # Create a table, add keys with both big and small values, then verify them. def test_pass(self): diff --git a/src/third_party/wiredtiger/test/suite/test_encrypt03.py b/src/third_party/wiredtiger/test/suite/test_encrypt03.py index cf459190637..302572bd044 100644 --- a/src/third_party/wiredtiger/test/suite/test_encrypt03.py +++ b/src/third_party/wiredtiger/test/suite/test_encrypt03.py @@ -50,37 +50,14 @@ class test_encrypt03(wttest.WiredTigerTestCase): ] scenarios = make_scenarios(types, encrypt) - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - encarg = 'encryption=(name={0}{1}),'.format( - self.sys_encrypt, self.sys_encrypt_args) - extarg = self.extensionArg([('encryptors', self.sys_encrypt), - ('encryptors', self.file_encrypt)]) - self.pr('encarg = ' + encarg + ' extarg = ' + extarg) - conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}'.format( - self.shortid(), encarg, extarg)) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('encryptors', self.sys_encrypt) + extlist.extension('encryptors', self.file_encrypt) - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + def conn_config(self): + return 'encryption=(name={0}{1}),'.format( + self.sys_encrypt, self.sys_encrypt_args) # Create a table with encryption values that are in error. def test_encrypt(self): diff --git a/src/third_party/wiredtiger/test/suite/test_encrypt04.py b/src/third_party/wiredtiger/test/suite/test_encrypt04.py index a244cf97961..17777fc9564 100644 --- a/src/third_party/wiredtiger/test/suite/test_encrypt04.py +++ b/src/third_party/wiredtiger/test/suite/test_encrypt04.py @@ -77,9 +77,16 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): wttest.WiredTigerTestCase.__init__(self, *args, **kwargs) self.part = 1 + def conn_extensions(self, extlist): + extarg = None + if self.expect_forceerror: + extarg='(config=\"rotn_force_error=true\")' + extlist.skip_if_missing = True + extlist.extension('encryptors', self.name, extarg) + # Override WiredTigerTestCase, we have extensions. def setUpConnectionOpen(self, dir): - forceerror = None + self.expect_forceerror = False if self.part == 1: self.name = self.name1 self.keyid = self.keyid1 @@ -93,16 +100,15 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): self.fileinclear = self.fileinclear2 if \ hasattr(self, 'fileinclear2') else False if hasattr(self, 'forceerror1') and hasattr(self, 'forceerror2'): - forceerror = "rotn_force_error=true" - self.expect_forceerror = forceerror != None + self.expect_forceerror = True self.got_forceerror = False encarg = 'encryption=(name={0},keyid={1},secretkey={2}),'.format( self.name, self.keyid, self.secretkey) - # If forceerror is set for this test, add a config arg to - # the extension string. That signals rotn to return a (-1000) - # error code, which we'll detect here. - extarg = self.extensionArg([('encryptors', self.name, forceerror)]) + # If forceerror is set for this test, conn_extensions adds a + # config arg to the extension string. That signals rotn to + # return a (-1000) error code, which we'll detect here. + extarg = self.extensionsConfig() self.pr('encarg = ' + encarg + ' extarg = ' + extarg) completed = False try: @@ -135,29 +141,6 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): self.assertEqual(cursor.search(), 0) self.assertEquals(cursor.get_value(), val) - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, extarg) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - extfile = '"' + extfile + '"' - if not extfile in extfiles: - s = extfile - if extarg != None: - s += "=(config=\"" + extarg + "\")" - extfiles.append(s) - if len(extfiles) == 0: - return '' - else: - return ',extensions=[' + ','.join(extfiles) + ']' - # Evaluate expression, which either must succeed (if expect_okay) # or must fail (if !expect_okay). def check_okay(self, expect_okay, expr): diff --git a/src/third_party/wiredtiger/test/suite/test_encrypt05.py b/src/third_party/wiredtiger/test/suite/test_encrypt05.py index 19a3522b3d5..d8862321821 100644 --- a/src/third_party/wiredtiger/test/suite/test_encrypt05.py +++ b/src/third_party/wiredtiger/test/suite/test_encrypt05.py @@ -49,41 +49,20 @@ class test_encrypt05(wttest.WiredTigerTestCase): nrecords = 500 bigvalue = 'a' * 500 # we use values that will definitely give compression - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('encryptors', self.sys_encrypt) + extlist.extension('encryptors', self.file_encrypt) + extlist.extension('compressors', self.block_compress) + extlist.extension('compressors', self.log_compress) + + def conn_config(self): encarg = 'encryption=(name={0}{1}),'.format( self.sys_encrypt, self.sys_encrypt_args) comparg = '' if self.log_compress != None: comparg='log=(compressor={0}),'.format(self.log_compress) - extarg = self.extensionArg([('encryptors', self.sys_encrypt), - ('encryptors', self.file_encrypt), - ('compressors', self.block_compress), - ('compressors', self.log_compress)]) - conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}{3}'.format( - self.shortid(), encarg, comparg, extarg)) - self.pr(`conn`) - return conn - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + return encarg + comparg def getvalue(self, r, n): if n < len(self.bigvalue): diff --git a/src/third_party/wiredtiger/test/suite/test_encrypt06.py b/src/third_party/wiredtiger/test/suite/test_encrypt06.py index 893c4ba3095..72718e53b2b 100644 --- a/src/third_party/wiredtiger/test/suite/test_encrypt06.py +++ b/src/third_party/wiredtiger/test/suite/test_encrypt06.py @@ -89,38 +89,15 @@ class test_encrypt06(wttest.WiredTigerTestCase): scenarios = make_scenarios(encrypt, storagetype) nrecords = 1000 - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - encarg = 'encryption=(name={0}{1}),'.format( + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('encryptors', self.sys_encrypt) + extlist.extension('encryptors', self.file0_encrypt) + extlist.extension('encryptors', self.file1_encrypt) + + def conn_config(self): + return 'encryption=(name={0}{1}),'.format( self.sys_encrypt, self.sys_encrypt_args) - comparg = '' - extarg = self.extensionArg([('encryptors', self.sys_encrypt), - ('encryptors', self.file0_encrypt), - ('encryptors', self.file1_encrypt)]) - self.open_params = 'create,error_prefix="{0}: ",{1}{2}{3}'.format( - self.shortid(), encarg, comparg, extarg) - conn = self.wiredtiger_open(dir, self.open_params) - self.pr(`conn`) - return conn - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' def encrypt_file_params(self, name, args): if name == None: diff --git a/src/third_party/wiredtiger/test/suite/test_encrypt07.py b/src/third_party/wiredtiger/test/suite/test_encrypt07.py index 97ab1987d4f..81c9f1a49ea 100644 --- a/src/third_party/wiredtiger/test/suite/test_encrypt07.py +++ b/src/third_party/wiredtiger/test/suite/test_encrypt07.py @@ -44,35 +44,14 @@ class test_encrypt07(test_salvage.test_salvage): nrecords = 5000 bigvalue = "abcdefghij" * 1007 # len(bigvalue) = 10070 - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - encarg = 'encryption=(name={0}{1}),'.format( - self.sys_encrypt, self.sys_encrypt_args) - extarg = self.extensionArg([('encryptors', self.sys_encrypt)]) - conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}'.format( - self.shortid(), encarg, extarg)) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + # Load the compression extension, skip the test if missing + extlist.skip_if_missing = True + extlist.extension('encryptors', self.sys_encrypt) - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + def conn_config(self): + return 'encryption=(name={0}{1}),'.format( + self.sys_encrypt, self.sys_encrypt_args) def rot13(self, s): return codecs.encode(s, 'rot_13') diff --git a/src/third_party/wiredtiger/test/suite/test_intpack.py b/src/third_party/wiredtiger/test/suite/test_intpack.py index b0cece09494..ae391e68fca 100644 --- a/src/third_party/wiredtiger/test/suite/test_intpack.py +++ b/src/third_party/wiredtiger/test/suite/test_intpack.py @@ -126,8 +126,8 @@ class PackTester: class test_intpack(wttest.WiredTigerTestCase): name = 'test_intpack' - # We have to be a bit verbose here with naming, as there can be problems with - # case insensitive test names:w + # We have to be a bit verbose here with naming, scenario names are + # case insensitive and must be unique. scenarios = make_scenarios([ ('int8_t_b', dict(formatcode='b', low=-128, high=127, nbits=8)), diff --git a/src/third_party/wiredtiger/test/suite/test_join01.py b/src/third_party/wiredtiger/test/suite/test_join01.py index 2c4328dc7d3..bdd86a06d4f 100644 --- a/src/third_party/wiredtiger/test/suite/test_join01.py +++ b/src/third_party/wiredtiger/test/suite/test_join01.py @@ -69,7 +69,7 @@ class test_join01(wttest.WiredTigerTestCase): ] scenarios = make_scenarios(type_scen, bloom0_scen, bloom1_scen, projection_scen, nested_scen, stats_scen, - order_scen) + order_scen, prune=50, prunelong=1000) # We need statistics for these tests. conn_config = 'statistics=(all)' diff --git a/src/third_party/wiredtiger/test/suite/test_join03.py b/src/third_party/wiredtiger/test/suite/test_join03.py index edab7146a6b..dd8111f6ead 100644 --- a/src/third_party/wiredtiger/test/suite/test_join03.py +++ b/src/third_party/wiredtiger/test/suite/test_join03.py @@ -36,33 +36,9 @@ class test_join03(wttest.WiredTigerTestCase): table_name1 = 'test_join03' nentries = 100 - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') def gen_key(self, i): return [ i + 1 ] diff --git a/src/third_party/wiredtiger/test/suite/test_join04.py b/src/third_party/wiredtiger/test/suite/test_join04.py index a71418d9f05..e65b8b53333 100644 --- a/src/third_party/wiredtiger/test/suite/test_join04.py +++ b/src/third_party/wiredtiger/test/suite/test_join04.py @@ -36,33 +36,9 @@ class test_join04(wttest.WiredTigerTestCase): table_name1 = 'test_join04' nentries = 100 - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') # JIRA WT-2308: # Test extractors with equality joins diff --git a/src/third_party/wiredtiger/test/suite/test_join07.py b/src/third_party/wiredtiger/test/suite/test_join07.py index 2a32e678d72..8fae3539246 100644 --- a/src/third_party/wiredtiger/test/suite/test_join07.py +++ b/src/third_party/wiredtiger/test/suite/test_join07.py @@ -200,33 +200,9 @@ class test_join07(wttest.WiredTigerTestCase): scenarios = make_scenarios(extractscen) - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') def expect(self, token, expected): if token == None or token.kind not in expected: diff --git a/src/third_party/wiredtiger/test/suite/test_jsondump02.py b/src/third_party/wiredtiger/test/suite/test_jsondump02.py index 8482851fb94..60863c4aa97 100644 --- a/src/third_party/wiredtiger/test/suite/test_jsondump02.py +++ b/src/third_party/wiredtiger/test/suite/test_jsondump02.py @@ -234,6 +234,24 @@ class test_jsondump02(wttest.WiredTigerTestCase, suite_subprocess): ('"ikey" : 4,\n"Skey" : "key4"', '"S1" : "val16",\n"i2" : 16,\n"S3" : "val64",\n"i4" : 64')) self.check_json(self.table_uri4, table4_json) + # This projection has 3 value fields reversed with a key at the end. + table4_json_projection = ( + ('"ikey" : 1,\n"Skey" : "key1"', + '"i4" : 1,\n"S3" : "val1",\n"i2" : 1,\n"ikey" : 1'), + ('"ikey" : 2,\n"Skey" : "key2"', + '"i4" : 8,\n"S3" : "val8",\n"i2" : 4,\n"ikey" : 2'), + ('"ikey" : 3,\n"Skey" : "key3"', + '"i4" : 27,\n"S3" : "val27",\n"i2" : 9,\n"ikey" : 3'), + ('"ikey" : 4,\n"Skey" : "key4"', + '"i4" : 64,\n"S3" : "val64",\n"i2" : 16,\n"ikey" : 4')) + # bad projection URI + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.check_json(self.table_uri4 + '(i4,S3,i2,ikey', + table4_json_projection), + '/Unbalanced brackets/') + # This projection should work. + self.check_json(self.table_uri4 + '(i4,S3,i2,ikey)', + table4_json_projection) # The dump config currently is not supported for the index type. self.check_json(uri4index1, ( ('"Skey" : "key1"', diff --git a/src/third_party/wiredtiger/test/suite/test_lsm01.py b/src/third_party/wiredtiger/test/suite/test_lsm01.py index b44df4bae14..f705b09b0a4 100644 --- a/src/third_party/wiredtiger/test/suite/test_lsm01.py +++ b/src/third_party/wiredtiger/test/suite/test_lsm01.py @@ -57,7 +57,7 @@ class test_lsm01(wttest.WiredTigerTestCase): scenarios = wtscenario.make_scenarios( chunk_size_scenarios, merge_max_scenarios, bloom_scenarios, bloom_bit_scenarios, bloom_hash_scenarios, record_count_scenarios, - prune=500) + prune=100, prunelong=500) # Test drop of an object. def test_lsm(self): diff --git a/src/third_party/wiredtiger/test/suite/test_perf001.py b/src/third_party/wiredtiger/test/suite/test_perf001.py index b22ed2baeb0..6331a3f64d6 100644 --- a/src/third_party/wiredtiger/test/suite/test_perf001.py +++ b/src/third_party/wiredtiger/test/suite/test_perf001.py @@ -40,7 +40,8 @@ class test_perf001(wttest.WiredTigerTestCase): scenarios = make_scenarios([ #('file-file', dict(tabletype='file',indextype='file')), - ('file-lsm', dict(tabletype='file',indextype='lsm')), + ('file-lsm', dict(tabletype='file',indextype='lsm', cfg='', + conn_config="statistics=(fast),statistics_log=(wait=1)")), #('lsm-file', dict(tabletype='lsm',indextype='file')), #('lsm-lsm', dict(tabletype='lsm',indextype='lsm')), ]) diff --git a/src/third_party/wiredtiger/test/suite/test_readonly01.py b/src/third_party/wiredtiger/test/suite/test_readonly01.py index e4b431ca1da..f41280a3283 100644 --- a/src/third_party/wiredtiger/test/suite/test_readonly01.py +++ b/src/third_party/wiredtiger/test/suite/test_readonly01.py @@ -75,8 +75,7 @@ class test_readonly01(wttest.WiredTigerTestCase, suite_subprocess): scenarios = make_scenarios(basecfg_list, dir_list, log_list, types) - def conn_config(self, dir): - self.home = dir + def conn_config(self): params = \ 'error_prefix="%s",' % self.shortid() + \ '%s' % self.logcfg + \ diff --git a/src/third_party/wiredtiger/test/suite/test_reconfig01.py b/src/third_party/wiredtiger/test/suite/test_reconfig01.py index e76becac76a..cbc8bca5740 100644 --- a/src/third_party/wiredtiger/test/suite/test_reconfig01.py +++ b/src/third_party/wiredtiger/test/suite/test_reconfig01.py @@ -64,6 +64,18 @@ class test_reconfig01(wttest.WiredTigerTestCase): # same ops_max of 512 and thread of 8. self.conn.reconfigure("async=(enabled=true)") + def test_reconfig_eviction(self): + # Increase the max number of running threads (default 8). + self.conn.reconfigure("eviction=(threads_max=10)") + # Increase the min number of running threads (default 1). + self.conn.reconfigure("eviction=(threads_min=5)") + # Decrease the max number of running threads. + self.conn.reconfigure("eviction=(threads_max=7)") + # Decrease the min number of running threads. + self.conn.reconfigure("eviction=(threads_min=2)") + # Set min and max the same. + self.conn.reconfigure("eviction=(threads_min=6,threads_max=6)") + def test_reconfig_lsm_manager(self): # We create and populate a tiny LSM so that we can start off with # the LSM threads running and change the numbers of threads. diff --git a/src/third_party/wiredtiger/test/suite/test_reconfig02.py b/src/third_party/wiredtiger/test/suite/test_reconfig02.py index 36a78a1805f..8054b2a6ab5 100644 --- a/src/third_party/wiredtiger/test/suite/test_reconfig02.py +++ b/src/third_party/wiredtiger/test/suite/test_reconfig02.py @@ -109,6 +109,7 @@ class test_reconfig02(wttest.WiredTigerTestCase): # Now turn on archive, sleep a bit to allow the archive thread # to run and then confirm that all original logs are gone. self.conn.reconfigure("log=(archive=true)") + self.session.checkpoint("force") time.sleep(2) cur_logs = fnmatch.filter(os.listdir('.'), "*Log*") for o in orig_logs: diff --git a/src/third_party/wiredtiger/test/suite/test_reconfig04.py b/src/third_party/wiredtiger/test/suite/test_reconfig04.py index be5e6d3729e..51d9b91c1f4 100644 --- a/src/third_party/wiredtiger/test/suite/test_reconfig04.py +++ b/src/third_party/wiredtiger/test/suite/test_reconfig04.py @@ -26,9 +26,7 @@ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. -import fnmatch, os, time import wiredtiger, wttest -from wtdataset import SimpleDataSet # test_reconfig04.py # Test WT_SESSION::reconfigure diff --git a/src/third_party/wiredtiger/test/suite/test_schema05.py b/src/third_party/wiredtiger/test/suite/test_schema05.py index 28ad51b3c92..d536a629373 100644 --- a/src/third_party/wiredtiger/test/suite/test_schema05.py +++ b/src/third_party/wiredtiger/test/suite/test_schema05.py @@ -57,33 +57,9 @@ class test_schema05(wttest.WiredTigerTestCase): ('index-after', { 'create_index' : 2 }), ]) - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') def create_indices(self): # Create self.nindices index files, each with a column from the CSV diff --git a/src/third_party/wiredtiger/test/suite/test_schema07.py b/src/third_party/wiredtiger/test/suite/test_schema07.py index ac397c6e1a1..3e4b1d28a4d 100644 --- a/src/third_party/wiredtiger/test/suite/test_schema07.py +++ b/src/third_party/wiredtiger/test/suite/test_schema07.py @@ -33,8 +33,7 @@ import wiredtiger, wttest class test_schema07(wttest.WiredTigerTestCase): tablename = 'table:test_schema07' - def conn_config(self, dir): - return 'cache_size=10MB' + conn_config = 'cache_size=10MB' @wttest.longtest("Creating many tables shouldn't fill the cache") def test_many_tables(self): diff --git a/src/third_party/wiredtiger/test/suite/test_stat02.py b/src/third_party/wiredtiger/test/suite/test_stat02.py index cecda7f1ddc..45af283ed02 100644 --- a/src/third_party/wiredtiger/test/suite/test_stat02.py +++ b/src/third_party/wiredtiger/test/suite/test_stat02.py @@ -59,7 +59,7 @@ class test_stat_cursor_config(wttest.WiredTigerTestCase): scenarios = make_scenarios(uri, data_config, cursor_config) # Turn on statistics for this test. - def conn_config(self, dir): + def conn_config(self): return 'statistics=(%s)' % self.data_config # For each database/cursor configuration, confirm the right combinations diff --git a/src/third_party/wiredtiger/test/suite/test_sweep01.py b/src/third_party/wiredtiger/test/suite/test_sweep01.py index 71f8fcb180e..5559190caca 100644 --- a/src/third_party/wiredtiger/test/suite/test_sweep01.py +++ b/src/third_party/wiredtiger/test/suite/test_sweep01.py @@ -116,10 +116,15 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): # Give slow machines time to process files. stat_cursor = self.session.open_cursor('statistics:', None, None) this_nfile = stat_cursor[stat.conn.file_open][2] + removed = stat_cursor[stat.conn.dh_sweep_remove][2] stat_cursor.close() self.pr("==== loop " + str(sleep)) self.pr("this_nfile " + str(this_nfile)) - if this_nfile == final_nfile: + self.pr("removed " + str(removed)) + # On slow machines there can be a lag where files get closed but + # the sweep server cannot yet remove the handles. So wait for the + # removed statistic to indicate forward progress too. + if this_nfile == final_nfile and removed != remove1: break c.close() self.pr("Sweep loop took " + str(sleep)) diff --git a/src/third_party/wiredtiger/test/suite/test_truncate01.py b/src/third_party/wiredtiger/test/suite/test_truncate01.py index 2319eeddbef..7d2b3862568 100644 --- a/src/third_party/wiredtiger/test/suite/test_truncate01.py +++ b/src/third_party/wiredtiger/test/suite/test_truncate01.py @@ -183,11 +183,11 @@ class test_truncate_cursor(wttest.WiredTigerTestCase): # those tests to file objects. types = [ ('file', dict(type='file:', valuefmt='S', - config='allocation_size=512,leaf_page_max=512')), + config='allocation_size=512,leaf_page_max=512', P=0.25)), ('file8t', dict(type='file:', valuefmt='8t', - config='allocation_size=512,leaf_page_max=512')), + config='allocation_size=512,leaf_page_max=512', P=0.25)), ('table', dict(type='table:', valuefmt='S', - config='allocation_size=512,leaf_page_max=512')), + config='allocation_size=512,leaf_page_max=512', P=0.5)), ] keyfmt = [ ('integer', dict(keyfmt='i')), @@ -203,7 +203,8 @@ class test_truncate_cursor(wttest.WiredTigerTestCase): ('big', dict(nentries=1000,skip=37)), ] - scenarios = make_scenarios(types, keyfmt, size, reopen) + scenarios = make_scenarios(types, keyfmt, size, reopen, + prune=10, prunelong=1000) # Set a cursor key. def cursorKey(self, ds, uri, key): diff --git a/src/third_party/wiredtiger/test/suite/test_truncate02.py b/src/third_party/wiredtiger/test/suite/test_truncate02.py index 73fed362354..729825b26d4 100644 --- a/src/third_party/wiredtiger/test/suite/test_truncate02.py +++ b/src/third_party/wiredtiger/test/suite/test_truncate02.py @@ -85,7 +85,8 @@ class test_truncate_fast_delete(wttest.WiredTigerTestCase): ('txn2', dict(commit=False)), ] - scenarios = make_scenarios(types, keyfmt, overflow, reads, writes, txn) + scenarios = make_scenarios(types, keyfmt, overflow, reads, writes, txn, + prune=20, prunelong=1000) # Return the number of records visible to the cursor; test both forward # and backward iteration, they are different code paths in this case. diff --git a/src/third_party/wiredtiger/test/suite/test_txn02.py b/src/third_party/wiredtiger/test/suite/test_txn02.py index a0c2c12a47c..01626057b9e 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn02.py +++ b/src/third_party/wiredtiger/test/suite/test_txn02.py @@ -93,11 +93,10 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): checklog_calls = 100 if wttest.islongtest() else 2 checklog_mod = (len(scenarios) / checklog_calls + 1) - def setUpConnectionOpen(self, dir): - self.home = dir + def conn_config(self): # Cycle through the different transaction_sync values in a # deterministic manner. - self.txn_sync = self.sync_list[ + txn_sync = self.sync_list[ self.scenario_number % len(self.sync_list)] # # We don't want to run zero fill with only the same settings, such @@ -107,17 +106,9 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): zerofill = 'false' if self.scenario_number % freq == 0: zerofill = 'true' - self.backup_dir = os.path.join(self.home, "WT_BACKUP") - conn_params = \ - 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ - 'log=(zero_fill=%s),' % zerofill + \ - 'create,error_prefix="%s: ",' % self.shortid() + \ - 'transaction_sync="%s",' % self.txn_sync - # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) - conn = self.wiredtiger_open(dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn + return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ + 'log=(zero_fill=%s),' % zerofill + \ + 'transaction_sync="%s",' % txn_sync # Check that a cursor (optionally started in a new transaction), sees the # expected values. @@ -176,8 +167,10 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): backup_conn = self.wiredtiger_open(self.backup_dir, backup_conn_params) try: - self.check(backup_conn.open_session(), None, committed) + session = backup_conn.open_session() finally: + session.checkpoint("force") + self.check(backup_conn.open_session(), None, committed) # Sleep long enough so that the archive thread is guaranteed # to run before we close the connection. time.sleep(1.0) @@ -204,6 +197,8 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): self.assertEqual(cur_logs, pr_logs) def test_ops(self): + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + self.session2 = self.conn.open_session() # print "Creating %s with config '%s'" % (self.uri, self.create_params) self.session.create(self.uri, self.create_params) # Set up the table with entries for 1, 2, 10 and 11. @@ -226,6 +221,7 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): # Close and reopen the connection and cursor. if reopen == 'reopen': self.reopen_conn() + self.session2 = self.conn.open_session() c = self.session.open_cursor(self.uri, None, 'overwrite') self.session.begin_transaction( diff --git a/src/third_party/wiredtiger/test/suite/test_txn04.py b/src/third_party/wiredtiger/test/suite/test_txn04.py index ade39272f84..d8f6774ded1 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn04.py +++ b/src/third_party/wiredtiger/test/suite/test_txn04.py @@ -63,24 +63,15 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): txn1s = [('t1c', dict(txn1='commit')), ('t1r', dict(txn1='rollback'))] scenarios = make_scenarios(types, op1s, txn1s) - # Overrides WiredTigerTestCase - def setUpConnectionOpen(self, dir): - self.home = dir + + def conn_config(self): # Cycle through the different transaction_sync values in a # deterministic manner. - self.txn_sync = self.sync_list[ + txn_sync = self.sync_list[ self.scenario_number % len(self.sync_list)] - self.backup_dir = os.path.join(self.home, "WT_BACKUP") # Set archive false on the home directory. - conn_params = \ - 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ - 'create,error_prefix="%s: ",' % self.shortid() + \ - 'transaction_sync="%s",' % self.txn_sync - # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) - conn = self.wiredtiger_open(dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn + return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ + 'transaction_sync="%s",' % txn_sync # Check that a cursor (optionally started in a new transaction), sees the # expected values. @@ -146,6 +137,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): # The runWt command closes our connection and sessions so # we need to reopen them here. self.hot_backup(None, committed) + self.session2 = self.conn.open_session() c = self.session.open_cursor(self.uri, None, 'overwrite') c.set_value(1) # Then do the given modification. @@ -193,6 +185,8 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): self.hot_backup(self.uri, committed) def test_ops(self): + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + self.session2 = self.conn.open_session() with self.expectedStdoutPattern('recreating metadata'): self.ops() diff --git a/src/third_party/wiredtiger/test/suite/test_txn05.py b/src/third_party/wiredtiger/test/suite/test_txn05.py index 9e84fe7d3fe..7aaff221ba4 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn05.py +++ b/src/third_party/wiredtiger/test/suite/test_txn05.py @@ -64,23 +64,15 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): txn1s = [('t1c', dict(txn1='commit')), ('t1r', dict(txn1='rollback'))] scenarios = make_scenarios(types, op1s, txn1s) - # Overrides WiredTigerTestCase - def setUpConnectionOpen(self, dir): - self.home = dir + + def conn_config(self): # Cycle through the different transaction_sync values in a # deterministic manner. - self.txn_sync = self.sync_list[ + txn_sync = self.sync_list[ self.scenario_number % len(self.sync_list)] - self.backup_dir = os.path.join(self.home, "WT_BACKUP") - conn_params = \ - 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ - 'create,error_prefix="%s: ",' % self.shortid() + \ - 'transaction_sync="%s",' % self.txn_sync - # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) - conn = self.wiredtiger_open(dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn + # Set archive false on the home directory. + return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ + 'transaction_sync="%s",' % txn_sync # Check that a cursor (optionally started in a new transaction), sees the # expected values. @@ -139,8 +131,12 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): backup_conn = self.wiredtiger_open(self.backup_dir, backup_conn_params) try: - self.check(backup_conn.open_session(), None, committed) + session = backup_conn.open_session() finally: + self.check(session, None, committed) + # Force a checkpoint because we don't record the recovery + # checkpoint as available for archiving. + session.checkpoint("force") # Sleep long enough so that the archive thread is guaranteed # to run before we close the connection. time.sleep(1.0) @@ -163,6 +159,8 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): self.runWt(['-h', self.backup_dir, 'printlog'], outfilename='printlog.out') def test_ops(self): + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + self.session2 = self.conn.open_session() # print "Creating %s with config '%s'" % (self.uri, self.create_params) self.session.create(self.uri, self.create_params) # Set up the table with entries for 1-5. diff --git a/src/third_party/wiredtiger/test/suite/test_txn06.py b/src/third_party/wiredtiger/test/suite/test_txn06.py index 2bff97f6aac..c91dc6a623b 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn06.py +++ b/src/third_party/wiredtiger/test/suite/test_txn06.py @@ -40,10 +40,10 @@ class test_txn06(wttest.WiredTigerTestCase, suite_subprocess): source_uri = 'table:' + tablename + "_src" nrows = 100000 - def setUpConnectionOpen(self, *args): + def conn_config(self): if not wiredtiger.verbose_build(): self.skipTest('requires a verbose build') - return super(test_txn06, self).setUpConnectionOpen(*args) + return '' def test_long_running(self): # Populate a table diff --git a/src/third_party/wiredtiger/test/suite/test_txn07.py b/src/third_party/wiredtiger/test/suite/test_txn07.py index f9577bad7f2..e2986fb999a 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn07.py +++ b/src/third_party/wiredtiger/test/suite/test_txn07.py @@ -70,43 +70,20 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): ('none', dict(compress='')), ] - scenarios = make_scenarios(types, op1s, txn1s, compress) - # Overrides WiredTigerTestCase - def setUpConnectionOpen(self, dir): - self.home = dir - # Cycle through the different transaction_sync values in a - # deterministic manner. - self.txn_sync = self.sync_list[ - self.scenario_number % len(self.sync_list)] - self.backup_dir = os.path.join(self.home, "WT_BACKUP") - conn_params = \ - 'log=(archive=false,enabled,file_max=%s,' % self.logmax + \ - 'compressor=%s)' % self.compress + \ - self.extensionArg(self.compress) + \ - ',create,error_prefix="%s: ",' % self.shortid() + \ - "statistics=(fast)," + \ - 'transaction_sync="%s",' % self.txn_sync - # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) - try: - conn = self.wiredtiger_open(dir, conn_params) - except wiredtiger.WiredTigerError as e: - print "Failed conn at '%s' with config '%s'" % (dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, name): - if name == None or name == '': - return '' - - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext/compressors') - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('compression extension "' + extfile + '" not built') - return ',extensions=["' + extfile + '"]' + scenarios = make_scenarios(types, op1s, txn1s, compress, + prune=30, prunelong=1000) + + def conn_config(self): + return 'log=(archive=false,enabled,file_max=%s,' % self.logmax + \ + 'compressor=%s)' % self.compress + \ + ',create,error_prefix="%s: ",' % self.shortid() + \ + "statistics=(fast)," + \ + 'transaction_sync="%s",' % \ + self.sync_list[self.scenario_number % len(self.sync_list)] + + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('compressors', self.compress) # Check that a cursor (optionally started in a new transaction), sees the # expected values. @@ -139,7 +116,7 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): self.backup(self.backup_dir) backup_conn_params = 'log=(enabled,file_max=%s,' % self.logmax + \ 'compressor=%s)' % self.compress + \ - self.extensionArg(self.compress) + self.extensionsConfig() backup_conn = self.wiredtiger_open(self.backup_dir, backup_conn_params) try: self.check(backup_conn.open_session(), None, committed) @@ -147,6 +124,9 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): backup_conn.close() def test_ops(self): + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + self.session2 = self.conn.open_session() + # print "Creating %s with config '%s'" % (self.uri, self.create_params) self.session.create(self.uri, self.create_params) # Set up the table with entries for 1-5. diff --git a/src/third_party/wiredtiger/test/suite/test_txn08.py b/src/third_party/wiredtiger/test/suite/test_txn08.py index f0cdf08df07..04faed9d45a 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn08.py +++ b/src/third_party/wiredtiger/test/suite/test_txn08.py @@ -41,7 +41,7 @@ class test_txn08(wttest.WiredTigerTestCase, suite_subprocess): uri = 'table:' + tablename # Turn on logging for this test. - def conn_config(self, dir): + def conn_config(self): return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ 'transaction_sync="(method=dsync,enabled)"' diff --git a/src/third_party/wiredtiger/test/suite/test_txn09.py b/src/third_party/wiredtiger/test/suite/test_txn09.py index cfad8270ab1..768d714e248 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn09.py +++ b/src/third_party/wiredtiger/test/suite/test_txn09.py @@ -80,19 +80,9 @@ class test_txn09(wttest.WiredTigerTestCase, suite_subprocess): op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s, prune=20, prunelong=5000) - # Overrides WiredTigerTestCase - def setUpConnectionOpen(self, dir): - self.home = dir - conn_params = \ - 'create,error_prefix="%s: ",' % self.shortid() + \ - 'log=(archive=false,enabled=%s),' % int(self.log_enabled) + \ - 'transaction_sync=(enabled=false),' - - # print "Opening conn at '%s' with config '%s'" % (dir, conn_params) - conn = self.wiredtiger_open(dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn + def conn_config(self): + return 'log=(archive=false,enabled=%s),' % int(self.log_enabled) + \ + 'transaction_sync=(enabled=false)' # Check that a cursor (optionally started in a new transaction), sees the # expected values. @@ -141,6 +131,7 @@ class test_txn09(wttest.WiredTigerTestCase, suite_subprocess): # Close and reopen the connection and cursor, toggling the log self.log_enabled = not self.log_enabled self.reopen_conn() + self.session2 = self.conn.open_session() c = self.session.open_cursor(self.uri, None, 'overwrite') self.session.begin_transaction( diff --git a/src/third_party/wiredtiger/test/suite/test_txn11.py b/src/third_party/wiredtiger/test/suite/test_txn11.py index 147bf3a76c0..3c02b1e86e3 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn11.py +++ b/src/third_party/wiredtiger/test/suite/test_txn11.py @@ -44,7 +44,7 @@ class test_txn11(wttest.WiredTigerTestCase, suite_subprocess): uri = 'table:' + tablename # Turn on logging for this test. - def conn_config(self, dir): + def conn_config(self): return 'log=(archive=%s,' % self.archive + \ 'enabled,file_max=%s,prealloc=false),' % self.logmax + \ 'transaction_sync=(enabled=false),' diff --git a/src/third_party/wiredtiger/test/suite/test_txn13.py b/src/third_party/wiredtiger/test/suite/test_txn13.py index ae0250c06e8..2bf49486b3a 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn13.py +++ b/src/third_party/wiredtiger/test/suite/test_txn13.py @@ -50,7 +50,7 @@ class test_txn13(wttest.WiredTigerTestCase, suite_subprocess): ]) # Turn on logging for this test. - def conn_config(self, dir): + def conn_config(self): return 'log=(archive=false,enabled,file_max=%s)' % self.logmax + \ ',cache_size=8G' diff --git a/src/third_party/wiredtiger/test/suite/test_txn15.py b/src/third_party/wiredtiger/test/suite/test_txn15.py index c061c093b02..a2bfb626338 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn15.py +++ b/src/third_party/wiredtiger/test/suite/test_txn15.py @@ -41,7 +41,7 @@ class test_txn15(wttest.WiredTigerTestCase, suite_subprocess): create_params = 'key_format=i,value_format=i' entries = 100 # Turn on logging for this test. - def conn_config(self, dir): + def conn_config(self): return 'statistics=(fast),' + \ 'log=(archive=false,enabled,file_max=100K),' + \ 'use_environment=false,' + \ diff --git a/src/third_party/wiredtiger/test/suite/test_util14.py b/src/third_party/wiredtiger/test/suite/test_util14.py new file mode 100644 index 00000000000..e2a9f41f0d4 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_util14.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +# test_util14.py +# Utilities: wt truncate +class test_util14(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_util14.a' + nentries = 1000 + + def test_truncate_process(self): + """ + Test truncate in a 'wt' process + """ + params = 'key_format=S,value_format=S' + self.session.create('table:' + self.tablename, params) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + cursor = self.session.open_cursor('table:' + self.tablename, None, None) + for i in range(0, self.nentries): + cursor[str(i)] = str(i) + cursor.close() + + self.runWt(["truncate", "table:" + self.tablename]) + + """ + Test to confirm table exists and is empty + """ + outfile="outfile.txt" + errfile="errfile.txt" + self.assertTrue(os.path.exists(self.tablename + ".wt")) + self.runWt(["read", 'table:' + self.tablename, 'NoMatch'], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'NoMatch: not found\n') + + """ + Tests for error cases + 1. Missing URI + 2. Invalid URI + 3. Valid but incorrect URI + 4. Double URI + """ + self.runWt(["truncate"], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'usage:') + + self.runWt(["truncate", "foobar"], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'No such file or directory') + + self.runWt(["truncate", 'table:xx' + self.tablename], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'No such file or directory') + + self.runWt(["truncate", 'table:' + self.tablename, 'table:' + self.tablename], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'usage:') + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_util15.py b/src/third_party/wiredtiger/test/suite/test_util15.py new file mode 100644 index 00000000000..33096e71bee --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_util15.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +# test_util15.py +# Utilities: wt alter +class test_util15(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_util15.a' + + def test_alter_process(self): + """ + Test alter in a 'wt' process + """ + params = 'key_format=S,value_format=S' + self.session.create('table:' + self.tablename, params) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + + """ + Alter access pattern and confirm + """ + acc_pat_seq="access_pattern_hint=sequential" + self.runWt(["alter", "table:" + self.tablename, acc_pat_seq]) + cursor = self.session.open_cursor("metadata:create", None, None) + cursor.set_key("table:" + self.tablename) + self.assertEqual(cursor.search(),0) + string = cursor.get_value() + cursor.close() + self.assertTrue(acc_pat_seq in string) + + """ + Alter access pattern again and confirm + """ + acc_pat_rand="access_pattern_hint=random" + self.runWt(["alter", "table:" + self.tablename, acc_pat_rand]) + cursor = self.session.open_cursor("metadata:create", None, None) + cursor.set_key("table:" + self.tablename) + self.assertEqual(cursor.search(),0) + string = cursor.get_value() + cursor.close() + self.assertTrue(acc_pat_rand in string) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_util16.py b/src/third_party/wiredtiger/test/suite/test_util16.py new file mode 100644 index 00000000000..00e68c1017a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_util16.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +# test_util16.py +# Utilities: wt rename +class test_util16(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_util16.a' + tablename2 = 'test_util16.b' + nentries = 1000 + + def test_rename_process(self): + """ + Test alter in a 'wt' process + """ + params = 'key_format=S,value_format=S' + self.session.create('table:' + self.tablename, params) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + cursor = self.session.open_cursor('table:' + self.tablename, None, None) + for i in range(0, self.nentries): + cursor[str(i)] = str(i) + cursor.close() + + self.runWt(["rename", "table:" + self.tablename, "table:" + self.tablename2]) + self.assertTrue(os.path.exists(self.tablename2 + ".wt")) + cursor = self.session.open_cursor('table:' + self.tablename2, None, None) + count = 0 + while cursor.next() == 0: + count +=1 + cursor.close() + self.assertEquals(self.nentries, count) + + self.runWt(["rename", "table:" + self.tablename2, "table:" + self.tablename]) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + cursor = self.session.open_cursor('table:' + self.tablename, None, None) + count = 0 + while cursor.next() == 0: + count +=1 + cursor.close() + self.assertEquals(self.nentries, count) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_util17.py b/src/third_party/wiredtiger/test/suite/test_util17.py new file mode 100644 index 00000000000..decc1fabf1d --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_util17.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +# test_util17.py +# Utilities: wt stat +class test_util17(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_util17.a' + + def test_stat_process(self): + """ + Test stat in a 'wt' process + This test is just here to confirm that stat produces a correct looking + output, it isn't here to do statistics validation. + """ + params = 'key_format=S,value_format=S' + outfile = "wt-stat.out" + expected_string = "cursor: cursor create calls=" + self.session.create('table:' + self.tablename, params) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + self.runWt(["stat"], outfilename=outfile) + self.check_file_contains(outfile, expected_string) + + expected_string = "cache_walk: Entries in the root page=1" + self.runWt(["stat", "table:" + self.tablename ], outfilename=outfile) + self.check_file_contains(outfile, expected_string) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/wtdataset.py b/src/third_party/wiredtiger/test/suite/wtdataset.py index 74e07e24e93..946b97d995f 100644 --- a/src/third_party/wiredtiger/test/suite/wtdataset.py +++ b/src/third_party/wiredtiger/test/suite/wtdataset.py @@ -41,6 +41,7 @@ class BaseDataSet(object): self.key_format = kwargs.get('key_format', 'S') self.value_format = kwargs.get('value_format', 'S') self.config = kwargs.get('config', '') + self.projection = kwargs.get('projection', '') def create(self): self.testcase.session.create(self.uri, 'key_format=' + self.key_format @@ -103,7 +104,8 @@ class BaseDataSet(object): def check(self): self.testcase.pr('check: ' + self.uri) - cursor = self.testcase.session.open_cursor(self.uri, None) + cursor = self.testcase.session.open_cursor( + self.uri + self.projection, None, None) self.check_cursor(cursor) cursor.close() @@ -289,6 +291,94 @@ class ComplexLSMDataSet(ComplexDataSet): def is_lsm(cls): return True +class ProjectionDataSet(SimpleDataSet): + """ + ProjectionDataSet creates a table with predefined data identical to + SimpleDataSet (single key and value), but when checking it, uses + a cursor with a projection. + """ + def __init__(self, testcase, uri, rows, **kwargs): + kwargs['config'] = kwargs.get('config', '') + ',columns=(k,v0)' + kwargs['projection'] = '(v0,v0,v0)' + super(ProjectionDataSet, self).__init__(testcase, uri, rows, **kwargs) + + # A value suitable for checking the value returned by a cursor. + def comparable_value(self, i): + v0 = self.value(i) + return [v0, v0, v0] + + def check_cursor(self, cursor): + i = 0 + for key, got0, got1, got2 in cursor: + i += 1 + self.testcase.assertEqual(key, self.key(i)) + if cursor.value_format == '8t' and got0 == 0: # deleted + continue + self.testcase.assertEqual([got0, got1, got2], + self.comparable_value(i)) + self.testcase.assertEqual(i, self.rows) + +class ProjectionIndexDataSet(BaseDataSet): + """ + ProjectionIndexDataSet creates a table with three values and + an index. Checks are made against a projection of the main table + and a projection of the index. + """ + def __init__(self, testcase, uri, rows, **kwargs): + self.origconfig = kwargs.get('config', '') + self.indexname = 'index:' + uri.split(":")[1] + ':index0' + kwargs['config'] = self.origconfig + ',columns=(k,v0,v1,v2)' + kwargs['value_format'] = kwargs.get('value_format', 'SiS') + kwargs['projection'] = '(v1,v2,v0)' + super(ProjectionIndexDataSet, self).__init__( + testcase, uri, rows, **kwargs) + + def value(self, i): + return ('v0:' + str(i), i*i, 'v2:' + str(i)) + + # Suitable for checking the value returned by a cursor using a projection. + def comparable_value(self, i): + return [i*i, 'v2:' + str(i), 'v0:' + str(i)] + + def create(self): + super(ProjectionIndexDataSet, self).create() + self.testcase.session.create(self.indexname, 'columns=(v2,v1),' + + self.origconfig) + + def check_cursor(self, cursor): + i = 0 + for key, got0, got1, got2 in cursor: + i += 1 + self.testcase.assertEqual(key, self.key(i)) + if cursor.value_format == '8t' and got0 == 0: # deleted + continue + self.testcase.assertEqual([got0, got1, got2], + self.comparable_value(i)) + self.testcase.assertEqual(i, self.rows) + + def check_index_cursor(self, cursor): + for i in xrange(1, self.rows + 1): + k = self.key(i) + v = self.value(i) + ik = (v[2], v[1]) # The index key is (v2,v2) + expect = [v[1],k,v[2],v[0]] + self.testcase.assertEqual(expect, cursor[ik]) + + def check(self): + BaseDataSet.check(self) + + # Check values in the index. + idxcursor = self.testcase.session.open_cursor( + self.indexname + '(v1,k,v2,v0)') + self.check_index_cursor(idxcursor) + idxcursor.close() + + def index_count(self): + return 1 + + def index_name(self, i): + return self.indexname + # create a key based on a cursor as a shortcut to creating a SimpleDataSet def simple_key(cursor, i): return BaseDataSet.key_by_format(i, cursor.key_format) diff --git a/src/third_party/wiredtiger/test/suite/wttest.py b/src/third_party/wiredtiger/test/suite/wttest.py index 4d6df0bc8bd..0dce51f07d5 100644 --- a/src/third_party/wiredtiger/test/suite/wttest.py +++ b/src/third_party/wiredtiger/test/suite/wttest.py @@ -37,9 +37,8 @@ except ImportError: import unittest from contextlib import contextmanager -import os, re, shutil, sys, time, traceback -import wtscenario -import wiredtiger +import glob, os, re, shutil, sys, time, traceback +import wiredtiger, wtscenario def shortenWithEllipsis(s, maxlen): if len(s) > maxlen: @@ -152,6 +151,14 @@ class TestSuiteConnection(object): else: return getattr(self._conn, attr) +# Just like a list of strings, but with a convenience function +class ExtensionList(list): + skipIfMissing = False + def extension(self, dirname, name, extarg=None): + if name != None and name != 'none': + ext = '' if extarg == None else '=' + extarg + self.append(dirname + '/' + name + ext) + class WiredTigerTestCase(unittest.TestCase): _globalSetup = False _printOnceSeen = {} @@ -160,9 +167,16 @@ class WiredTigerTestCase(unittest.TestCase): # Can be a string or a callable function or lambda expression. conn_config = '' + # conn_extensions can be overridden to add a list of extensions to load. + # Each entry is a string (directory and extension name) and optional config. + # Example: + # conn_extensions = ('extractors/csv_extractor', + # 'test/fail_fs={allow_writes=100}') + conn_extensions = () + @staticmethod def globalSetup(preserveFiles = False, useTimestamp = False, - gdbSub = False, verbose = 1, dirarg = None, + gdbSub = False, verbose = 1, builddir = None, dirarg = None, longtest = False): WiredTigerTestCase._preserveFiles = preserveFiles d = 'WT_TEST' if dirarg == None else dirarg @@ -172,6 +186,7 @@ class WiredTigerTestCase(unittest.TestCase): os.makedirs(d) wtscenario.set_long_run(longtest) WiredTigerTestCase._parentTestdir = d + WiredTigerTestCase._builddir = builddir WiredTigerTestCase._origcwd = os.getcwd() WiredTigerTestCase._resultfile = open(os.path.join(d, 'results.txt'), "w", 0) # unbuffered WiredTigerTestCase._gdbSubprocess = gdbSub @@ -224,12 +239,66 @@ class WiredTigerTestCase(unittest.TestCase): return "%s.%s.%s" % (self.__module__, self.className(), self._testMethodName) - # Can be overridden, but first consider setting self.conn_config . + # Return the wiredtiger_open extension argument for + # any needed shared library. + def extensionsConfig(self): + exts = self.conn_extensions + if hasattr(exts, '__call__'): + exts = ExtensionList() + self.conn_extensions(exts) + result = '' + extfiles = {} + skipIfMissing = False + if hasattr(exts, 'skip_if_missing'): + skipIfMissing = exts.skip_if_missing + for ext in exts: + extconf = '' + if '=' in ext: + splits = ext.split('=', 1) + ext = splits[0] + extconf = '=' + splits[1] + splits = ext.split('/') + if len(splits) != 2: + raise Exception(self.shortid() + + ": " + ext + + ": extension is not named <dir>/<name>") + libname = splits[1] + dirname = splits[0] + pat = os.path.join(WiredTigerTestCase._builddir, 'ext', + dirname, libname, '.libs', 'libwiredtiger_*.so') + filenames = glob.glob(pat) + if len(filenames) == 0: + if skipIfMissing: + self.skipTest('extension "' + ext + '" not built') + continue + else: + raise Exception(self.shortid() + + ": " + ext + + ": no extensions library found matching: " + pat) + elif len(filenames) > 1: + raise Exception(self.shortid() + + ": " + ext + + ": multiple extensions libraries found matching: " + pat) + complete = '"' + filenames[0] + '"' + extconf + if ext in extfiles: + if extfiles[ext] != complete: + raise Exception(self.shortid() + + ": non-matching extension arguments in " + + str(exts)) + else: + extfiles[ext] = complete + if len(extfiles) != 0: + result = ',extensions=[' + ','.join(extfiles.values()) + ']' + return result + + # Can be overridden, but first consider setting self.conn_config + # or self.conn_extensions def setUpConnectionOpen(self, home): self.home = home config = self.conn_config if hasattr(config, '__call__'): - config = config(home) + config = self.conn_config() + config += self.extensionsConfig() # In case the open starts additional threads, flush first to # avoid confusion. sys.stdout.flush() @@ -287,6 +356,7 @@ class WiredTigerTestCase(unittest.TestCase): self.testsubdir = self.className() + '.' + str(self.__class__.wt_ntests) self.testdir = os.path.join(WiredTigerTestCase._parentTestdir, self.testsubdir) self.__class__.wt_ntests += 1 + self.starttime = time.time() if WiredTigerTestCase._verbose > 2: self.prhead('started in ' + self.testdir, True) # tearDown needs connections list, set it here in case the open fails. @@ -355,6 +425,9 @@ class WiredTigerTestCase(unittest.TestCase): else: self.pr('preserving directory ' + self.testdir) + elapsed = time.time() - self.starttime + if elapsed > 0.001 and WiredTigerTestCase._verbose >= 2: + print "%s: %.2f seconds" % (str(self), elapsed) if not passed and not skipped: print "ERROR in " + str(self) self.pr('FAIL') diff --git a/src/third_party/wiredtiger/test/utility/misc.c b/src/third_party/wiredtiger/test/utility/misc.c index 1491c9a6938..1ba08ddd77f 100644 --- a/src/third_party/wiredtiger/test/utility/misc.c +++ b/src/third_party/wiredtiger/test/utility/misc.c @@ -78,7 +78,7 @@ testutil_work_dir_from_path(char *buffer, size_t len, const char *dir) * Remove the work directory. */ void -testutil_clean_work_dir(char *dir) +testutil_clean_work_dir(const char *dir) { size_t len; int ret; diff --git a/src/third_party/wiredtiger/test/utility/test_util.h b/src/third_party/wiredtiger/test/utility/test_util.h index f6a9cd68e02..489bbe18d87 100644 --- a/src/third_party/wiredtiger/test/utility/test_util.h +++ b/src/third_party/wiredtiger/test/utility/test_util.h @@ -183,7 +183,7 @@ void *dmalloc(size_t); void *drealloc(void *, size_t); void *dstrdup(const void *); void *dstrndup(const char *, size_t); -void testutil_clean_work_dir(char *); +void testutil_clean_work_dir(const char *); void testutil_cleanup(TEST_OPTS *); bool testutil_disable_long_tests(void); void testutil_make_work_dir(char *); diff --git a/src/third_party/wiredtiger/test/wtperf/test_conf_dump.py b/src/third_party/wiredtiger/test/wtperf/test_conf_dump.py new file mode 100644 index 00000000000..ef7f276a1d0 --- /dev/null +++ b/src/third_party/wiredtiger/test/wtperf/test_conf_dump.py @@ -0,0 +1,296 @@ +# Usage: python test_conf_dump.py <optional-wtperf-config> +# +# This script tests if the config file dumped in the test directory corresponds +# correctly to the wtperf config file used. Command line options to wtperf are +# also taken into account. +# +# Following expectations are checked for: +# 1. If provided through multiple sources, "conn_config" and "table_config" +# configuration options are appended to each other. All other options get +# replaced by a higher precedent source. +# 2. The precedence order for the options in an increasing order is as follows: +# default option, +# provided through config file, +# provided through option -o +# provided through option -C (for conn_config) or -T (for table_config) +# +# Test fails if any config option is missing or has a wrong value. Test also +# fails if the value for the option is not replaced/appended in the correct +# order of precedence as stated above. + +import os, re, subprocess, sys + +OP_FILE = "WT_TEST/CONFIG.wtperf" +TMP_CONF = "__tmp.wtperf" +WTPERF_BIN = "./wtperf" +WTPERF_DIR = "../../build_posix/bench/wtperf/" + +CONF_NOT_PROVIDED = -2 + +# Generate a wtperf conf file to use +def generate_conf_file(file_name): + f = open(file_name, 'w') + f.write( +'''conn_config="cache_size=16GB,eviction=(threads_max=4),log=(enabled=false),session_max=33" +table_config="leaf_page_max=32k,internal_page_max=16k,allocation_size=4k,split_pct=90,type=file" +close_conn=false +icount=1500 +create=true +compression="snappy" +checkpoint_interval=5 +checkpoint_threads=1 +populate_threads=1 +report_interval=5 +session_count_idle=50 +session_count_idle=60 +session_count_idle=70 +session_count_idle=80 +run_time=5 +sample_interval=5 +sample_rate=1 +table_count=2 +threads=((count=6,updates=1)) +value_sz=1000 +warmup=2 +''') + f.close() + +# Build a command from the given options and execute wtperf +def execute_wtperf(conf_file, option_C = "", option_T = "", option_o = ""): + # Generate the command to run, execute wtperf + cmd = WTPERF_BIN + " -O " + conf_file + if option_C: + cmd += " -C " + option_C + if option_T: + cmd += " -T " + option_T + if option_o: + # Any quotes in option_o need to be escaped before providing it as part + # of the command + option_o_cmd_str = option_o.replace('"', '\\"') + cmd += " -o " + option_o_cmd_str + + print "Running: ", cmd + subprocess.check_call(cmd, shell=True) + print "=========================\n" + +# Build a dictionary of config key and it's value from the given config file. +# Optionally take -C, -T and -o and overwrite/append values as per correct +# precedence +def build_dict_from_conf( + conf_file, option_C = "", option_T = "", option_o = ""): + # Open given conf file and make a dictionary of passed arguments and values + with open(conf_file) as f: + lines = f.read().splitlines() + + # Maintain precedence order of config file, -o, -C/-T + # Build a dict of config options, appending values for table_config and + # conn_config, if specified multiple times. Replace with the latest in + # case of all other configuration keys. + key_val_dict = {} + for line in lines: + if re.match('^\s*#', line) is None: + key_val_pair = line.split('=', 1) + if ((key_val_pair[0] == 'table_config' or + key_val_pair[0] == 'conn_config') and + key_val_pair[0] in key_val_dict): + tmp_val = key_val_dict[key_val_pair[0]][:-1] + tmp_val += "," + tmp_val += key_val_pair[1][1:] + key_val_dict[key_val_pair[0]] = tmp_val + else: + key_val_dict[key_val_pair[0]] = key_val_pair[1] + + # If provided, put option o in the dict + if option_o: + opt_o_key_val_list = option_o.split(',') + for op_o_key_val in opt_o_key_val_list: + key_val_pair = op_o_key_val.split('=', 1) + if ((key_val_pair[0] == 'table_config' or + key_val_pair[0] == 'conn_config') and + key_val_pair[0] in key_val_dict): + tmp_val = key_val_dict[key_val_pair[0]][:-1] + tmp_val += "," + tmp_val += key_val_pair[1][1:] + key_val_dict[key_val_pair[0]] = tmp_val + else: + key_val_dict[key_val_pair[0]] = key_val_pair[1] + + # If provided, put option C in the dict + if option_C: + tmp_val = key_val_dict["conn_config"][:-1] + tmp_val += "," + tmp_val += option_C[1:] + key_val_dict["conn_config"] = tmp_val + + # If provided, put option T in the dict + if option_T: + tmp_val = key_val_dict["table_config"][:-1] + tmp_val += "," + tmp_val += option_T[1:] + key_val_dict["table_config"] = tmp_val + + return key_val_dict + +# Extract configuration value for the given key from the given config file +def extract_config_from_file(conf_file, key): + ret_val = "" + with open(conf_file) as f: + lines = f.read().splitlines() + for line in lines: + if re.match('^\s*#', line) is None: + key_val_pair = line.split('=', 1) + if key_val_pair[0] == key: + ret_val = key_val_pair[1] + return ret_val + +# Extract configuration value for the given key from the given "-o" string +def extract_config_from_opt_o(option_o, key): + ret_val = "" + opt_o_key_val_list = option_o.split(',') + for op_o_key_val in opt_o_key_val_list: + key_val_pair = op_o_key_val.split('=', 1) + if key_val_pair[0] == key: + ret_val = key_val_pair[1] + return ret_val + +# Execute test: +# Run wtperf with given config and check if the dumped config file matches the +# given inputs +def run_test(conf_file, option_C = "", option_T = "", option_o = ""): + # Run wtperf + execute_wtperf(conf_file, option_C, option_T, option_o) + + key_val_dict_ip = build_dict_from_conf( + conf_file, option_C, option_T, option_o) + key_val_dict_op = build_dict_from_conf(OP_FILE) + + conn_config_from_file = extract_config_from_file(conf_file, "conn_config") + table_config_from_file = extract_config_from_file(conf_file, "table_config") + conn_config_from_opt_o = "" + table_config_from_opt_o = "" + if option_o: + conn_config_from_opt_o = extract_config_from_opt_o( + option_o, "conn_config") + table_config_from_opt_o = extract_config_from_opt_o( + option_o, "table_config") + + # Check if dumped output conf matches with input file and options + match = True + for key in key_val_dict_ip: + match_itr = True + + # Check if we see this config key in the dumped file + if not key in key_val_dict_op: + print "Key '", key, "' not found in dumped file ", OP_FILE + match = match_itr = False + continue + + # Check if values from all sources of conn_config are presented in the + # conn_config in dumped file. Also check of their relative ordering as + # per precedence rules defined. + if (key == 'conn_config' and + (conn_config_from_file or conn_config_from_opt_o or option_C)): + # Should find these config in order: file < option o < option C + file_loc = CONF_NOT_PROVIDED + option_o_loc = CONF_NOT_PROVIDED + option_C_loc = CONF_NOT_PROVIDED + op_conn_config = key_val_dict_op['conn_config'] + + if conn_config_from_file: + file_loc = op_conn_config.find(conn_config_from_file[1:-1]) + if conn_config_from_opt_o: + option_o_loc = op_conn_config.find(conn_config_from_opt_o[1:-1]) + if option_C: + option_C_loc = op_conn_config.find(option_C[1:-1]) + + # Check if value from any of the sources is missing + if ((conn_config_from_file and file_loc == -1) or + (conn_config_from_opt_o and option_o_loc == -1) or + (option_C and option_C_loc == -1)): + print "Part of conn_config missing in dumped file ", OP_FILE + match_itr = False + + # Check if the values got appended in the correct order + if match_itr: + if ((option_o_loc != CONF_NOT_PROVIDED and + option_o_loc < file_loc) or + (option_C_loc != CONF_NOT_PROVIDED and + (option_C_loc < file_loc or option_C_loc < option_o_loc))): + print "Detected incorrect config append order:" + match_itr = False + + # Check if values from all sources of table_config are presented in the + # table_config in dumped file. Also check of their relative ordering as + # per precedence rules defined. + if (key == 'table_config' and + (table_config_from_file or table_config_from_opt_o or option_T)): + # Should find these config in order: file < option o < option T + file_loc = CONF_NOT_PROVIDED + option_o_loc = CONF_NOT_PROVIDED + option_T_loc = CONF_NOT_PROVIDED + op_table_config = key_val_dict_op['table_config'] + + if table_config_from_file: + file_loc = op_table_config.find(table_config_from_file[1:-1]) + if table_config_from_opt_o: + option_o_loc = op_table_config.find( + table_config_from_opt_o[1:-1]) + if option_T: + option_T_loc = op_table_config.find(option_T[1:-1]) + + # Check if value from any of the sources is missing + if ((table_config_from_file and file_loc == -1) or + (table_config_from_opt_o and option_o_loc == -1) or + (option_T and option_T_loc == -1)): + print "Part of table_config missing in dumped file ", OP_FILE + match_itr = False + + # Check if the values got appended in the correct order + if match_itr: + if ((option_o_loc != CONF_NOT_PROVIDED and + option_o_loc < file_loc) or + (option_T_loc != CONF_NOT_PROVIDED and + (option_T_loc < file_loc or option_T_loc < option_o_loc))): + print "Detected incorrect config append order:" + match_itr = False + + if (key != 'table_config' and key != 'conn_config' and + key_val_dict_ip[key] != key_val_dict_op[key]): + print "Config mismatch between:" + match_itr = False + + if match_itr is False: + print "Input Config:", key, '=', key_val_dict_ip[key] + print "Dumped Config:", key, '=', key_val_dict_op[key] + print "\n" + + match = match and match_itr + + return match + +# ----------------- Execute Test -------------- +# If a wtperf conf file is provided use it, else generate a temp conf file +os.chdir(WTPERF_DIR) +if len(sys.argv) == 2: + conf_file = sys.argv[1] +else: + conf_file = TMP_CONF + generate_conf_file(conf_file) + +# Run a test with no options +if not run_test(conf_file): + exit(-1) + +# Run a test with -C, -T, -o provided +option_o = "verbose=2,conn_config=\"session_max=135\",table_config=\"type=lsm\",sample_interval=2,run_time=0,sample_rate=2,readonly=false" +option_C = "\"cache_size=10GB,session_max=115\"" +option_T = "\"allocation_size=8k,split_pct=92\"" +if not run_test(conf_file, option_C, option_T, option_o): + exit(-1) + +# Cleanup generated temp files +subprocess.check_call("rm -rf WT_TEST/", shell=True) +if len(sys.argv) == 1 and conf_file == TMP_CONF: + subprocess.check_call("rm " + TMP_CONF, shell=True) + +print "All tests succeeded" diff --git a/src/third_party/wiredtiger/tools/wtstats/stat_data.py b/src/third_party/wiredtiger/tools/wtstats/stat_data.py index d925dd67b80..a94ce524ae3 100644 --- a/src/third_party/wiredtiger/tools/wtstats/stat_data.py +++ b/src/third_party/wiredtiger/tools/wtstats/stat_data.py @@ -94,6 +94,7 @@ no_scale_per_second_list = [ 'btree: row-store leaf pages', 'cache: bytes currently in the cache', 'cache: overflow values cached in memory', + 'cache: tracked dirty bytes in the cache', 'cache_walk: Average difference between current eviction generation when the page was last considered', 'cache_walk: Average on-disk page image size seen', 'cache_walk: Clean pages currently in cache', @@ -127,6 +128,8 @@ no_clear_list = [ 'cache: eviction currently operating in aggressive mode', 'cache: eviction empty score', 'cache: eviction state', + 'cache: eviction worker thread active', + 'cache: eviction worker thread stable number', 'cache: files with active eviction walks', 'cache: maximum bytes configured', 'cache: maximum page size at eviction', @@ -186,6 +189,7 @@ no_clear_list = [ 'transaction: transaction range of IDs currently pinned by named snapshots', 'btree: btree checkpoint generation', 'cache: bytes currently in the cache', + 'cache: tracked dirty bytes in the cache', 'cache_walk: Average difference between current eviction generation when the page was last considered', 'cache_walk: Average on-disk page image size seen', 'cache_walk: Clean pages currently in cache', |