diff options
author | Alex Gorrod <alexg@wiredtiger.com> | 2014-03-04 16:04:48 +1100 |
---|---|---|
committer | Alex Gorrod <alexg@wiredtiger.com> | 2014-03-04 16:04:48 +1100 |
commit | be98ccec7299820ab9cbfa78581d1400b5f5ec65 (patch) | |
tree | 73f63490811959378dd453de3a3d00d4e1ca334c | |
parent | 006f6b7afec5d29351c6c1bf79268cb0d1c011fc (diff) | |
parent | 4deb7b35e421150a6d92ba3c197eab438b63cef7 (diff) | |
download | mongo-be98ccec7299820ab9cbfa78581d1400b5f5ec65.tar.gz |
Merge branch 'develop'
89 files changed, 2114 insertions, 1048 deletions
@@ -1,3 +1,29 @@ +WiredTiger release 2.1.1, 2014-03-04 +------------------------------------ + +The WiredTiger 2.1.1 release contains new features, performance enhancements +and bug fixes. Significant changes include: + +Fix a bug where a page could be marked clean when it contained uncommitted +changes. This bug could cause undefined behavior in transaction rollback +under load. + +Fix a bug with shared caches when rebalancing between connections. + +Add a new public API to WiredTiger that provides the ability to parse +WiredTiger compatible configuration strings. See the upgrading documentation +for further information. [#873] + +A number of performance enhancements to the LSM implementation, particularly +for long running workloads. + +A number of performance enhancements and bug fixes to cache eviction code. + +Add an option to use direct I/O when reading from checkpoints. To enable +the functionality add "direct_io=[checkpoint]" to your wiredtiger_open +configuration string. [#847] + + WiredTiger release 2.1.0, 2014-02-04 ------------------------------------ @@ -1,6 +1,6 @@ -WiredTiger 2.1.0: (February 4, 2014) +WiredTiger 2.1.1: (March 4, 2014) -This is version 2.1.0 of WiredTiger. +This is version 2.1.1 of WiredTiger. 
WiredTiger release packages and documentation can be found at: @@ -9,7 +9,7 @@ WiredTiger release packages and documentation can be found at: Information on configuring, building and installing WiredTiger can be found at: - http://source.wiredtiger.com/2.1.0/install.html + http://source.wiredtiger.com/2.1.1/install.html WiredTiger licensing information can be found at: @@ -1,6 +1,6 @@ WIREDTIGER_VERSION_MAJOR=2 WIREDTIGER_VERSION_MINOR=1 -WIREDTIGER_VERSION_PATCH=0 +WIREDTIGER_VERSION_PATCH=1 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH" WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"` diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c index 21275f69768..c87c3ac75a1 100644 --- a/bench/wtperf/config.c +++ b/bench/wtperf/config.c @@ -60,6 +60,23 @@ config_assign(CONFIG *dest, const CONFIG *src) config_free(dest); memcpy(dest, src, sizeof(CONFIG)); + if (src->uris != NULL) { + dest->uris = (char **)calloc(src->table_count, sizeof(char *)); + for (i = 0; i < src->table_count; i++) + dest->uris[i] = strdup(src->uris[i]); + } + dest->ckptthreads = NULL; + dest->popthreads = NULL; + dest->workers = NULL; + + if (src->base_uri != NULL) + dest->base_uri = strdup(src->base_uri); + if (src->workload != NULL) { + dest->workload = calloc(WORKLOAD_MAX, sizeof(WORKLOAD)); + memcpy(dest->workload, + src->workload, WORKLOAD_MAX * sizeof(WORKLOAD)); + } + for (i = 0; i < sizeof(config_opts) / sizeof(config_opts[0]); i++) if (config_opts[i].type == STRING_TYPE || config_opts[i].type == CONFIG_STRING_TYPE) { @@ -96,14 +113,53 @@ config_free(CONFIG *cfg) *pstr = NULL; } } + if (cfg->uris != NULL) { + for (i = 0; i < cfg->table_count; i++) + free(cfg->uris[i]); + free(cfg->uris); + } + free(cfg->ckptthreads); free(cfg->popthreads); - free(cfg->uri); + free(cfg->base_uri); free(cfg->workers); free(cfg->workload); } /* + * config_compress -- + * Parse the compression configuration. 
+ */ +int +config_compress(CONFIG *cfg) +{ + int ret; + const char *s; + + ret = 0; + s = cfg->compression; + if (strcmp(s, "none") == 0) { + cfg->compress_ext = NULL; + cfg->compress_table = NULL; + } else if (strcmp(s, "bzip") == 0) { + cfg->compress_ext = BZIP_EXT; + cfg->compress_table = BZIP_BLK; + } else if (strcmp(s, "snappy") == 0) { + cfg->compress_ext = SNAPPY_EXT; + cfg->compress_table = SNAPPY_BLK; + } else if (strcmp(s, "zlib") == 0) { + cfg->compress_ext = ZLIB_EXT; + cfg->compress_table = ZLIB_BLK; + } else { + fprintf(stderr, + "invalid compression configuration: %s\n", s); + ret = EINVAL; + } + return (ret); + +} + +/* * config_threads -- * Parse the thread configuration. */ @@ -112,12 +168,10 @@ config_threads(CONFIG *cfg, const char *config, size_t len) { WORKLOAD *workp; WT_CONFIG_ITEM groupk, groupv, k, v; - WT_CONFIG_SCAN *group, *scan; - WT_EXTENSION_API *wt_api; + WT_CONFIG_PARSER *group, *scan; int ret; - wt_api = cfg->conn->get_extension_api(cfg->conn); - + group = scan = NULL; /* Allocate the workload array. */ if ((cfg->workload = calloc(WORKLOAD_MAX, sizeof(WORKLOAD))) == NULL) return (enomem(cfg)); @@ -132,12 +186,11 @@ config_threads(CONFIG *cfg, const char *config, size_t len) * returned from the original string. */ if ((ret = - wt_api->config_scan_begin(wt_api, NULL, config, len, &group)) != 0) + wiredtiger_config_parser_open(NULL, config, len, &group)) != 0) goto err; - while ((ret = - wt_api->config_scan_next(wt_api, group, &groupk, &groupv)) == 0) { - if ((ret = wt_api-> config_scan_begin( - wt_api, NULL, groupk.str, groupk.len, &scan)) != 0) + while ((ret = group->next(group, &groupk, &groupv)) == 0) { + if ((ret = wiredtiger_config_parser_open( + NULL, groupk.str, groupk.len, &scan)) != 0) goto err; /* Move to the next workload slot. 
*/ @@ -150,8 +203,7 @@ config_threads(CONFIG *cfg, const char *config, size_t len) } workp = &cfg->workload[cfg->workload_cnt++]; - while ((ret = - wt_api->config_scan_next(wt_api, scan, &k, &v)) == 0) { + while ((ret = scan->next(scan, &k, &v)) == 0) { if (STRING_MATCH("count", k.str, k.len)) { if ((workp->threads = v.val) <= 0) goto err; @@ -181,7 +233,9 @@ config_threads(CONFIG *cfg, const char *config, size_t len) ret = 0; if (ret != 0 ) goto err; - if ((ret = wt_api->config_scan_end(wt_api, scan)) != 0) + ret = scan->close(scan); + scan = NULL; + if (ret != 0) goto err; if (workp->insert == 0 && @@ -190,12 +244,19 @@ config_threads(CONFIG *cfg, const char *config, size_t len) cfg->workers_cnt += (u_int)workp->threads; } - if ((ret = wt_api->config_scan_end(wt_api, group)) != 0) + ret = group->close(group); + group = NULL; + if (ret != 0) goto err; return (0); -err: fprintf(stderr, +err: if (group != NULL) + (void)group->close(group); + if (scan != NULL) + (void)scan->close(scan); + + fprintf(stderr, "invalid thread configuration or scan error: %.*s\n", (int)len, config); return (EINVAL); @@ -408,21 +469,17 @@ int config_opt_line(CONFIG *cfg, const char *optstr) { WT_CONFIG_ITEM k, v; - WT_CONFIG_SCAN *scan; - WT_EXTENSION_API *wt_api; + WT_CONFIG_PARSER *scan; int ret, t_ret; - wt_api = cfg->conn->get_extension_api(cfg->conn); - - if ((ret = wt_api->config_scan_begin( - wt_api, NULL, optstr, strlen(optstr), &scan)) != 0) { + if ((ret = wiredtiger_config_parser_open( + NULL, optstr, strlen(optstr), &scan)) != 0) { lprintf(cfg, ret, 0, "Error in config_scan_begin"); return (ret); } while (ret == 0) { - if ((ret = - wt_api->config_scan_next(wt_api, scan, &k, &v)) != 0) { + if ((ret = scan->next(scan, &k, &v)) != 0) { /* Any parse error has already been reported. 
*/ if (ret == WT_NOTFOUND) ret = 0; @@ -430,7 +487,7 @@ config_opt_line(CONFIG *cfg, const char *optstr) } ret = config_opt(cfg, &k, &v); } - if ((t_ret = wt_api->config_scan_end(wt_api, scan)) != 0) { + if ((t_ret = scan->close(scan)) != 0) { lprintf(cfg, ret, 0, "Error in config_scan_end"); if (ret == 0) ret = t_ret; @@ -474,6 +531,14 @@ config_sanity(CONFIG *cfg) fprintf(stderr, "interval value longer than the run-time\n"); return (1); } + if (cfg->table_count > 99) { + fprintf(stderr, "table count greater than 99\n"); + return (1); + } + if (cfg->database_count > 99) { + fprintf(stderr, "database count greater than 99\n"); + return (1); + } return (0); } @@ -600,11 +665,8 @@ config_opt_usage(void) void usage(void) { - printf("wtperf [-LMS] [-C config] " + printf("wtperf [-C config] " "[-H mount] [-h home] [-O file] [-o option] [-T config]\n"); - printf("\t-L Use a large default configuration\n"); - printf("\t-M Use a medium default configuration\n"); - printf("\t-S Use a small default configuration\n"); printf("\t-C <string> additional connection configuration\n"); printf("\t (added to option conn_config)\n"); printf("\t-H <mount> configure Helium volume mount point\n"); diff --git a/bench/wtperf/misc.c b/bench/wtperf/misc.c index 4dac172a0c5..4946a260bec 100644 --- a/bench/wtperf/misc.c +++ b/bench/wtperf/misc.c @@ -44,24 +44,27 @@ enomem(const CONFIG *cfg) int setup_log_file(CONFIG *cfg) { + int ret; char *fname; + ret = 0; + if (cfg->verbose < 1) return (0); - if ((fname = calloc(strlen(cfg->home) + + if ((fname = calloc(strlen(cfg->monitor_dir) + strlen(cfg->table_name) + strlen(".stat") + 2, 1)) == NULL) return (enomem(cfg)); - sprintf(fname, "%s/%s.stat", cfg->home, cfg->table_name); - cfg->logf = fopen(fname, "a"); - free(fname); - + sprintf(fname, "%s/%s.stat", cfg->monitor_dir, cfg->table_name); + cfg->logf = fopen(fname, "w"); if (cfg->logf == NULL) { - fprintf(stderr, - "Failed to open log file: %s\n", strerror(errno)); - return (EINVAL); + ret = 
errno; + fprintf(stderr, "%s: %s\n", fname, strerror(ret)); } + free(fname); + if (cfg->logf == NULL) + return (ret); /* Use line buffering for the log file. */ (void)setvbuf(cfg->logf, NULL, _IOLBF, 0); diff --git a/bench/wtperf/runners/fruit-lsm.wtperf b/bench/wtperf/runners/fruit-lsm.wtperf index 193b29d38bf..a06cc37c33d 100644 --- a/bench/wtperf/runners/fruit-lsm.wtperf +++ b/bench/wtperf/runners/fruit-lsm.wtperf @@ -6,7 +6,7 @@ conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024" compact=true sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm,leaf_page_max=16K" icount=25000000 key_sz=40 value_sz=800 diff --git a/bench/wtperf/runners/fruit-short.wtperf b/bench/wtperf/runners/fruit-short.wtperf index 9061e231bbe..f546cbc29b5 100644 --- a/bench/wtperf/runners/fruit-short.wtperf +++ b/bench/wtperf/runners/fruit-short.wtperf @@ -6,7 +6,7 @@ conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024" compact=true sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm,leaf_page_max=16K" icount=25000000 key_sz=40 value_sz=800 diff --git a/bench/wtperf/runners/medium-lsm-compact.wtperf b/bench/wtperf/runners/medium-lsm-compact.wtperf index 5393cdbfeba..62ae8cf86ca 100644 --- a/bench/wtperf/runners/medium-lsm-compact.wtperf +++ 
b/bench/wtperf/runners/medium-lsm-compact.wtperf @@ -1,6 +1,6 @@ # wtperf options file: medium lsm configuration conn_config="cache_size=1G" -table_config="lsm=(chunk_size=100MB,merge_threads=2),type=lsm" +table_config="lsm=(chunk_size=100MB,merge_threads=2,chunk_max=1TB),type=lsm" icount=50000000 populate_threads=1 compact=true diff --git a/bench/wtperf/runners/multi-btree.wtperf b/bench/wtperf/runners/multi-btree.wtperf new file mode 100644 index 00000000000..831aa71cdd2 --- /dev/null +++ b/bench/wtperf/runners/multi-btree.wtperf @@ -0,0 +1,13 @@ +# wtperf options file: small btree multi-database configuration +# Original cache was 500MB. Shared cache is 500MB * database_count. +conn_config="shared_cache=(enable=true,size=2500MB,chunk=10M)" +database_count=5 +table_config="leaf_page_max=4k,internal_page_max=16k,leaf_item_max=1433,internal_item_max=3100,type=file" +# Likewise, divide original icount by database_count. +icount=50000 +populate_threads=1 +random_range=100000000 +report_interval=5 +run_time=3000 +threads=((count=1,reads=1),(count=1,inserts=1)) +value_sz=1024 diff --git a/bench/wtperf/runners/test1-500m-lsm.wtperf b/bench/wtperf/runners/test1-500m-lsm.wtperf index 116641fc165..7f85b8c13e5 100644 --- a/bench/wtperf/runners/test1-500m-lsm.wtperf +++ b/bench/wtperf/runners/test1-500m-lsm.wtperf @@ -1,12 +1,13 @@ -# wtperf options file: simulate riak and its test1 and test2 configuration +# wtperf options file: simulate riak and its test1 configuration # The configuration for the connection and table are from riak and the # specification of the data (count, size, threads) is from basho_bench. 
# -#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)" +#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)" conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024" compact=true +compression="snappy" sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB,merge_threads=3),type=lsm,leaf_page_max=16K" icount=500000000 key_sz=40 value_sz=1000 diff --git a/bench/wtperf/runners/test1-50m-lsm.wtperf b/bench/wtperf/runners/test1-50m-lsm.wtperf index 6551b1565a9..548932f3aee 100644 --- a/bench/wtperf/runners/test1-50m-lsm.wtperf +++ b/bench/wtperf/runners/test1-50m-lsm.wtperf @@ -1,12 +1,12 @@ -# wtperf options file: simulate riak and its test1 and test2 configuration +# wtperf options file: simulate riak and its test1 configuration # The configuration for the connection and table are from riak and the # specification of the data (count, size, threads) is from basho_bench. 
# -#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=300)" +#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)" conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024" compact=true sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm,leaf_page_max=16K" icount=50000000 key_sz=40 value_sz=1000 diff --git a/bench/wtperf/runners/test2-500m-lsm.wtperf b/bench/wtperf/runners/test2-500m-lsm.wtperf index 9b8f225f643..db1ee9d469c 100644 --- a/bench/wtperf/runners/test2-500m-lsm.wtperf +++ b/bench/wtperf/runners/test2-500m-lsm.wtperf @@ -3,11 +3,12 @@ # specification of the data (count, size, threads) is from basho_bench. # This test assumes that a test1 populate already completed and exists. 
# -#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)" +#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)" conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024" create=false -sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +compression="snappy" +sess_config="isolation=snapshot" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB,merge_threads=3),type=lsm,leaf_page_max=16K" key_sz=40 value_sz=1000 report_interval=10 diff --git a/bench/wtperf/runners/test2-50m-lsm.wtperf b/bench/wtperf/runners/test2-50m-lsm.wtperf index 945faade694..c5e77e7e207 100644 --- a/bench/wtperf/runners/test2-50m-lsm.wtperf +++ b/bench/wtperf/runners/test2-50m-lsm.wtperf @@ -3,11 +3,11 @@ # specification of the data (count, size, threads) is from basho_bench. # This test assumes that a test1 populate already completed and exists. 
# -#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=300)" +#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)" conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024" create=false sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm,leaf_page_max=16K" key_sz=40 value_sz=1000 report_interval=10 diff --git a/bench/wtperf/runners/test3-500m-lsm.wtperf b/bench/wtperf/runners/test3-500m-lsm.wtperf index 2da836d577c..446309c32c8 100644 --- a/bench/wtperf/runners/test3-500m-lsm.wtperf +++ b/bench/wtperf/runners/test3-500m-lsm.wtperf @@ -1,13 +1,14 @@ -# wtperf options file: simulate riak and its test2 configuration +# wtperf options file: simulate riak and its test3 configuration # The configuration for the connection and table are from riak and the # specification of the data (count, size, threads) is from basho_bench. # This test assumes that a test1 populate already completed and exists. 
# -#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)" +#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)" conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024" create=false +compression="snappy" sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB,merge_threads=3),type=lsm,leaf_page_max=16K" key_sz=40 value_sz=1000 pareto=true diff --git a/bench/wtperf/runners/test3-50m-lsm.wtperf b/bench/wtperf/runners/test3-50m-lsm.wtperf index bc54cec3496..00291737545 100644 --- a/bench/wtperf/runners/test3-50m-lsm.wtperf +++ b/bench/wtperf/runners/test3-50m-lsm.wtperf @@ -1,17 +1,18 @@ -# wtperf options file: simulate riak and its test2 configuration +# wtperf options file: simulate riak and its test3 configuration # The configuration for the connection and table are from riak and the # specification of the data (count, size, threads) is from basho_bench. # This test assumes that a test1 populate already completed and exists. 
# -#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=300)" +#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)" conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024" create=false sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm,leaf_page_max=16K" key_sz=40 value_sz=1000 pareto=true report_interval=10 run_time=1440 sample_interval=10 -threads=((count=10,reads=1,updates=1)) +#threads=((count=10,reads=1,updates=1)) +threads=((count=5,reads=1),(count=5,updates=1)) diff --git a/bench/wtperf/runners/test4-500m-lsm.wtperf b/bench/wtperf/runners/test4-500m-lsm.wtperf index 1257f473a7d..a959ec9057e 100644 --- a/bench/wtperf/runners/test4-500m-lsm.wtperf +++ b/bench/wtperf/runners/test4-500m-lsm.wtperf @@ -1,13 +1,14 @@ -# wtperf options file: simulate riak and its test2 configuration +# wtperf options file: simulate riak and its test4 configuration # The configuration for the connection and table are from riak and the # specification of the data (count, size, threads) is from basho_bench. # This test assumes that a test1 populate already completed and exists. 
# -#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)" +#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)" conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024" create=false +compression="snappy" sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB,merge_threads=3),type=lsm,leaf_page_max=16K" key_sz=40 value_sz=1000 report_interval=10 diff --git a/bench/wtperf/runners/test4-50m-lsm.wtperf b/bench/wtperf/runners/test4-50m-lsm.wtperf index fc7c96c31d3..6536d8a1de7 100644 --- a/bench/wtperf/runners/test4-50m-lsm.wtperf +++ b/bench/wtperf/runners/test4-50m-lsm.wtperf @@ -1,13 +1,13 @@ -# wtperf options file: simulate riak and its test2 configuration +# wtperf options file: simulate riak and its test4 configuration # The configuration for the connection and table are from riak and the # specification of the data (count, size, threads) is from basho_bench. # This test assumes that a test1 populate already completed and exists. 
# -#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=300)" +#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)" conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024" create=false sess_config="isolation=snapshot" -table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm,leaf_page_max=16K" key_sz=40 value_sz=1000 report_interval=10 diff --git a/bench/wtperf/runners/voxer-10k.wtperf b/bench/wtperf/runners/voxer-10k.wtperf new file mode 100644 index 00000000000..17c46b4ed94 --- /dev/null +++ b/bench/wtperf/runners/voxer-10k.wtperf @@ -0,0 +1,19 @@ +# wtperf options file: simulate riak and its test1 and test2 configuration +# The configuration for the connection and table are from riak and the +# specification of the data (count, size, threads) is from basho_bench. 
+# +#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)" +conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024" +compact=true +compression="snappy" +sess_config="isolation=snapshot" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm,leaf_page_max=16K" +icount=15000 +key_sz=40 +value_sz=10000 +populate_threads=1 +report_interval=10 +random_value=true +run_time=3600 +sample_interval=10 +threads=((count=20,read=1,update=1)) diff --git a/bench/wtperf/runners/voxer-130k.wtperf b/bench/wtperf/runners/voxer-130k.wtperf new file mode 100644 index 00000000000..e81510cf067 --- /dev/null +++ b/bench/wtperf/runners/voxer-130k.wtperf @@ -0,0 +1,19 @@ +# wtperf options file: simulate riak and its test1 and test2 configuration +# The configuration for the connection and table are from riak and the +# specification of the data (count, size, threads) is from basho_bench. +# +#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)" +conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024" +compact=true +compression="snappy" +sess_config="isolation=snapshot" +table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm,leaf_page_max=16K" +icount=15000 +key_sz=40 +value_sz=130000 +populate_threads=1 +report_interval=10 +random_value=true +run_time=3600 +sample_interval=10 +threads=((count=20,read=1,update=1)) diff --git a/bench/wtperf/smoke.sh b/bench/wtperf/smoke.sh index 192d3cd208b..062277d90dc 100755 --- a/bench/wtperf/smoke.sh +++ b/bench/wtperf/smoke.sh @@ -1,4 +1,4 @@ #! /bin/sh # Smoke-test wtperf as part of running "make check". 
-./wtperf -S +./wtperf -O `dirname $0`/runners/small-lsm.wtperf -o "run_time=20" diff --git a/bench/wtperf/track.c b/bench/wtperf/track.c index 12f50dd5411..3919d0eb1ab 100644 --- a/bench/wtperf/track.c +++ b/bench/wtperf/track.c @@ -279,10 +279,8 @@ latency_print_single(CONFIG *cfg, TRACK *total, const char *name) return; } -#ifdef __WRITE_A_HEADER fprintf(fp, - "usecs,operations,cumulative-operations,total-operations\n"); -#endif + "#usecs,operations,cumulative-operations,total-operations\n"); cumops = 0; for (i = 0; i < ELEMENTS(total->us); ++i) { if (total->us[i] == 0) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 5554f2e564e..ab64c00802e 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -31,68 +31,36 @@ static const CONFIG default_cfg = { "WT_TEST", /* home */ "WT_TEST", /* monitor dir */ - NULL, /* uri */ + NULL, /* base_uri */ + NULL, /* uris */ + NULL, /* helium_mount */ NULL, /* conn */ NULL, /* logf */ + NULL, NULL, /* compressor ext, blk */ NULL, NULL, /* populate, checkpoint threads */ NULL, /* worker threads */ 0, /* worker thread count */ NULL, /* workloads */ 0, /* workload count */ + 0, /* checkpoint operations */ + 0, /* insert operations */ + 0, /* read operations */ + 0, /* update operations */ + 0, /* insert key */ + 0, /* checkpoint in progress */ + 0, /* thread error */ + 0, /* notify threads to stop */ + 0, /* total seconds running */ #define OPT_DEFINE_DEFAULT #include "wtperf_opt.i" #undef OPT_DEFINE_DEFAULT }; -static const char * const small_config_str = - "conn_config=\"cache_size=500MB\"," - "table_config=\"lsm=(chunk_size=5MB)\"," - "icount=500000," - "value_sz=100," - "key_sz=20," - "report_interval=5," - "run_time=20," - "populate_threads=1," - "threads=((count=8,read=1)),"; - -static const char * const med_config_str = - "conn_config=\"cache_size=1GB\"," - "table_config=\"lsm=(chunk_size=20MB)\"," - "icount=50000000," - "value_sz=100," - "key_sz=20," - "report_interval=5," - "run_time=100," - 
"populate_threads=1," - "threads=((count=16,read=1)),"; - -static const char * const large_config_str = - "conn_config=\"cache_size=2GB\"," - "table_config=\"lsm=(chunk_size=50MB)\"," - "icount=500000000," - "value_sz=100," - "key_sz=20," - "report_interval=5," - "run_time=600," - "populate_threads=1," - "threads=((count=16,read=1)),"; - static const char * const debug_cconfig = "verbose=[lsm]"; static const char * const debug_tconfig = ""; -static uint64_t g_ckpt_ops; /* checkpoint operations */ -static uint64_t g_insert_ops; /* insert operations */ -static uint64_t g_read_ops; /* read operations */ -static uint64_t g_update_ops; /* update operations */ - -static uint64_t g_insert_key; /* insert key */ - -static volatile int g_ckpt; /* checkpoint in progress */ -static volatile int g_error; /* thread error */ -static volatile int g_stop; /* notify threads to stop */ - /* * Atomic update where needed. */ @@ -103,15 +71,20 @@ static volatile int g_stop; /* notify threads to stop */ #endif static void *checkpoint_worker(void *); +static int create_tables(CONFIG *); +static int create_uris(CONFIG *); static int execute_populate(CONFIG *); static int execute_workload(CONFIG *); static int find_table_count(CONFIG *); static void *monitor(void *); static void *populate_thread(void *); -static void randomize_value(CONFIG *, char *); +static void randomize_value(CONFIG *, char *); +static int start_all_runs(CONFIG *); +static int start_run(CONFIG *); static int start_threads(CONFIG *, WORKLOAD *, CONFIG_THREAD *, u_int, void *(*)(void *)); static int stop_threads(CONFIG *, u_int, CONFIG_THREAD *); +static void *thread_run_wtperf(void *); static void *worker(void *); static uint64_t wtperf_rand(CONFIG *); static uint64_t wtperf_value_range(CONFIG *); @@ -130,9 +103,9 @@ extern uint32_t __wt_random(void); /* Retrieve an ID for the next insert operation. 
*/ static inline uint64_t -get_next_incr(void) +get_next_incr(CONFIG *cfg) { - return (ATOMIC_ADD(g_insert_key, 1)); + return (ATOMIC_ADD(cfg->insert_key, 1)); } static void @@ -219,16 +192,18 @@ worker(void *arg) CONFIG_THREAD *thread; TRACK *trk; WT_CONNECTION *conn; - WT_CURSOR *cursor; + WT_CURSOR **cursors, *cursor; WT_SESSION *session; + size_t i; uint64_t next_val, usecs; - int measure_latency, ret; uint8_t *op, *op_end; + int measure_latency, ret; char *value_buf, *key_buf, *value; thread = (CONFIG_THREAD *)arg; cfg = thread->cfg; conn = cfg->conn; + cursors = NULL; session = NULL; trk = NULL; @@ -237,12 +212,22 @@ worker(void *arg) lprintf(cfg, ret, 0, "worker: WT_CONNECTION.open_session"); goto err; } - if ((ret = session->open_cursor( - session, cfg->uri, NULL, NULL, &cursor)) != 0) { - lprintf(cfg, - ret, 0, "worker: WT_SESSION.open_cursor: %s", cfg->uri); + cursors = (WT_CURSOR **)calloc( + cfg->table_count, sizeof(WT_CURSOR *)); + if (cursors == NULL) { + lprintf(cfg, ENOMEM, 0, + "worker: couldn't allocate cursor array"); goto err; } + for (i = 0; i < cfg->table_count; i++) { + if ((ret = session->open_cursor(session, + cfg->uris[i], NULL, NULL, &cursors[i])) != 0) { + lprintf(cfg, ret, 0, + "worker: WT_SESSION.open_cursor: %s", + cfg->uris[i]); + goto err; + } + } key_buf = thread->key_buf; value_buf = thread->value_buf; @@ -250,7 +235,7 @@ worker(void *arg) op = thread->workload->ops; op_end = op + sizeof(thread->workload->ops); - while (!g_stop) { + while (!cfg->stop) { /* * Generate the next key and setup operation specific * statistics tracking objects. 
@@ -262,7 +247,7 @@ worker(void *arg) if (cfg->random_range) next_val = wtperf_rand(cfg); else - next_val = cfg->icount + get_next_incr(); + next_val = cfg->icount + get_next_incr(cfg); break; case WORKER_READ: trk = &thread->read; @@ -285,7 +270,18 @@ worker(void *arg) } sprintf(key_buf, "%0*" PRIu64, cfg->key_sz, next_val); - measure_latency = cfg->sample_interval != 0 && ( + + /* + * Spread the data out around the multiple databases. + */ + cursor = cursors[next_val % cfg->table_count]; + + /* + * Skip the first time we do an operation, when trk->ops + * is 0, to avoid first time latency spikes. + */ + measure_latency = + cfg->sample_interval != 0 && trk->ops != 0 && ( trk->ops % cfg->sample_rate == 0); if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) { @@ -385,8 +381,10 @@ op_err: lprintf(cfg, ret, 0, /* Notify our caller we failed and shut the system down. */ if (0) { -err: g_error = g_stop = 1; +err: cfg->error = cfg->stop = 1; } + if (cursors != NULL) + free(cursors); return (NULL); } @@ -502,17 +500,20 @@ populate_thread(void *arg) CONFIG_THREAD *thread; TRACK *trk; WT_CONNECTION *conn; - WT_CURSOR *cursor; + WT_CURSOR **cursors, *cursor; WT_SESSION *session; - uint32_t opcount; + size_t i; uint64_t op, usecs; + uint32_t opcount; int intxn, measure_latency, ret; char *value_buf, *key_buf; + const char *cursor_config; thread = (CONFIG_THREAD *)arg; cfg = thread->cfg; conn = cfg->conn; session = NULL; + cursors = NULL; ret = 0; trk = &thread->insert; @@ -525,17 +526,30 @@ populate_thread(void *arg) goto err; } - /* Do a bulk load if populate is single-threaded. */ - if ((ret = session->open_cursor(session, cfg->uri, NULL, - cfg->populate_threads == 1 ? "bulk" : NULL, &cursor)) != 0) { - lprintf(cfg, - ret, 0, "populate: WT_SESSION.open_cursor: %s", cfg->uri); + /* Do bulk loads if populate is single-threaded. */ + cursor_config = cfg->populate_threads == 1 ? "bulk" : NULL; + /* Create the cursors. 
*/ + cursors = (WT_CURSOR **)calloc( + cfg->table_count, sizeof(WT_CURSOR *)); + if (cursors == NULL) { + lprintf(cfg, ENOMEM, 0, + "worker: couldn't allocate cursor array"); goto err; } + for (i = 0; i < cfg->table_count; i++) { + if ((ret = session->open_cursor( + session, cfg->uris[i], NULL, + cursor_config, &cursors[i])) != 0) { + lprintf(cfg, ret, 0, + "populate: WT_SESSION.open_cursor: %s", + cfg->uris[i]); + goto err; + } + } - /* Populate the database. */ + /* Populate the databases. */ for (intxn = 0, opcount = 0;;) { - op = get_next_incr(); + op = get_next_incr(cfg); if (op > cfg->icount) break; @@ -548,8 +562,13 @@ populate_thread(void *arg) } intxn = 1; } + /* + * Figure out which table this op belongs to. + */ + cursor = cursors[op % cfg->table_count]; sprintf(key_buf, "%0*" PRIu64, cfg->key_sz, op); - measure_latency = cfg->sample_interval != 0 && ( + measure_latency = + cfg->sample_interval != 0 && trk->ops != 0 && ( trk->ops % cfg->sample_rate == 0); if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) { @@ -564,7 +583,12 @@ populate_thread(void *arg) lprintf(cfg, ret, 0, "Failed inserting"); goto err; } - /* Gather statistics */ + /* + * Gather statistics. + * We measure the latency of inserting a single key. If there + * are multiple tables, it is the time for insertion into all + * of them. + */ if (measure_latency) { if ((ret = __wt_epoch(NULL, &stop)) != 0) { lprintf(cfg, ret, 0, @@ -601,8 +625,10 @@ populate_thread(void *arg) /* Notify our caller we failed and shut the system down. 
*/ if (0) { -err: g_error = g_stop = 1; +err: cfg->error = cfg->stop = 1; } + if (cursors != NULL) + free(cursors); return (NULL); } @@ -614,16 +640,16 @@ monitor(void *arg) struct tm *tm, _tm; CONFIG *cfg; FILE *fp; - char buf[64], *path; - int ret; + size_t len; uint64_t reads, inserts, updates; uint64_t cur_reads, cur_inserts, cur_updates; uint64_t last_reads, last_inserts, last_updates; uint32_t read_avg, read_min, read_max; uint32_t insert_avg, insert_min, insert_max; uint32_t update_avg, update_min, update_max; - size_t len; u_int i; + int ret; + char buf[64], *path; cfg = (CONFIG *)arg; assert(cfg->sample_interval != 0); @@ -643,28 +669,32 @@ monitor(void *arg) } /* Set line buffering for monitor file. */ (void)setvbuf(fp, NULL, _IOLBF, 0); -#ifdef __WRITE_A_HEADER fprintf(fp, "#time," - "read operations,insert operations,update operations," + "totalsec," + "read ops per second," + "insert ops per second," + "update ops per second," "checkpoints," - "read average latency(NS),read minimum latency(NS)," - "read maximum latency(NS)," - "insert average latency(NS),insert min latency(NS)," - "insert maximum latency(NS)," - "update average latency(NS),update min latency(NS)," - "update maximum latency(NS)" + "read average latency(uS)," + "read minimum latency(uS)," + "read maximum latency(uS)," + "insert average latency(uS)," + "insert min latency(uS)," + "insert maximum latency(uS)," + "update average latency(uS)," + "update min latency(uS)," + "update maximum latency(uS)" "\n"); -#endif last_reads = last_inserts = last_updates = 0; - while (!g_stop) { + while (!cfg->stop) { for (i = 0; i < cfg->sample_interval; i++) { sleep(1); - if (g_stop) + if (cfg->stop) break; } /* If the workers are done, don't bother with a final call. 
*/ - if (g_stop) + if (cfg->stop) break; if ((ret = __wt_epoch(NULL, &t)) != 0) { @@ -694,18 +724,18 @@ monitor(void *arg) cur_inserts = inserts - last_inserts; (void)fprintf(fp, - "%s" + "%s,%" PRIu32 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%c" ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 "\n", - buf, + buf, cfg->totalsec, cur_reads / cfg->sample_interval, cur_inserts / cfg->sample_interval, cur_updates / cfg->sample_interval, - g_ckpt ? 'Y' : 'N', + cfg->ckpt ? 'Y' : 'N', read_avg, read_min, read_max, insert_avg, insert_min, insert_max, update_avg, update_min, update_max); @@ -717,7 +747,7 @@ monitor(void *arg) /* Notify our caller we failed and shut the system down. */ if (0) { -err: g_error = g_stop = 1; +err: cfg->error = cfg->stop = 1; } if (fp != NULL) @@ -750,27 +780,27 @@ checkpoint_worker(void *arg) goto err; } - while (!g_stop) { + while (!cfg->stop) { /* Break the sleep up, so we notice interrupts faster. */ for (i = 0; i < cfg->checkpoint_interval; i++) { sleep(1); - if (g_stop) + if (cfg->stop) break; } /* If the workers are done, don't bother with a final call. */ - if (g_stop) + if (cfg->stop) break; if ((ret = __wt_epoch(NULL, &s)) != 0) { lprintf(cfg, ret, 0, "Get time failed in checkpoint."); goto err; } - g_ckpt = 1; + cfg->ckpt = 1; if ((ret = session->checkpoint(session, NULL)) != 0) { lprintf(cfg, ret, 0, "Checkpoint failed."); goto err; } - g_ckpt = 0; + cfg->ckpt = 0; ++thread->ckpt.ops; if ((ret = __wt_epoch(NULL, &e)) != 0) { @@ -788,7 +818,7 @@ checkpoint_worker(void *arg) /* Notify our caller we failed and shut the system down. 
*/ if (0) { -err: g_error = g_stop = 1; +err: cfg->error = cfg->stop = 1; } return (NULL); @@ -797,13 +827,14 @@ err: g_error = g_stop = 1; static int execute_populate(CONFIG *cfg) { + struct timespec start, stop; CONFIG_THREAD *popth; WT_SESSION *session; - struct timespec start, stop; double secs; + size_t i; uint64_t last_ops; - uint32_t interval, total; - int elapsed, ret; + uint32_t interval; + int elapsed, ret, t_ret; session = NULL; lprintf(cfg, 0, 1, @@ -816,14 +847,14 @@ execute_populate(CONFIG *cfg) cfg->popthreads, cfg->populate_threads, populate_thread)) != 0) return (ret); - g_insert_key = 0; + cfg->insert_key = 0; if ((ret = __wt_epoch(NULL, &start)) != 0) { lprintf(cfg, ret, 0, "Get time failed in populate."); return (ret); } - for (elapsed = 0, interval = 0, last_ops = 0, total = 0; - g_insert_key < cfg->icount && g_error == 0;) { + for (elapsed = 0, interval = 0, last_ops = 0; + cfg->insert_key < cfg->icount && cfg->error == 0;) { /* * Sleep for 100th of a second, report_interval is in second * granularity, each 100th increment of elapsed is a single @@ -836,14 +867,14 @@ execute_populate(CONFIG *cfg) if (++interval < cfg->report_interval) continue; interval = 0; - total += cfg->report_interval; - g_insert_ops = sum_pop_ops(cfg); + cfg->totalsec += cfg->report_interval; + cfg->insert_ops = sum_pop_ops(cfg); lprintf(cfg, 0, 1, "%" PRIu64 " populate inserts (%" PRIu64 " of %" PRIu32 ") in %" PRIu32 " secs (%" PRIu32 " total secs)", - g_insert_ops - last_ops, g_insert_ops, - cfg->icount, cfg->report_interval, total); - last_ops = g_insert_ops; + cfg->insert_ops - last_ops, cfg->insert_ops, + cfg->icount, cfg->report_interval, cfg->totalsec); + last_ops = cfg->insert_ops; } if ((ret = __wt_epoch(NULL, &stop)) != 0) { lprintf(cfg, ret, 0, "Get time failed in populate."); @@ -864,7 +895,7 @@ execute_populate(CONFIG *cfg) return (ret); /* Report if any worker threads didn't finish. 
*/ - if (g_error != 0) { + if (cfg->error != 0) { lprintf(cfg, WT_ERROR, 0, "Populate thread(s) exited without finishing."); return (WT_ERROR); @@ -879,7 +910,9 @@ execute_populate(CONFIG *cfg) "Load time: %.2f\n" "load ops/sec: %.2f", secs, cfg->icount / secs); /* - * If configured, compact to allow LSM merging to complete. + * If configured, compact to allow LSM merging to complete. We + * set an unlimited timeout because if we close the connection + * then any in-progress compact/merge is aborted. */ if (cfg->compact) { if ((ret = cfg->conn->open_session( @@ -889,16 +922,34 @@ execute_populate(CONFIG *cfg) return (ret); } lprintf(cfg, 0, 1, "Compact after populate"); - if ((ret = session->compact(session, cfg->uri, NULL)) != 0) { - lprintf(cfg, ret, 0, - "execute_populate: WT_SESSION.compact"); - return (ret); + if ((ret = __wt_epoch(NULL, &start)) != 0) { + lprintf(cfg, ret, 0, "Get time failed in populate."); + goto err; } - if ((ret = session->close(session, NULL)) != 0) { - lprintf(cfg, ret, 0, + /* + * We measure how long it takes to compact all tables for this + * workload. 
+ */ + for (i = 0; i < cfg->table_count; i++) + if ((ret = session->compact( + session, cfg->uris[i], "timeout=0")) != 0) { + lprintf(cfg, ret, 0, + "execute_populate: WT_SESSION.compact"); + goto err; + } + if ((ret = __wt_epoch(NULL, &stop)) != 0) { + lprintf(cfg, ret, 0, "Get time failed in populate."); + goto err; + } + secs = stop.tv_sec + stop.tv_nsec / (double)BILLION; + secs -= start.tv_sec + start.tv_nsec / (double)BILLION; + lprintf(cfg, 0, 1, "Compact completed in %.2f seconds", secs); +err: + if ((t_ret = session->close(session, NULL)) != 0) + lprintf(cfg, t_ret, 0, "execute_populate: WT_SESSION.close"); + if (ret != 0 || (ret = t_ret) != 0) return (ret); - } } /* @@ -926,12 +977,12 @@ execute_workload(CONFIG *cfg) CONFIG_THREAD *threads; WORKLOAD *workp; uint64_t last_ckpts, last_inserts, last_reads, last_updates; - uint32_t interval, run_ops, run_time, total; + uint32_t interval, run_ops, run_time; u_int i; int ret, t_ret; - g_insert_key = 0; - g_insert_ops = g_read_ops = g_update_ops = 0; + cfg->insert_key = 0; + cfg->insert_ops = cfg->read_ops = cfg->update_ops = 0; last_ckpts = last_inserts = last_reads = last_updates = 0; ret = 0; @@ -963,8 +1014,8 @@ execute_workload(CONFIG *cfg) threads += workp->threads; } - for (interval = cfg->report_interval, total = 0, - run_time = cfg->run_time, run_ops = cfg->run_ops; g_error == 0;) { + for (interval = cfg->report_interval, run_time = cfg->run_time, + run_ops = cfg->run_ops; cfg->error == 0;) { /* * Sleep for one second at a time. * If we are tracking run time, check to see if we're done, and @@ -979,46 +1030,46 @@ execute_workload(CONFIG *cfg) } /* Sum the operations we've done. 
*/ - g_ckpt_ops = sum_ckpt_ops(cfg); - g_insert_ops = sum_insert_ops(cfg); - g_read_ops = sum_read_ops(cfg); - g_update_ops = sum_update_ops(cfg); + cfg->ckpt_ops = sum_ckpt_ops(cfg); + cfg->insert_ops = sum_insert_ops(cfg); + cfg->read_ops = sum_read_ops(cfg); + cfg->update_ops = sum_update_ops(cfg); /* If we're checking total operations, see if we're done. */ - if (run_ops != 0 && - run_ops <= g_insert_ops + g_read_ops + g_update_ops) + if (run_ops != 0 && run_ops <= + cfg->insert_ops + cfg->read_ops + cfg->update_ops) break; /* If writing out throughput information, see if it's time. */ if (interval == 0 || --interval > 0) continue; interval = cfg->report_interval; - total += cfg->report_interval; + cfg->totalsec += cfg->report_interval; lprintf(cfg, 0, 1, "%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64 " updates, %" PRIu64 " checkpoints in %" PRIu32 " secs (%" PRIu32 " total secs)", - g_read_ops - last_reads, - g_insert_ops - last_inserts, - g_update_ops - last_updates, - g_ckpt_ops - last_ckpts, - cfg->report_interval, total); - last_reads = g_read_ops; - last_inserts = g_insert_ops; - last_updates = g_update_ops; - last_ckpts = g_ckpt_ops; + cfg->read_ops - last_reads, + cfg->insert_ops - last_inserts, + cfg->update_ops - last_updates, + cfg->ckpt_ops - last_ckpts, + cfg->report_interval, cfg->totalsec); + last_reads = cfg->read_ops; + last_inserts = cfg->insert_ops; + last_updates = cfg->update_ops; + last_ckpts = cfg->ckpt_ops; } /* Notify the worker threads they are done. */ -err: g_stop = 1; +err: cfg->stop = 1; if ((t_ret = stop_threads( cfg, (u_int)cfg->workers_cnt, cfg->workers)) != 0 && ret == 0) ret = t_ret; /* Report if any worker threads didn't finish. 
*/ - if (g_error != 0) { + if (cfg->error != 0) { lprintf(cfg, WT_ERROR, 0, "Worker thread(s) exited without finishing."); if (ret == 0) @@ -1037,235 +1088,249 @@ find_table_count(CONFIG *cfg) WT_CONNECTION *conn; WT_CURSOR *cursor; WT_SESSION *session; - char *key; + uint32_t i, max_icount, table_icount; int ret, t_ret; + char *key; conn = cfg->conn; + max_icount = 0; if ((ret = conn->open_session( conn, NULL, cfg->sess_config, &session)) != 0) { lprintf(cfg, ret, 0, - "open_session failed finding existing table count"); - goto err; - } - if ((ret = session->open_cursor(session, cfg->uri, - NULL, NULL, &cursor)) != 0) { - lprintf(cfg, ret, 0, - "open_cursor failed finding existing table count"); - goto err; - } - if ((ret = cursor->prev(cursor)) != 0) { - lprintf(cfg, ret, 0, - "cursor prev failed finding existing table count"); - goto err; - } - if ((ret = cursor->get_key(cursor, &key)) != 0) { - lprintf(cfg, ret, 0, - "cursor get_key failed finding existing table count"); - goto err; + "find_table_count: open_session failed"); + goto out; } - cfg->icount = (uint32_t)atoi(key); + for (i = 0; i < cfg->table_count; i++) { + if ((ret = session->open_cursor(session, cfg->uris[i], + NULL, NULL, &cursor)) != 0) { + lprintf(cfg, ret, 0, + "find_table_count: open_cursor failed"); + goto err; + } + if ((ret = cursor->prev(cursor)) != 0) { + lprintf(cfg, ret, 0, + "find_table_count: cursor prev failed"); + goto err; + } + if ((ret = cursor->get_key(cursor, &key)) != 0) { + lprintf(cfg, ret, 0, + "find_table_count: cursor get_key failed"); + goto err; + } + table_icount = (uint32_t)atoi(key); + if (table_icount > max_icount) + max_icount = table_icount; + if ((ret = cursor->close(cursor)) != 0) { + lprintf(cfg, ret, 0, + "find_table_count: cursor close failed"); + goto err; + } + } err: if ((t_ret = session->close(session, NULL)) != 0) { if (ret == 0) ret = t_ret; lprintf(cfg, ret, 0, - "session close failed finding existing table count"); + "find_table_count: session close 
failed"); } - return (ret); + cfg->icount = max_icount; +out: return (ret); } -int -main(int argc, char *argv[]) +/* + * Populate the uri array if more than one table is being used. + */ +static int +create_uris(CONFIG *cfg) { - CONFIG *cfg, _cfg; - WT_SESSION *session; - pthread_t monitor_thread; - size_t len; - uint64_t req_len, total_ops; - int ch, monitor_created, monitor_set, ret, t_ret; - const char *helium_mount; - const char *opts = "C:H:h:LMm:O:o:ST:"; - const char *wtperftmp_subdir = "wtperftmp"; - const char *user_cconfig, *user_tconfig; - char *cmd, *cc_buf, *tc_buf, *tmphome; - - session = NULL; - monitor_created = monitor_set = ret = 0; - helium_mount = user_cconfig = user_tconfig = NULL; - cmd = cc_buf = tc_buf = tmphome = NULL; + size_t base_uri_len; + uint32_t i; + int ret; + char *uri; - /* Setup the default configuration values. */ - cfg = &_cfg; - memset(cfg, 0, sizeof(*cfg)); - if (config_assign(cfg, &default_cfg)) + ret = 0; + base_uri_len = strlen(cfg->base_uri); + cfg->uris = (char **)calloc(cfg->table_count, sizeof(char *)); + if (cfg->uris == NULL) { + ret = ENOMEM; goto err; - - /* Do a basic validation of options, and home is needed before open. */ - while ((ch = getopt(argc, argv, opts)) != EOF) - switch (ch) { - case 'h': - cfg->home = optarg; - break; - case 'm': - cfg->monitor_dir = optarg; - monitor_set = 1; - break; - case '?': - fprintf(stderr, "Invalid option\n"); - usage(); - goto einval; + } + for (i = 0; i < cfg->table_count; i++) { + uri = cfg->uris[i] = (char *)calloc(base_uri_len + 3, 1); + if (uri == NULL) { + ret = ENOMEM; + goto err; } + memcpy(uri, cfg->base_uri, base_uri_len); + /* + * If there is only one table, just use base name. 
+ */ + if (cfg->table_count > 1) { + uri[base_uri_len] = uri[base_uri_len + 1] = '0'; + uri[base_uri_len] = '0' + (i / 10); + uri[base_uri_len + 1] = '0' + (i % 10); + } + } +err: if (ret != 0 && cfg->uris != NULL) { + for (i = 0; i < cfg->table_count; i++) + free(cfg->uris[i]); + free(cfg->uris); + cfg->uris = NULL; + } + return (ret); +} - /* - * If the user did not specify a monitor directory - * then set the monitor directory to the home dir. - */ - if (!monitor_set) - cfg->monitor_dir = cfg->home; +static int +create_tables(CONFIG *cfg) +{ + WT_SESSION *session; + size_t i; + int ret; + char *uri; - /* - * Create a temporary directory underneath the test directory in which - * we do an initial WiredTiger open, because we need a connection in - * order to use the extension configuration parser. We will open the - * real WiredTiger database after parsing the options. - */ - len = strlen(cfg->home) + strlen(wtperftmp_subdir) + 2; - if ((tmphome = malloc(len)) == NULL) { - ret = enomem(cfg); - goto err; - } - snprintf(tmphome, len, "%s/%s", cfg->home, wtperftmp_subdir); - len = len * 2 + 100; - if ((cmd = malloc(len)) == NULL) { - ret = enomem(cfg); - goto err; + session = NULL; + if (cfg->create == 0) + return (0); + + uri = cfg->base_uri; + if ((ret = cfg->conn->open_session( + cfg->conn, NULL, cfg->sess_config, &session)) != 0) { + lprintf(cfg, ret, 0, + "Error opening a session on %s", cfg->home); + return (ret); } - snprintf(cmd, len, "rm -rf %s && mkdir %s", tmphome, tmphome); - if (system(cmd) != 0) { - fprintf(stderr, "%s: failed\n", cmd); - goto einval; + for (i = 0; i < cfg->table_count; i++) { + uri = cfg->uris[i]; + if ((ret = session->create( + session, uri, cfg->table_config)) != 0) { + lprintf(cfg, ret, 0, + "Error creating table %s", cfg->uris[i]); + return (ret); + } } - if ((ret = wiredtiger_open( - tmphome, NULL, "create", &cfg->conn)) != 0) { - lprintf(cfg, ret, 0, "wiredtiger_open: %s", tmphome); - goto err; + if ((ret = 
session->close(session, NULL)) != 0) { + lprintf(cfg, + ret, 0, "Error closing session"); + return (ret); } + return (ret); +} - /* - * Then parse different config structures - other options override - * fields within the structure. - */ - optind = 1; - while ((ch = getopt(argc, argv, opts)) != EOF) - switch (ch) { - case 'S': - if (config_opt_line(cfg, small_config_str) != 0) - goto einval; - break; - case 'M': - if (config_opt_line(cfg, med_config_str) != 0) - goto einval; - break; - case 'L': - if (config_opt_line(cfg, large_config_str) != 0) - goto einval; - break; - case 'O': - if (config_opt_file(cfg, optarg) != 0) - goto einval; - break; - default: - /* Validation done previously. */ - break; - } +static int +start_all_runs(CONFIG *cfg) +{ + CONFIG *next_cfg, **configs; + pthread_t *threads; + size_t cmd_len, home_len, i; + int ret, t_ret; + char *cmd_buf, *new_home; - /* Parse other options */ - optind = 1; - while ((ch = getopt(argc, argv, opts)) != EOF) - switch (ch) { - case 'C': - user_cconfig = optarg; - break; - case 'H': - helium_mount = optarg; - break; - case 'o': - /* Allow -o key=value */ - if (config_opt_line(cfg, optarg) != 0) - goto einval; - break; - case 'T': - user_tconfig = optarg; - break; - } + ret = 0; + configs = NULL; + cmd_buf = NULL; - /* Build the URI from the table name. */ - req_len = strlen("table:") + - strlen(HELIUM_NAME) + strlen(cfg->table_name) + 2; - if ((cfg->uri = calloc(req_len, 1)) == NULL) { - ret = enomem(cfg); + if (cfg->database_count == 1) + return (start_run(cfg)); + + /* Allocate an array to hold our config struct copies. */ + configs = calloc(cfg->database_count, sizeof(CONFIG *)); + if (configs == NULL) + return (ENOMEM); + + /* Allocate an array to hold our thread IDs. */ + threads = calloc(cfg->database_count, sizeof(pthread_t)); + if (threads == NULL) { + ret = ENOMEM; goto err; } - snprintf(cfg->uri, req_len, "table:%s%s%s", - helium_mount == NULL ? "" : HELIUM_NAME, - helium_mount == NULL ? 
"" : "/", - cfg->table_name); - - if ((ret = setup_log_file(cfg)) != 0) - goto err; - /* Make stdout line buffered, so verbose output appears quickly. */ - (void)setvbuf(stdout, NULL, _IOLBF, 0); + home_len = strlen(cfg->home); + cmd_len = (home_len * 2) + 30; /* Add some slop. */ + cmd_buf = calloc(cmd_len, 1); + if (cmd_buf == NULL) { + ret = ENOMEM; + goto err; + } + for (i = 0; i < cfg->database_count; i++) { + next_cfg = calloc(1, sizeof(CONFIG)); + if ((ret = config_assign(next_cfg, cfg)) != 0) + goto err; - /* Concatenate non-default configuration strings. */ - if (cfg->verbose > 1 || user_cconfig != NULL) { - req_len = strlen(cfg->conn_config) + strlen(debug_cconfig) + 3; - if (user_cconfig != NULL) - req_len += strlen(user_cconfig); - if ((cc_buf = calloc(req_len, 1)) == NULL) { - ret = enomem(cfg); + /* Setup a unique home directory for each database. */ + configs[i] = next_cfg; + new_home = malloc(home_len + 5); + if (new_home == NULL) { + ret = ENOMEM; goto err; } - snprintf(cc_buf, req_len, "%s%s%s%s%s", - cfg->conn_config, - cfg->verbose > 1 ? "," : "", - cfg->verbose > 1 ? debug_cconfig : "", - user_cconfig ? "," : "", user_cconfig ? user_cconfig : ""); - if ((ret = config_opt_str(cfg, "conn_config", cc_buf)) != 0) - goto err; - } - if (cfg->verbose > 1 || helium_mount != NULL || user_tconfig != NULL) { - req_len = strlen(cfg->table_config) + - strlen(HELIUM_CONFIG) + strlen(debug_tconfig) + 3; - if (user_tconfig != NULL) - req_len += strlen(user_tconfig); - if ((tc_buf = calloc(req_len, 1)) == NULL) { - ret = enomem(cfg); + sprintf(new_home, "%s/D%02d", cfg->home, (int)i); + next_cfg->home = (const char *)new_home; + + /* If the monitor dir is default, update it too. */ + if (strcmp(cfg->monitor_dir, cfg->home) == 0) + next_cfg->monitor_dir = new_home; + + /* Create clean home directories. 
*/ + snprintf(cmd_buf, cmd_len, "rm -rf %s && mkdir %s", + next_cfg->home, next_cfg->home); + if ((ret = system(cmd_buf)) != 0) { + fprintf(stderr, "%s: failed\n", cmd_buf); goto err; } - snprintf(tc_buf, req_len, "%s%s%s%s%s%s", - cfg->table_config, - cfg->verbose > 1 ? "," : "", - cfg->verbose > 1 ? debug_tconfig : "", - user_tconfig ? "," : "", user_tconfig ? user_tconfig : "", - helium_mount == NULL ? "" : HELIUM_CONFIG); - if ((ret = config_opt_str(cfg, "table_config", tc_buf)) != 0) + if ((ret = pthread_create( + &threads[i], NULL, thread_run_wtperf, next_cfg)) != 0) { + lprintf(cfg, ret, 0, "Error creating thread"); goto err; + } } - ret = cfg->conn->close(cfg->conn, NULL); - cfg->conn = NULL; - if (ret != 0) { - lprintf(cfg, ret, 0, "WT_CONNECTION.close: %s", tmphome); - goto err; + /* Wait for threads to finish. */ + for (i = 0; i < cfg->database_count; i++) { + if ((t_ret = pthread_join(threads[i], NULL)) != 0) { + lprintf(cfg, ret, 0, "Error joining thread"); + if (ret == 0) + ret = t_ret; + } } - /* Sanity-check the configuration */ - if (config_sanity(cfg) != 0) - goto err; - if (cfg->verbose > 1) /* Display the configuration. */ - config_print(cfg); +err: for (i = 0; i < cfg->database_count && configs[i] != NULL; i++) { + free((char *)configs[i]->home); + config_free(configs[i]); + free(configs[i]); + } + free(configs); + free(threads); + free(cmd_buf); + + return (ret); +} + +/* Run an instance of wtperf for a given configuration. */ +static void * +thread_run_wtperf(void *arg) +{ + CONFIG *cfg; + int ret; + + cfg = (CONFIG *)arg; + if ((ret = start_run(cfg)) != 0) + lprintf(cfg, ret, 0, "Run failed for: %s.", cfg->home); + return (NULL); +} + +static int +start_run(CONFIG *cfg) +{ + pthread_t monitor_thread; + uint64_t total_ops; + int monitor_created, ret, t_ret; + char helium_buf[256]; + + monitor_created = ret = 0; + + if ((ret = setup_log_file(cfg)) != 0) + goto err; if ((ret = wiredtiger_open( /* Open the real connection. 
*/ cfg->home, NULL, cfg->conn_config, &cfg->conn)) != 0) { @@ -1273,40 +1338,25 @@ main(int argc, char *argv[]) goto err; } - if (helium_mount != NULL) { /* Configure optional Helium volume. */ - char helium_buf[256]; + /* Configure optional Helium volume. */ + if (cfg->helium_mount != NULL) { snprintf(helium_buf, sizeof(helium_buf), "entry=wiredtiger_extension_init,config=[" "%s=[helium_devices=\"he://./%s\"," "helium_o_volume_truncate=1]]", - HELIUM_NAME, helium_mount); + HELIUM_NAME, cfg->helium_mount); if ((ret = cfg->conn->load_extension( cfg->conn, HELIUM_PATH, helium_buf)) != 0) lprintf(cfg, ret, 0, "Error loading Helium: %s", helium_buf); } - if (cfg->create != 0) { /* If creating, create the table. */ - if ((ret = cfg->conn->open_session( - cfg->conn, NULL, cfg->sess_config, &session)) != 0) { - lprintf(cfg, ret, 0, - "Error opening a session on %s", cfg->home); - goto err; - } - if ((ret = session->create( - session, cfg->uri, cfg->table_config)) != 0) { - lprintf(cfg, - ret, 0, "Error creating table %s", cfg->uri); - goto err; - } - if ((ret = session->close(session, NULL)) != 0) { - lprintf(cfg, - ret, 0, "Error closing session"); - goto err; - } - session = NULL; - } - /* Start the monitor thread. */ + if ((ret = create_uris(cfg)) != 0) + goto err; + if ((ret = create_tables(cfg)) != 0) + goto err; + + /* Start the monitor thread. */ if (cfg->sample_interval != 0) { if ((ret = pthread_create( &monitor_thread, NULL, monitor, cfg)) != 0) { @@ -1316,15 +1366,17 @@ main(int argc, char *argv[]) } monitor_created = 1; } - /* If creating, populate the table. */ + + /* If creating, populate the table. */ if (cfg->create != 0 && execute_populate(cfg) != 0) goto err; - /* Optional workload. */ + + /* Optional workload. */ if (cfg->run_time != 0 || cfg->run_ops != 0) { - /* Didn't create, set insert count. */ + /* Didn't create, set insert count. */ if (cfg->create == 0 && find_table_count(cfg) != 0) goto err; - /* Start the checkpoint thread. 
*/ + /* Start the checkpoint thread. */ if (cfg->checkpoint_threads != 0) { lprintf(cfg, 0, 1, "Starting %" PRIu32 " checkpoint thread(s)", @@ -1339,47 +1391,46 @@ main(int argc, char *argv[]) cfg->checkpoint_threads, checkpoint_worker) != 0) goto err; } - /* Execute the workload. */ + /* Execute the workload. */ if ((ret = execute_workload(cfg)) != 0) goto err; /* One final summation of the operations we've completed. */ - g_read_ops = sum_read_ops(cfg); - g_insert_ops = sum_insert_ops(cfg); - g_update_ops = sum_update_ops(cfg); - g_ckpt_ops = sum_ckpt_ops(cfg); - total_ops = g_read_ops + g_insert_ops + g_update_ops; + cfg->read_ops = sum_read_ops(cfg); + cfg->insert_ops = sum_insert_ops(cfg); + cfg->update_ops = sum_update_ops(cfg); + cfg->ckpt_ops = sum_ckpt_ops(cfg); + total_ops = cfg->read_ops + cfg->insert_ops + cfg->update_ops; lprintf(cfg, 0, 1, "Executed %" PRIu64 " read operations (%" PRIu64 "%%) %" PRIu64 " ops/sec", - g_read_ops, (g_read_ops * 100) / total_ops, - g_read_ops / cfg->run_time); + cfg->read_ops, (cfg->read_ops * 100) / total_ops, + cfg->read_ops / cfg->run_time); lprintf(cfg, 0, 1, "Executed %" PRIu64 " insert operations (%" PRIu64 "%%) %" PRIu64 " ops/sec", - g_insert_ops, (g_insert_ops * 100) / total_ops, - g_insert_ops / cfg->run_time); + cfg->insert_ops, (cfg->insert_ops * 100) / total_ops, + cfg->insert_ops / cfg->run_time); lprintf(cfg, 0, 1, "Executed %" PRIu64 " update operations (%" PRIu64 "%%) %" PRIu64 " ops/sec", - g_update_ops, (g_update_ops * 100) / total_ops, - g_update_ops / cfg->run_time); + cfg->update_ops, (cfg->update_ops * 100) / total_ops, + cfg->update_ops / cfg->run_time); lprintf(cfg, 0, 1, "Executed %" PRIu64 " checkpoint operations", - g_ckpt_ops); + cfg->ckpt_ops); latency_print(cfg); } if (0) { -einval: ret = EINVAL; err: if (ret == 0) ret = EXIT_FAILURE; } /* Notify the worker threads they are done. 
*/ - g_stop = 1; + cfg->stop = 1; if ((t_ret = stop_threads(cfg, 1, cfg->ckptthreads)) != 0) if (ret == 0) @@ -1415,12 +1466,165 @@ err: if (ret == 0) if ((t_ret = fclose(cfg->logf)) != 0 && ret == 0) ret = t_ret; } - config_free(cfg); + return (ret); +} +int +main(int argc, char *argv[]) +{ + CONFIG *cfg, _cfg; + size_t req_len; + int ch, monitor_set, ret; + char *cc_buf, *tc_buf; + const char *opts = "C:H:h:m:O:o:T:"; + const char *config_opts, *user_cconfig, *user_tconfig; + + monitor_set = ret = 0; + cc_buf = tc_buf = NULL; + config_opts = user_cconfig = user_tconfig = NULL; + + /* Setup the default configuration values. */ + cfg = &_cfg; + memset(cfg, 0, sizeof(*cfg)); + if (config_assign(cfg, &default_cfg)) + goto err; + + /* Do a basic validation of options, and home is needed before open. */ + while ((ch = getopt(argc, argv, opts)) != EOF) + switch (ch) { + case 'C': + user_cconfig = optarg; + break; + case 'H': + cfg->helium_mount = optarg; + break; + case 'O': + config_opts = optarg; + break; + case 'T': + user_tconfig = optarg; + break; + case 'h': + cfg->home = optarg; + break; + case 'm': + cfg->monitor_dir = optarg; + monitor_set = 1; + break; + case '?': + fprintf(stderr, "Invalid option\n"); + usage(); + goto einval; + } + + /* + * If the user did not specify a monitor directory then set the + * monitor directory to the home dir. + */ + if (!monitor_set) + cfg->monitor_dir = cfg->home; + + /* Parse configuration settings from configuration file. */ + if (config_opts != NULL && config_opt_file(cfg, config_opts) != 0) + goto einval; + + /* Parse options that override values set via a configuration file. */ + optind = 1; + while ((ch = getopt(argc, argv, opts)) != EOF) + switch (ch) { + case 'o': + /* Allow -o key=value */ + if (config_opt_line(cfg, optarg) != 0) + goto einval; + break; + } + + if ((ret = config_compress(cfg)) != 0) + goto err; + + /* Build the URI from the table name. 
*/ + req_len = strlen("table:") + + strlen(HELIUM_NAME) + strlen(cfg->table_name) + 2; + if ((cfg->base_uri = calloc(req_len, 1)) == NULL) { + ret = enomem(cfg); + goto err; + } + snprintf(cfg->base_uri, req_len, "table:%s%s%s", + cfg->helium_mount == NULL ? "" : HELIUM_NAME, + cfg->helium_mount == NULL ? "" : "/", + cfg->table_name); + + /* Make stdout line buffered, so verbose output appears quickly. */ + (void)setvbuf(stdout, NULL, _IOLBF, 0); + + /* Concatenate non-default configuration strings. */ + if (cfg->verbose > 1 || user_cconfig != NULL || + cfg->compress_ext != NULL) { + req_len = strlen(cfg->conn_config) + strlen(debug_cconfig) + 3; + if (user_cconfig != NULL) + req_len += strlen(user_cconfig); + if (cfg->compress_ext != NULL) + req_len += strlen(cfg->compress_ext); + if ((cc_buf = calloc(req_len, 1)) == NULL) { + ret = enomem(cfg); + goto err; + } + /* + * This is getting hard to parse. + */ + snprintf(cc_buf, req_len, "%s%s%s%s%s%s", + cfg->conn_config, + cfg->compress_ext ? cfg->compress_ext : "", + cfg->verbose > 1 ? ",": "", + cfg->verbose > 1 ? debug_cconfig : "", + user_cconfig ? ",": "", + user_cconfig ? user_cconfig : ""); + if ((ret = config_opt_str(cfg, "conn_config", cc_buf)) != 0) + goto err; + } + if (cfg->verbose > 1 || cfg->helium_mount != NULL || + user_tconfig != NULL || cfg->compress_table != NULL) { + req_len = strlen(cfg->table_config) + strlen(HELIUM_CONFIG) + + strlen(debug_tconfig) + 3; + if (user_tconfig != NULL) + req_len += strlen(user_tconfig); + if (cfg->compress_table != NULL) + req_len += strlen(cfg->compress_table); + if ((tc_buf = calloc(req_len, 1)) == NULL) { + ret = enomem(cfg); + goto err; + } + /* + * This is getting hard to parse. + */ + snprintf(tc_buf, req_len, "%s%s%s%s%s%s%s", + cfg->table_config, + cfg->compress_table ? cfg->compress_table : "", + cfg->verbose > 1 ? ",": "", + cfg->verbose > 1 ? debug_tconfig : "", + user_tconfig ? ",": "", + user_tconfig ? user_tconfig : "", + cfg->helium_mount == NULL ? 
"" : HELIUM_CONFIG); + if ((ret = config_opt_str(cfg, "table_config", tc_buf)) != 0) + goto err; + } + + /* Sanity-check the configuration. */ + if (config_sanity(cfg) != 0) + goto err; + + /* Display the configuration. */ + if (cfg->verbose > 1) + config_print(cfg); + + if ((ret = start_all_runs(cfg)) != 0) + goto err; + + if (0) +einval: ret = EINVAL; +err: config_free(cfg); free(cc_buf); - free(cmd); free(tc_buf); - free(tmphome); return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } @@ -1507,7 +1711,7 @@ wtperf_value_range(CONFIG *cfg) if (cfg->random_range) return (cfg->icount + cfg->random_range); - return (cfg->icount + g_insert_key - (u_int)(cfg->workers_cnt + 1)); + return (cfg->icount + cfg->insert_key - (u_int)(cfg->workers_cnt + 1)); } static uint64_t diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index 54c4334a2ac..c94ec10d8fd 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -48,6 +48,21 @@ typedef struct __config CONFIG; typedef struct __config_thread CONFIG_THREAD; +#define EXT_PFX ",extensions=(" +#define EXT_SFX ")" +#define EXTPATH "../../ext/compressors/" /* Extensions path */ +#define BLKCMP_PFX ",block_compressor=" + +#define BZIP_BLK BLKCMP_PFX "bzip2" +#define BZIP_EXT \ + EXT_PFX EXTPATH "bzip2/.libs/libwiredtiger_bzip2.so" EXT_SFX +#define SNAPPY_BLK BLKCMP_PFX "snappy" +#define SNAPPY_EXT \ + EXT_PFX EXTPATH "snappy/.libs/libwiredtiger_snappy.so" EXT_SFX +#define ZLIB_BLK BLKCMP_PFX "zlib" +#define ZLIB_EXT \ + EXT_PFX EXTPATH "zlib/.libs/libwiredtiger_zlib.so" EXT_SFX + typedef struct { int64_t threads; /* Thread count */ int64_t insert; /* Insert ratio */ @@ -61,15 +76,24 @@ typedef struct { uint8_t ops[100]; /* Operation schedule */ } WORKLOAD; +/* + * NOTE: If you add any fields to this structure here, you must also add + * an initialization in wtperf.c in the default_cfg. 
+ */ struct __config { /* Configuration struction */ const char *home; /* WiredTiger home */ const char *monitor_dir; /* Monitor output dir */ - char *uri; /* Object URI */ + char *base_uri; /* Object URI */ + char **uris; /* URIs if multiple tables */ + const char *helium_mount; /* Optional Helium mount point */ WT_CONNECTION *conn; /* Database connection */ FILE *logf; /* Logging handle */ + const char *compress_ext; /* Compression extension for conn */ + const char *compress_table; /* Compression arg to table create */ + CONFIG_THREAD *ckptthreads, *popthreads; #define WORKLOAD_MAX 50 @@ -79,6 +103,21 @@ struct __config { /* Configuration struction */ WORKLOAD *workload; /* Workloads */ u_int workload_cnt; + /* State tracking variables. */ + + uint64_t ckpt_ops; /* checkpoint operations */ + uint64_t insert_ops; /* insert operations */ + uint64_t read_ops; /* read operations */ + uint64_t update_ops; /* update operations */ + + uint64_t insert_key; /* insert key */ + + volatile int ckpt; /* checkpoint in progress */ + volatile int error; /* thread error */ + volatile int stop; /* notify threads to stop */ + + volatile uint32_t totalsec; /* total seconds running */ + /* Fields changeable on command line are listed in wtperf_opt.i */ #define OPT_DECLARE_STRUCT #include "wtperf_opt.i" @@ -141,8 +180,8 @@ typedef struct { * Minimum/maximum latency, shared with the monitor thread, that is, the * monitor thread clears it so it's recalculated again for each period. */ - uint32_t min_latency; /* Minimum latency (NS) */ - uint32_t max_latency; /* Maximum latency (NS) */ + uint32_t min_latency; /* Minimum latency (uS) */ + uint32_t max_latency; /* Maximum latency (uS) */ /* * Latency buckets. 
@@ -168,6 +207,7 @@ struct __config_thread { /* Per-thread structure */ }; int config_assign(CONFIG *, const CONFIG *); +int config_compress(CONFIG *); void config_free(CONFIG *); int config_opt_file(CONFIG *, const char *); int config_opt_line(CONFIG *, const char *); diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i index 4e11799781f..9ee7072497c 100644 --- a/bench/wtperf/wtperf_opt.i +++ b/bench/wtperf/wtperf_opt.i @@ -82,9 +82,18 @@ DEF_OPT_AS_UINT32(checkpoint_threads, 0, "number of checkpoint threads") DEF_OPT_AS_CONFIG_STRING(conn_config, "create", "connection configuration string") DEF_OPT_AS_BOOL(compact, 0, "post-populate compact for LSM merging activity") +DEF_OPT_AS_STRING(compression, "none", + "compression extension. Allowed configuration values are: " + "'none' (default), 'bzip', 'snappy', 'zlib'") DEF_OPT_AS_BOOL(create, 1, "do population phase; false to use existing database") -DEF_OPT_AS_UINT32(icount, 5000, "number of records to initially populate") +DEF_OPT_AS_UINT32(database_count, 1, + "number of WiredTiger databases to use. Each database will execute the" + " workload using a separate home directory and complete set of worker" + " threads") +DEF_OPT_AS_UINT32(icount, 5000, + "number of records to initially populate. If multiple tables are " + "configured, each table has this many items inserted.") DEF_OPT_AS_BOOL(insert_rmw, 0, "execute a read prior to each insert in workload phase") DEF_OPT_AS_UINT32(key_sz, 20, "key size") @@ -114,6 +123,9 @@ DEF_OPT_AS_CONFIG_STRING(table_config, "key_format=S,value_format=S,type=lsm,exclusive=true," "leaf_page_max=4kb,internal_page_max=64kb,allocation_size=4kb,", "table configuration string") +DEF_OPT_AS_UINT32(table_count, 1, + "number of tables to run operations over. Keys are divided evenly " + "over the tables. 
Default 1, maximum 99.") DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' " "entry is the total number of threads, and the 'insert', 'read' and " "'update' entries are the ratios of insert, read and update operations " diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4 index 2f2770d84cf..0c7436fad81 100644 --- a/build_posix/aclocal/version-set.m4 +++ b/build_posix/aclocal/version-set.m4 @@ -2,8 +2,8 @@ dnl build by dist/s_version VERSION_MAJOR=2 VERSION_MINOR=1 -VERSION_PATCH=0 -VERSION_STRING='"WiredTiger 2.1.0: (February 4, 2014)"' +VERSION_PATCH=1 +VERSION_STRING='"WiredTiger 2.1.1: (March 4, 2014)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) diff --git a/build_posix/aclocal/version.m4 b/build_posix/aclocal/version.m4 index b47038041e0..80fe1bb193f 100644 --- a/build_posix/aclocal/version.m4 +++ b/build_posix/aclocal/version.m4 @@ -1,2 +1,2 @@ dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version -2.1.0 +2.1.1 diff --git a/dist/api_data.py b/dist/api_data.py index 5509032d148..412f0603efd 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -613,8 +613,13 @@ methods = { Use \c O_DIRECT to access files. Options are given as a list, such as <code>"direct_io=[data]"</code>. Configuring \c direct_io requires care, see @ref - tuning_system_buffer_cache_direct_io for important warnings''', - type='list', choices=['data', 'log']), + tuning_system_buffer_cache_direct_io for important warnings. + Including \c "data" will cause WiredTiger data files to use + \c O_DIRECT, including \c "log" will cause WiredTiger log files + to use \c O_DIRECT, and including \c "checkpoint" will cause + WiredTiger data files opened at a checkpoint (i.e: read only) to + use \c O_DIRECT''', + type='list', choices=['checkpoint', 'data', 'log']), Config('extensions', '', r''' list of shared library extensions to load (using dlopen). 
Any values specified to an library extension are passed to diff --git a/dist/filelist b/dist/filelist index 87b75d0b7cf..58b5aa2ca92 100644 --- a/dist/filelist +++ b/dist/filelist @@ -50,6 +50,7 @@ src/btree/row_key.c src/btree/row_modify.c src/btree/row_srch.c src/config/config.c +src/config/config_api.c src/config/config_check.c src/config/config_collapse.c src/config/config_concat.c diff --git a/dist/flags.py b/dist/flags.py index 7ca1ee3144d..2a6adfccf43 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -15,6 +15,7 @@ flags = { 'SYNC_WRITE_LEAVES', ], 'file_types' : [ + 'FILE_TYPE_CHECKPOINT', 'FILE_TYPE_DATA', 'FILE_TYPE_LOG' ], diff --git a/dist/s_funcs.list b/dist/s_funcs.list index b34564adda7..6ffb956f59e 100644 --- a/dist/s_funcs.list +++ b/dist/s_funcs.list @@ -22,6 +22,7 @@ __wt_log_scan __wt_nlpo2 __wt_nlpo2_round __wt_print_huffman_code +wiredtiger_config_parser_open wiredtiger_pack_int wiredtiger_pack_item wiredtiger_pack_str diff --git a/dist/s_string.ok b/dist/s_string.ok index a71cca62a8c..5f0d331e105 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -688,6 +688,7 @@ openfile os ovfl packv +parserp patchp pathname pathnames diff --git a/dist/s_symbols.list b/dist/s_symbols.list index a66af1f8994..d3803bc3afa 100644 --- a/dist/s_symbols.list +++ b/dist/s_symbols.list @@ -1,4 +1,5 @@ # List of OK external symbols. 
+wiredtiger_config_parser_open wiredtiger_open wiredtiger_pack_close wiredtiger_pack_int diff --git a/dist/stat_data.py b/dist/stat_data.py index 1007cf71a11..72babeb881a 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -153,6 +153,9 @@ connection_stats = [ ########################################## # LSM statistics ########################################## + Stat('lsm_checkpoint_throttle', + 'sleep for LSM checkpoint throttle'), + Stat('lsm_merge_throttle', 'sleep for LSM merge throttle'), Stat('lsm_rows_merged', 'rows merged in an LSM tree'), ########################################## @@ -244,6 +247,8 @@ dsrc_stats = [ 'bloom filter pages evicted from cache'), Stat('bloom_page_read', 'bloom filter pages read into cache'), Stat('bloom_size', 'total size of bloom filters', 'no_scale'), + Stat('lsm_checkpoint_throttle', + 'sleep for LSM checkpoint throttle'), Stat('lsm_chunk_count', 'chunks in the LSM tree', 'no_aggregate,no_scale'), Stat('lsm_generation_max', @@ -252,6 +257,7 @@ dsrc_stats = [ Stat('lsm_lookup_no_bloom', 'queries that could have benefited ' + 'from a Bloom filter that did not exist'), + Stat('lsm_merge_throttle', 'sleep for LSM merge throttle'), ########################################## # Block manager statistics diff --git a/examples/c/Makefile.am b/examples/c/Makefile.am index bd281df7130..5b43dcc2285 100644 --- a/examples/c/Makefile.am +++ b/examples/c/Makefile.am @@ -6,6 +6,7 @@ noinst_PROGRAMS = \ ex_all \ ex_call_center \ ex_config \ + ex_config_parse \ ex_cursor \ ex_data_source \ ex_extending \ diff --git a/examples/c/ex_config_parse.c b/examples/c/ex_config_parse.c new file mode 100644 index 00000000000..c6adc327c78 --- /dev/null +++ b/examples/c/ex_config_parse.c @@ -0,0 +1,166 @@ +/*- + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. 
+ * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ex_config_parse.c + * This is an example demonstrating how to parse WiredTiger compatible + * configuration strings. + */ + +#include <stdio.h> +#include <string.h> + +#include <wiredtiger.h> + +const char *home = NULL; + +int main(void) +{ + int ret; + + /*! [Create a configuration parser] */ + WT_CONFIG_ITEM k, v; + WT_CONFIG_PARSER *parser; + const char *config_string = + "path=/dev/loop,page_size=1024,log=(archive=true,file_max=20MB)"; + + if ((ret = wiredtiger_config_parser_open( + NULL, config_string, strlen(config_string), &parser)) != 0) { + fprintf(stderr, "Error creating configuration parser: %s\n", + wiredtiger_strerror(ret)); + return (ret); + } + if ((ret = parser->close(parser)) != 0) { + fprintf(stderr, "Error closing configuration parser: %s\n", + wiredtiger_strerror(ret)); + return (ret); + } + /*! 
[Create a configuration parser] */ + + if ((ret = wiredtiger_config_parser_open( + NULL, config_string, strlen(config_string), &parser)) != 0) { + fprintf(stderr, "Error creating configuration parser: %s\n", + wiredtiger_strerror(ret)); + return (ret); + } + + { + /*! [get] */ + int64_t my_page_size; + /* + * Retrieve the value of the integer configuration string "page_size". + */ + if ((ret = parser->get(parser, "page_size", &v)) != 0) { + fprintf(stderr, + "page_size configuration: %s", wiredtiger_strerror(ret)); + return (ret); + } + my_page_size = v.val; + /*! [get] */ + + ret = parser->close(parser); + + (void)my_page_size; + } + + { + if ((ret = wiredtiger_config_parser_open( + NULL, config_string, strlen(config_string), &parser)) != 0) { + fprintf(stderr, "Error creating configuration parser: %s\n", + wiredtiger_strerror(ret)); + return (ret); + } + /*! [next] */ + /* + * Retrieve and print the values of the configuration strings. + */ + while ((ret = parser->next(parser, &k, &v)) == 0) { + printf("%.*s:", (int)k.len, k.str); + if (v.type == WT_CONFIG_ITEM_NUM) + printf("%d\n", (int)v.val); + else + printf("%.*s\n", (int)v.len, v.str); + } + /*! [next] */ + ret = parser->close(parser); + } + + if ((ret = wiredtiger_config_parser_open( + NULL, config_string, strlen(config_string), &parser)) != 0) { + fprintf(stderr, "Error creating configuration parser: %s\n", + wiredtiger_strerror(ret)); + return (ret); + } + + /*! [nested get] */ + /* + * Retrieve the value of the nested log file_max configuration string + * using dot shorthand. Utilize the configuration parsing automatic + * conversion of value strings into an integer. + */ + v.type = WT_CONFIG_ITEM_NUM; + if ((ret = parser->get(parser, "log.file_max", &v)) != 0) { + fprintf(stderr, + "log.file_max configuration: %s", wiredtiger_strerror(ret)); + return (ret); + } + printf("log file max: %d\n", (int)v.val); + /*! 
[nested get] */ + ret = parser->close(parser); + + if ((ret = wiredtiger_config_parser_open( + NULL, config_string, strlen(config_string), &parser)) != 0) { + fprintf(stderr, "Error creating configuration parser: %s\n", + wiredtiger_strerror(ret)); + return (ret); + } + /*! [nested traverse] */ + { + WT_CONFIG_PARSER *sub_parser; + while ((ret = parser->next(parser, &k, &v)) == 0) { + if (v.type == WT_CONFIG_ITEM_STRUCT) { + printf("Found nested configuration: %.*s\n", + (int)k.len, k.str); + if ((ret = wiredtiger_config_parser_open( + NULL, v.str, v.len, &sub_parser)) != 0) { + fprintf(stderr, + "Error creating nested configuration " + "parser: %s\n", + wiredtiger_strerror(ret)); + parser->close(parser); + return (ret); + } + while ((ret = sub_parser->next( + sub_parser, &k, &v)) == 0) + printf("\t%.*s\n", (int)k.len, k.str); + sub_parser->close(sub_parser); + } + } + /*! [nested traverse] */ + parser->close(parser); + } + + return (0); +} diff --git a/examples/c/ex_data_source.c b/examples/c/ex_data_source.c index daedce04075..953bb799340 100644 --- a/examples/c/ex_data_source.c +++ b/examples/c/ex_data_source.c @@ -339,30 +339,6 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, } { - /*! [WT_EXTENSION config_strget] */ - WT_CONFIG_ITEM v; - int64_t my_data_source_page_size; - - /* - * Retrieve the value of the integer type configuration string - * "page_size" from a local string (as opposed to the provided - * WT_CONFIG_ARG reference). - */ - const char *config_string = "path=/dev/loop,page_size=1024"; - - if ((ret = wt_api->config_strget( - wt_api, session, config_string, "page_size", &v)) != 0) { - (void)wt_api->err_printf(wt_api, session, - "page_size configuration: %s", wiredtiger_strerror(ret)); - return (ret); - } - my_data_source_page_size = v.val; - /*! [WT_EXTENSION config_strget] */ - - (void)my_data_source_page_size; - } - - { /*! 
[WT_EXTENSION config_get] */ WT_CONFIG_ITEM v; const char *my_data_source_key; @@ -392,31 +368,6 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, } { - /*! [WT_EXTENSION config scan] */ - WT_CONFIG_ITEM k, v; - WT_CONFIG_SCAN *scan; - - /* - * Retrieve the value of the list type configuration string "paths". - */ - if ((ret = wt_api->config_get( - wt_api, session, config, "paths", &v)) != 0) { - (void)wt_api->err_printf(wt_api, session, - "paths configuration: %s", wiredtiger_strerror(ret)); - return (ret); - } - - /* - * Step through the list of entries. - */ - ret = wt_api->config_scan_begin(wt_api, session, v.str, v.len, &scan); - while ((ret = wt_api->config_scan_next(wt_api, scan, &k, &v)) == 0) - printf("%.*s\n", (int)k.len, k.str); - ret = wt_api->config_scan_end(wt_api, scan); - /*! [WT_EXTENSION config scan] */ - } - - { /*! [WT_EXTENSION collator config] */ /* * Configure the appropriate collator. diff --git a/ext/datasources/helium/helium.c b/ext/datasources/helium/helium.c index 1239c88befa..cc420c89999 100644 --- a/ext/datasources/helium/helium.c +++ b/ext/datasources/helium/helium.c @@ -2098,19 +2098,22 @@ helium_session_open_cursor(WT_DATA_SOURCE *wtds, WT_SESSION *session, CURSOR *cursor; DATA_SOURCE *ds; WT_CONFIG_ITEM v; + WT_CONFIG_PARSER *config_parser; WT_CURSOR *wtcursor; WT_EXTENSION_API *wtext; WT_SOURCE *ws; - int locked, ret = 0; + int locked, ret, tret; const char *value; *new_cursor = NULL; + config_parser = NULL; cursor = NULL; ds = (DATA_SOURCE *)wtds; wtext = ds->wtext; ws = NULL; locked = 0; + ret = tret = 0; value = NULL; /* Allocate and initialize a cursor. 
*/ @@ -2164,23 +2167,28 @@ helium_session_open_cursor(WT_DATA_SOURCE *wtds, WT_SESSION *session, if ((ret = master_uri_get(wtds, session, uri, &value)) != 0) goto err; - if ((ret = wtext->config_strget( - wtext, session, value, "key_format", &v)) != 0) + if ((ret = wtext->config_parser_open(wtext, + session, value, strlen(value), &config_parser)) != 0) + EMSG_ERR(wtext, session, ret, + "Configuration string parser: %s", + wtext->strerror(ret)); + if ((ret = config_parser->get( + config_parser, "key_format", &v)) != 0) EMSG_ERR(wtext, session, ret, "key_format configuration: %s", wtext->strerror(ret)); ws->config_recno = v.len == 1 && v.str[0] == 'r'; - if ((ret = wtext->config_strget( - wtext, session, value, "value_format", &v)) != 0) + if ((ret = config_parser->get( + config_parser, "value_format", &v)) != 0) EMSG_ERR(wtext, session, ret, "value_format configuration: %s", wtext->strerror(ret)); ws->config_bitfield = v.len == 2 && isdigit(v.str[0]) && v.str[1] == 't'; - if ((ret = wtext->config_strget( - wtext, session, value, "helium_o_compress", &v)) != 0) + if ((ret = config_parser->get( + config_parser, "helium_o_compress", &v)) != 0) EMSG_ERR(wtext, session, ret, "helium_o_compress configuration: %s", wtext->strerror(ret)); @@ -2219,6 +2227,11 @@ err: if (ws != NULL && locked) ESET(unlock(wtext, session, &ws->lock)); cursor_destroy(cursor); } + if (config_parser != NULL && + (tret = config_parser->close(config_parser)) != 0) + EMSG(wtext, session, tret, + "WT_CONFIG_PARSER.close: %s", wtext->strerror(tret)); + free((void *)value); return (ret); } @@ -2882,19 +2895,19 @@ helium_config_read(WT_EXTENSION_API *wtext, WT_CONFIG_ITEM *config, char **devicep, HE_ENV *envp, int *env_setp, int *flagsp) { WT_CONFIG_ITEM k, v; - WT_CONFIG_SCAN *scan; + WT_CONFIG_PARSER *config_parser; int ret = 0, tret; *env_setp = 0; *flagsp = 0; - /* Set up the scan of the configuration arguments list. 
*/ - if ((ret = wtext->config_scan_begin( - wtext, NULL, config->str, config->len, &scan)) != 0) + /* Traverse the configuration arguments list. */ + if ((ret = wtext->config_parser_open( + wtext, NULL, config->str, config->len, &config_parser)) != 0) ERET(wtext, NULL, ret, - "WT_EXTENSION_API.config_scan_begin: %s", + "WT_EXTENSION_API.config_parser_open: %s", wtext->strerror(ret)); - while ((ret = wtext->config_scan_next(wtext, scan, &k, &v)) == 0) { + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) { if (string_match("helium_devices", k.str, k.len)) { if ((*devicep = calloc(1, v.len + 1)) == NULL) return (os_errno()); @@ -2924,13 +2937,11 @@ helium_config_read(WT_EXTENSION_API *wtext, WT_CONFIG_ITEM *config, ret = 0; if (ret != 0) EMSG_ERR(wtext, NULL, ret, - "WT_EXTENSION_API.config_scan_next: %s", - wtext->strerror(ret)); + "WT_CONFIG_PARSER.next: %s", wtext->strerror(ret)); -err: if ((tret = wtext->config_scan_end(wtext, scan)) != 0) +err: if ((tret = config_parser->close(config_parser)) != 0) EMSG(wtext, NULL, tret, - "WT_EXTENSION_API.config_scan_end: %s", - wtext->strerror(tret)); + "WT_CONFIG_PARSER.close: %s", wtext->strerror(tret)); return (ret); } @@ -3320,11 +3331,12 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) DATA_SOURCE *ds; HELIUM_SOURCE *hs; WT_CONFIG_ITEM k, v; - WT_CONFIG_SCAN *scan; + WT_CONFIG_PARSER *config_parser; WT_EXTENSION_API *wtext; int vmajor, vminor, ret = 0; const char **p; + config_parser = NULL; ds = NULL; wtext = connection->get_extension_api(connection); @@ -3357,12 +3369,12 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) wtext->strerror(ret)); /* Step through the list of Helium sources, opening each one. 
*/ - if ((ret = - wtext->config_scan_begin(wtext, NULL, v.str, v.len, &scan)) != 0) + if ((ret = wtext->config_parser_open( + wtext, NULL, v.str, v.len, &config_parser)) != 0) EMSG_ERR(wtext, NULL, ret, - "WT_EXTENSION_API.config_scan_begin: config: %s", + "WT_EXTENSION_API.config_parser_open: config: %s", wtext->strerror(ret)); - while ((ret = wtext->config_scan_next(wtext, scan, &k, &v)) == 0) { + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) { if (string_match("helium_verbose", k.str, k.len)) { verbose = v.val == 0 ? 0 : 1; continue; @@ -3372,12 +3384,13 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) } if (ret != WT_NOTFOUND) EMSG_ERR(wtext, NULL, ret, - "WT_EXTENSION_API.config_scan_next: config: %s", + "WT_CONFIG_PARSER.next: config: %s", wtext->strerror(ret)); - if ((ret = wtext->config_scan_end(wtext, scan)) != 0) + if ((ret = config_parser->close(config_parser)) != 0) EMSG_ERR(wtext, NULL, ret, - "WT_EXTENSION_API.config_scan_end: config: %s", + "WT_CONFIG_PARSER.close: config: %s", wtext->strerror(ret)); + config_parser = NULL; /* Find and open the database transaction store. 
*/ if ((ret = helium_source_open_txn(ds)) != 0) @@ -3414,6 +3427,8 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) err: if (ds != NULL) ESET(helium_terminate((WT_DATA_SOURCE *)ds, NULL)); + if (config_parser != NULL) + (void)config_parser->close(config_parser); return (ret); } diff --git a/lang/java/java_doc.i b/lang/java/java_doc.i index fcb580ddbd4..83404b45508 100644 --- a/lang/java/java_doc.i +++ b/lang/java/java_doc.i @@ -40,3 +40,6 @@ COPYDOC(__wt_connection, WT_CONNECTION, add_data_source) COPYDOC(__wt_connection, WT_CONNECTION, add_collator) COPYDOC(__wt_connection, WT_CONNECTION, add_compressor) COPYDOC(__wt_connection, WT_CONNECTION, add_extractor) +COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, close) +COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, next) +COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, get) diff --git a/lang/python/wiredtiger.i b/lang/python/wiredtiger.i index 31dc159410b..e5f49fe2c63 100644 --- a/lang/python/wiredtiger.i +++ b/lang/python/wiredtiger.i @@ -582,19 +582,21 @@ typedef int int_void; /* Add event handler support. */ %{ +/* Write to and flush the stream. 
*/ static int writeToPythonStream(const char *streamname, const char *message) { - PyObject *sys, *se, *sys_stderr_write, *written, *arglist; + PyObject *sys, *se, *write_method, *flush_method, *written, + *arglist, *arglist2; char *msg; int ret; size_t msglen; sys = NULL; se = NULL; - sys_stderr_write = NULL; + write_method = flush_method = NULL; written = NULL; - arglist = NULL; + arglist = arglist2 = NULL; msglen = strlen(message); msg = malloc(msglen + 2); strcpy(msg, message); @@ -608,21 +610,30 @@ writeToPythonStream(const char *streamname, const char *message) goto err; if ((se = PyObject_GetAttrString(sys, streamname)) == NULL) goto err; - if ((sys_stderr_write = PyObject_GetAttrString(se, "write")) == NULL) + if ((write_method = PyObject_GetAttrString(se, "write")) == NULL) + goto err; + if ((flush_method = PyObject_GetAttrString(se, "flush")) == NULL) goto err; if ((arglist = Py_BuildValue("(s)", msg)) == NULL) goto err; + if ((arglist2 = Py_BuildValue("()", msg)) == NULL) + goto err; - written = PyObject_CallObject(sys_stderr_write, arglist); + written = PyObject_CallObject(write_method, arglist); + (void)PyObject_CallObject(flush_method, arglist2); ret = 0; err: /* Release python Global Interpreter Lock */ SWIG_PYTHON_THREAD_END_BLOCK; + if (arglist2) + Py_XDECREF(arglist2); if (arglist) Py_XDECREF(arglist); - if (sys_stderr_write) - Py_XDECREF(sys_stderr_write); + if (flush_method) + Py_XDECREF(flush_method); + if (write_method) + Py_XDECREF(write_method); if (se) Py_XDECREF(se); if (sys) diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index 04c24a6a3b6..35201008622 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -405,8 +405,9 @@ __bm_method_set(WT_BM *bm, int readonly) * Open a file. 
*/ int -__wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, - const char *cfg[], int forced_salvage, uint32_t allocsize, WT_BM **bmp) +__wt_block_manager_open(WT_SESSION_IMPL *session, + const char *filename, const char *cfg[], + int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp) { WT_BM *bm; WT_DECL_RET; @@ -416,8 +417,8 @@ __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, WT_RET(__wt_calloc_def(session, 1, &bm)); __bm_method_set(bm, 0); - WT_ERR(__wt_block_open( - session, filename, cfg, forced_salvage, allocsize, &bm->block)); + WT_ERR(__wt_block_open(session, filename, cfg, + forced_salvage, readonly, allocsize, &bm->block)); *bmp = bm; return (0); diff --git a/src/block/block_open.c b/src/block/block_open.c index 1132cb85a6c..23e69027718 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -95,7 +95,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], - int forced_salvage, uint32_t allocsize, WT_BLOCK **blockp) + int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp) { WT_BLOCK *block; WT_CONFIG_ITEM cval; @@ -158,8 +158,9 @@ __wt_block_open(WT_SESSION_IMPL *session, #endif /* Open the underlying file handle. */ - WT_ERR(__wt_open( - session, filename, 0, 0, WT_FILE_TYPE_DATA, &block->fh)); + WT_ERR(__wt_open(session, filename, 0, 0, + readonly ? WT_FILE_TYPE_CHECKPOINT : WT_FILE_TYPE_DATA, + &block->fh)); /* Initialize the live checkpoint's lock. */ WT_ERR(__wt_spin_init(session, &block->live_lock, "block manager")); diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c index 46f74a2b997..3adaa459826 100644 --- a/src/bloom/bloom.c +++ b/src/bloom/bloom.c @@ -127,7 +127,7 @@ __bloom_open_cursor(WT_BLOOM *bloom, WT_CURSOR *owner) WT_RET(__wt_open_cursor(session, bloom->uri, owner, cfg, &c)); /* XXX Layering violation: bump the cache priority for Bloom filters. 
*/ - ((WT_CURSOR_BTREE *)c)->btree->evict_priority = (1 << 19); + ((WT_CURSOR_BTREE *)c)->btree->evict_priority = WT_EVICT_INT_SKEW; bloom->c = c; return (0); diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 568e07f7fd1..c0c7a245a9f 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -317,6 +317,10 @@ __free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) do { next = upd->next; + /* Everything we free should be visible to everyone. */ + WT_ASSERT(session, + upd->txnid == WT_TXN_ABORTED || + __wt_txn_visible_all(session, upd->txnid)); __wt_free(session, upd); } while ((upd = next) != NULL); } diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index d57162c06a9..0b6767df613 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -229,7 +229,8 @@ __evict_worker(WT_SESSION_IMPL *session) /* Check to see if the eviction server should run. */ if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) - flags = WT_EVICT_PASS_ALL; + flags = (loop > 10) ? + WT_EVICT_PASS_AGGRESSIVE : WT_EVICT_PASS_ALL; else if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) /* Ignore clean pages unless the cache is too large */ @@ -244,7 +245,7 @@ __evict_worker(WT_SESSION_IMPL *session) F_SET(cache, WT_EVICT_ACTIVE); WT_VERBOSE_RET(session, evictserver, "Eviction pass with: Max: %" PRIu64 - " In use: %" PRIu64 " Dirty: %" PRIu64 " Internal: %s", + " In use: %" PRIu64 " Dirty: %" PRIu64 " Merge: %s", bytes_max, bytes_inuse, dirty_inuse, LF_ISSET(WT_EVICT_PASS_INTERNAL) ? "yes" : "no"); @@ -436,6 +437,9 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) */ __wt_evict_file_exclusive_on(session); + /* Make sure the oldest transaction ID is up-to-date. */ + __wt_txn_update_oldest(session); + /* * We can't evict the page just returned to us, it marks our place in * the tree. So, always walk one page ahead of the page being evicted. 
@@ -721,12 +725,23 @@ __evict_walk(WT_SESSION_IMPL *session, u_int *entriesp, uint32_t flags) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - u_int slot, max_entries, retries; + u_int max_entries, old_slot, retries, slot; conn = S2C(session); cache = S2C(session)->cache; retries = 0; + /* Increment the shared read generation. */ + __wt_cache_read_gen_incr(session); + + /* + * Update the oldest ID: we use it to decide whether pages are + * candidates for eviction. Without this, if all threads are blocked + * after a long-running transaction (such as a checkpoint) completes, + * we may never start evicting again. + */ + __wt_txn_update_oldest(session); + /* * Set the starting slot in the queue and the maximum pages added * per walk. @@ -751,9 +766,7 @@ retry: SLIST_FOREACH(dhandle, &conn->dhlh, l) { /* * Each time we reenter this function, start at the next handle - * on the list. We use the address of the handle's name as the - * handle's unique identifier, that should be unique, and is - * unlikely to cause a false positive if freed and reallocated. + * on the list. */ if (cache->evict_file_next != NULL && cache->evict_file_next != dhandle) @@ -769,9 +782,29 @@ retry: SLIST_FOREACH(dhandle, &conn->dhlh, l) { */ btree = dhandle->handle; if (btree->root_page == NULL || - F_ISSET(btree, WT_BTREE_NO_EVICTION) || btree->bulk_load_ok) + F_ISSET(btree, WT_BTREE_NO_EVICTION) || + btree->bulk_load_ok) continue; + /* + * Also skip files that are configured to stick in cache until + * we get aggressive. + */ + if (btree->evict_priority != 0 && + !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + continue; + + /* + * If we are filling the queue, skip files that haven't been + * useful in the past. 
+ */ + if (btree->evict_walk_period != 0 && + cache->evict_entries >= WT_EVICT_WALK_BASE && + btree->evict_walk_skips++ < btree->evict_walk_period) + continue; + btree->evict_walk_skips = 0; + old_slot = slot; + __wt_spin_lock(session, &cache->evict_walk_lock); /* @@ -784,6 +817,17 @@ retry: SLIST_FOREACH(dhandle, &conn->dhlh, l) { __wt_spin_unlock(session, &cache->evict_walk_lock); + /* + * If we didn't find enough candidates in the file, skip it + * next time. + */ + if (slot >= old_slot + WT_EVICT_WALK_PER_FILE || + slot >= max_entries) + btree->evict_walk_period = 0; + else + btree->evict_walk_period = WT_MIN( + WT_MAX(1, 2 * btree->evict_walk_period), 1000); + if (ret != 0 || slot >= max_entries) break; } @@ -843,15 +887,15 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_PAGE *page; - int modified, restarts, levels; + int internal_pages, levels, modified, restarts; uint32_t walk_flags; + uint64_t pages_walked; btree = S2BT(session); cache = S2C(session)->cache; start = cache->evict + *slotp; - end = start + WT_EVICT_WALK_PER_FILE; - if (end > cache->evict + cache->evict_slots) - end = cache->evict + cache->evict_slots; + end = WT_MIN(start + WT_EVICT_WALK_PER_FILE, + cache->evict + cache->evict_slots); WT_ASSERT(session, btree->evict_page == NULL || WT_PAGE_IS_ROOT(btree->evict_page) || @@ -860,20 +904,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) walk_flags = WT_TREE_EVICT; if (LF_ISSET(WT_EVICT_PASS_INTERNAL)) walk_flags |= WT_TREE_SKIP_LEAF; + /* * Get some more eviction candidate pages. 
*/ - for (evict = start, restarts = 0; + for (evict = start, pages_walked = 0, internal_pages = restarts = 0; evict < end && (ret == 0 || ret == WT_NOTFOUND); - ret = __wt_tree_walk(session, &btree->evict_page, walk_flags)) { + ret = __wt_tree_walk(session, &btree->evict_page, walk_flags), + ++pages_walked) { if ((page = btree->evict_page) == NULL) { ret = 0; /* * Take care with terminating this loop. * * Don't make an extra call to __wt_tree_walk: that - * will leave a page in the WT_REF_EVICT_WALK state, - * unable to be evicted, which may prevent any work + * will leave a page pinned, which may prevent any work * from being done. */ if (++restarts == 2) @@ -881,8 +926,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) continue; } - WT_STAT_FAST_CONN_INCR(session, cache_eviction_walk); - /* Ignore root pages entirely. */ if (WT_PAGE_IS_ROOT(page)) continue; @@ -940,6 +983,13 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) } else if (LF_ISSET(WT_EVICT_PASS_INTERNAL)) continue; + /* Limit internal pages to 50% unless we get aggressive. */ + if ((page->type == WT_PAGE_COL_INT || + page->type == WT_PAGE_ROW_INT) && + ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 && + !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + break; + /* * If this page has never been considered for eviction, * set its read generation to a little bit in the @@ -975,6 +1025,15 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) continue; /* + * If the page is clean but has modifications that appear too + * new to evict, skip it. + */ + if (!modified && page->modify != NULL && + !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) && + !__wt_txn_visible_all(session, page->modify->rec_max_txn)) + continue; + + /* * If the oldest transaction hasn't changed since the * last time this page was written, it's unlikely that * we can make progress. 
Similarly, if the most recent @@ -988,7 +1047,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) * since rolled back, or we can help get the checkpoint * completed sooner. */ - if (modified && !F_ISSET(cache, WT_EVICT_STUCK) && + if (modified && !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) && (page->modify->disk_snap_min == S2C(session)->txn_global.oldest_id || !__wt_txn_visible_all(session, @@ -1004,6 +1063,7 @@ add: WT_ASSERT(session, evict->page == NULL); } *slotp += (u_int)(evict - start); + WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked); return (ret); } @@ -1160,7 +1220,8 @@ __wt_cache_dump(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_PAGE *page; - uint64_t file_bytes, file_dirty, file_pages, total_bytes; + uint64_t file_intl_pages, file_leaf_pages; + uint64_t file_bytes, file_dirty, total_bytes; conn = S2C(session); total_bytes = 0; @@ -1176,22 +1237,28 @@ __wt_cache_dump(WT_SESSION_IMPL *session) btree->bulk_load_ok) continue; - file_bytes = file_dirty = file_pages = 0; + file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0; page = NULL; session->dhandle = dhandle; while (__wt_tree_walk(session, &page, WT_TREE_CACHE) == 0 && page != NULL) { - ++file_pages; + if (page->type == WT_PAGE_COL_INT || + page->type == WT_PAGE_ROW_INT) + ++file_intl_pages; + else + ++file_leaf_pages; file_bytes += page->memory_footprint; if (__wt_page_is_modified(page)) file_dirty += page->memory_footprint; } session->dhandle = NULL; - printf("cache dump: %s [%s]: %" - PRIu64 " pages, %" PRIu64 "MB, %" PRIu64 "MB dirty\n", + printf("cache dump: %s [%s]:" + " %" PRIu64 " intl pages, %" PRIu64 " leaf pages," + " %" PRIu64 "MB, %" PRIu64 "MB dirty\n", dhandle->name, dhandle->checkpoint, - file_pages, file_bytes >> 20, file_dirty >> 20); + file_intl_pages, file_leaf_pages, + file_bytes >> 20, file_dirty >> 20); total_bytes += file_bytes; } diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 
a970b656cf5..f6cc4cc6fb3 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -11,7 +11,7 @@ static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt); static int __btree_get_last_recno(WT_SESSION_IMPL *); static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_preload(WT_SESSION_IMPL *); -static int __btree_tree_open_empty(WT_SESSION_IMPL *, int); +static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int); static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t); static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int); @@ -69,8 +69,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) if (!WT_PREFIX_SKIP(filename, "file:")) WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI"); - WT_ERR(__wt_block_manager_open(session, filename, - dhandle->cfg, forced_salvage, btree->allocsize, &btree->bm)); + WT_ERR(__wt_block_manager_open(session, filename, dhandle->cfg, + forced_salvage, readonly, btree->allocsize, &btree->bm)); bm = btree->bm; /* @@ -102,7 +102,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) ckpt.raw.data, ckpt.raw.size, root_addr, &root_addr_size, readonly)); if (creation || root_addr_size == 0) - WT_ERR(__btree_tree_open_empty(session, creation)); + WT_ERR(__btree_tree_open_empty( + session, creation, readonly)); else { WT_ERR(__wt_btree_tree_open( session, root_addr, root_addr_size)); @@ -355,7 +356,7 @@ err: __wt_buf_free(session, &dsk); * Create an empty in-memory tree. */ static int -__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) +__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly) { WT_BTREE *btree; WT_DECL_RET; @@ -423,23 +424,31 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) * the root page dirty to force a write, and without reconciling the * leaf page we won't realize there's no records to write, we'll write * a root page, which isn't correct for an empty tree. 
- * Earlier versions of this code kept the leaf page clean, but with - * the "empty" flag set in the leaf page's modification structure; in - * that case, checkpoints works (forced reconciliation of a root with - * a single "empty" page wouldn't write any blocks). That version had + * + * Earlier versions of this code kept the leaf page clean, but with the + * "empty" flag set in the leaf page's modification structure; in that + * case, checkpoints works (forced reconciliation of a root with a + * single "empty" page wouldn't write any blocks). That version had * memory leaks because the eviction code didn't correctly handle pages * that were "clean" (and so never reconciled), yet "modified" with an * "empty" flag. The goal of this code is to mimic a real tree that * simply has no records, for whatever reason, and trust reconciliation * to figure out it's empty and not write any blocks. - * We do not set the tree's modified flag because the checkpoint code - * skips unmodified files in closing checkpoints (checkpoints that don't - * require a write unless the file is actually dirty). There's no need - * to reconcile this file unless the application does a real checkpoint - * or it's actually modified. + * + * We do not set the tree's modified flag because the checkpoint code + * skips unmodified files in closing checkpoints (checkpoints that + * don't require a write unless the file is actually dirty). There's + * no need to reconcile this file unless the application does a real + * checkpoint or it's actually modified. + * + * Only do this for a live tree, not for checkpoints. If we open an + * empty checkpoint, the leaf page cannot be dirty or eviction may try + * to write it, which will fail because checkpoints are read-only. 
*/ - WT_ERR(__wt_page_modify_init(session, leaf)); - __wt_page_only_modify_set(session, leaf); + if (!readonly) { + WT_ERR(__wt_page_modify_init(session, leaf)); + __wt_page_only_modify_set(session, leaf); + } btree->root_page = root; diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index cfc7137b13a..21298ac4722 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -93,21 +93,17 @@ __wt_page_in_func( } /* - * If this page has ever been considered for eviction, - * and its generation is aging, update it. - */ - if (page->read_gen != WT_READ_GEN_NOTSET && - page->read_gen < __wt_cache_read_gen(session)) - page->read_gen = - __wt_cache_read_gen_set(session); - - /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. + * + * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READ_GEN_NOTSET) page->read_gen = WT_READ_GEN_OLDEST; + else if (page->read_gen < __wt_cache_read_gen(session)) + page->read_gen = + __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c index 0713989af58..ce6c04d1283 100644 --- a/src/btree/rec_evict.c +++ b/src/btree/rec_evict.c @@ -498,7 +498,7 @@ ckpt: WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint); * cache. 
*/ if (!exclusive && mod != NULL && - !__wt_txn_visible_all(session, mod->disk_txn)) + !__wt_txn_visible_all(session, mod->rec_max_txn)) return (EBUSY); /* diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index 81a4ec7a025..dd237693465 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -668,26 +668,12 @@ static inline int __rec_txn_read( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd, WT_UPDATE **updp) { - uint64_t txnid; int skip, retried; retried = 0; -retry: *updp = __wt_txn_read_skip(session, upd, &skip); - if (!skip) { - /* - * Track the largest transaction ID written to disk for this - * page. We store this in the page at the end of - * reconciliation if no updates are skipped. It is used to - * avoid evicting a clean page from memory with changes that - * are required to satisfy a snapshot read. - */ - if (*updp != NULL) { - txnid = (*updp)->txnid; - if (TXNID_LT(r->max_txn, txnid)) - r->max_txn = txnid; - } +retry: *updp = __wt_txn_read_skip(session, upd, &r->max_txn, &skip); + if (!skip) return (0); - } /* * If skipping this update will cause reconciliation to quit, update @@ -4093,7 +4079,7 @@ err: __wt_scr_free(&tkey); * cache's dirty statistics. 
*/ if (!r->upd_skipped) { - mod->disk_txn = r->max_txn; + mod->rec_max_txn = r->max_txn; if (WT_ATOMIC_CAS(mod->write_gen, r->orig_write_gen, 0)) __wt_cache_dirty_decr(session, page); diff --git a/src/config/config.c b/src/config/config.c index c268d4e053a..c0eb672015f 100644 --- a/src/config/config.c +++ b/src/config/config.c @@ -118,7 +118,7 @@ static const int8_t gostruct[256] = { A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_BAD, A_QUP, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UP, A_DOWN, A_BAD, A_BAD, - A_NEXT, A_NUMBARE, A_BAD, A_BARE, A_NUMBARE, A_NUMBARE, + A_NEXT, A_NUMBARE, A_BARE, A_BARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_VALUE, A_BAD, A_BAD, A_VALUE, A_BAD, A_BAD, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE, @@ -738,10 +738,8 @@ __wt_config_subgetraw(WT_SESSION_IMPL *session, __wt_config_subgets(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value) { - WT_CONFIG_ITEM key_item; - - key_item.str = key; - key_item.len = strlen(key); + WT_CONFIG_ITEM key_item = + { key, strlen(key), 0, WT_CONFIG_ITEM_STRING }; return (__wt_config_subgetraw(session, cfg, &key_item, value)); } diff --git a/src/config/config_api.c b/src/config/config_api.c new file mode 100644 index 00000000000..42f4c117b81 --- /dev/null +++ b/src/config/config_api.c @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __config_parser_close -- + * WT_CONFIG_PARSER->close method. 
+ */ +static int +__config_parser_close(WT_CONFIG_PARSER *wt_config_parser) +{ + WT_CONFIG_PARSER_IMPL *config_parser; + + config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser; + + if (config_parser == NULL) + return (EINVAL); + + __wt_free(config_parser->session, config_parser); + return (0); +} + +/* + * __config_parser_get -- + * WT_CONFIG_PARSER->search method. + */ +static int +__config_parser_get(WT_CONFIG_PARSER *wt_config_parser, + const char *key, WT_CONFIG_ITEM *cval) +{ + WT_CONFIG_PARSER_IMPL *config_parser; + + config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser; + + if (config_parser == NULL) + return (EINVAL); + + return (__wt_config_subgets(config_parser->session, + &config_parser->config_item, key, cval)); +} + +/* + * __config_parser_next -- + * WT_CONFIG_PARSER->next method. + */ +static int +__config_parser_next(WT_CONFIG_PARSER *wt_config_parser, + WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *cval) +{ + WT_CONFIG_PARSER_IMPL *config_parser; + + config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser; + + if (config_parser == NULL) + return (EINVAL); + + return (__wt_config_next(&config_parser->config, key, cval)); +} + +/* + * wiredtiger_config_parser_open -- + * Create a configuration parser. + */ +int +wiredtiger_config_parser_open(WT_SESSION *wt_session, + const char *config, size_t len, WT_CONFIG_PARSER **config_parserp) +{ + static const WT_CONFIG_PARSER stds = { + __config_parser_close, + __config_parser_next, + __config_parser_get + }; + WT_CONFIG_ITEM config_item = + { config, len, 0, WT_CONFIG_ITEM_STRING }; + WT_CONFIG_PARSER_IMPL *config_parser; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + *config_parserp = NULL; + session = (WT_SESSION_IMPL *)wt_session; + + WT_RET(__wt_calloc_def(session, 1, &config_parser)); + config_parser->iface = stds; + config_parser->session = session; + + /* + * Setup a WT_CONFIG_ITEM to be used for get calls and a WT_CONFIG + * structure for iterations through the configuration string. 
+ */ + memcpy(&config_parser->config_item, &config_item, sizeof(config_item)); + WT_ERR(__wt_config_initn( + session, &config_parser->config, config, len)); + + if (ret == 0) + *config_parserp = (WT_CONFIG_PARSER *)config_parser; + else +err: __wt_free(session, config_parser); + + return (ret); +} diff --git a/src/config/config_def.c b/src/config/config_def.c index abe6713696c..2c01cac1a85 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -250,7 +250,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { , { "checkpoint_sync", "boolean", NULL, NULL}, { "create", "boolean", NULL, NULL}, - { "direct_io", "list", "choices=[\"data\",\"log\"]", NULL}, + { "direct_io", "list", + "choices=[\"checkpoint\",\"data\",\"log\"]", + NULL}, { "error_prefix", "string", NULL, NULL}, { "eviction_dirty_target", "int", "min=10,max=99", NULL}, { "eviction_target", "int", "min=10,max=99", NULL}, diff --git a/src/config/config_ext.c b/src/config/config_ext.c index 7dd5445cc3c..26b3799d61c 100644 --- a/src/config/config_ext.c +++ b/src/config/config_ext.c @@ -8,6 +8,19 @@ #include "wt_internal.h" /* + * __wt_ext_config_parser_open -- + * WT_EXTENSION_API->config_parser_open implementation + */ +int +__wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session, + const char *config, size_t len, WT_CONFIG_PARSER **config_parserp) +{ + WT_UNUSED(wt_ext); + return (wiredtiger_config_parser_open( + wt_session, config, len, config_parserp)); +} + +/* * __wt_ext_config_get -- * Given a NULL-terminated list of configuration strings, find the final * value for a given string key (external API version). @@ -29,81 +42,3 @@ __wt_ext_config_get(WT_EXTENSION_API *wt_api, return (WT_NOTFOUND); return (__wt_config_gets(session, cfg, key, cval)); } - -/* - * __wt_ext_config_strget -- - * Given a single configuration string, find the final value for a given - * string key (external API version). 
- */ -int -__wt_ext_config_strget(WT_EXTENSION_API *wt_api, - WT_SESSION *wt_session, const char *config, const char *key, - WT_CONFIG_ITEM *cval) -{ - const char *cfg_arg[] = { config, NULL }; - - return (__wt_ext_config_get( - wt_api, wt_session, (WT_CONFIG_ARG *)cfg_arg, key, cval)); -} - -/* - * __wt_ext_config_scan_begin -- - * Start a scan of a config string. - * (external API only). - */ -int -__wt_ext_config_scan_begin( - WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, - const char *str, size_t len, WT_CONFIG_SCAN **scanp) -{ - WT_CONFIG config, *scan; - WT_CONNECTION_IMPL *conn; - WT_SESSION_IMPL *session; - - conn = (WT_CONNECTION_IMPL *)wt_api->conn; - if ((session = (WT_SESSION_IMPL *)wt_session) == NULL) - session = conn->default_session; - - /* Note: allocate memory last to avoid cleanup. */ - WT_CLEAR(config); - WT_RET(__wt_config_initn(session, &config, str, len)); - WT_RET(__wt_calloc_def(session, 1, &scan)); - *scan = config; - *scanp = (WT_CONFIG_SCAN *)scan; - return (0); -} - -/* - * __wt_ext_config_scan_end -- - * End a scan of a config string. - * (external API only). - */ -int -__wt_ext_config_scan_end(WT_EXTENSION_API *wt_api, WT_CONFIG_SCAN *scan) -{ - WT_CONFIG *conf; - - WT_UNUSED(wt_api); - - conf = (WT_CONFIG *)scan; - __wt_free(conf->session, scan); - return (0); -} - -/* - * __wt_ext_config_scan_next -- - * Get the next key/value pair from a config scan. - * (external API only). 
- */ -int -__wt_ext_config_scan_next( - WT_EXTENSION_API *wt_api, WT_CONFIG_SCAN *scan, - WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) -{ - WT_CONFIG *conf; - - WT_UNUSED(wt_api); - - conf = (WT_CONFIG *)scan; - return (__wt_config_next(conf, key, value)); -} diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 780ae6f6be5..b1992793827 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -105,11 +105,8 @@ __conn_get_extension_api(WT_CONNECTION *wt_conn) conn->extension_api.scr_free = __wt_ext_scr_free; conn->extension_api.collator_config = ext_collator_config; conn->extension_api.collate = ext_collate; + conn->extension_api.config_parser_open = __wt_ext_config_parser_open; conn->extension_api.config_get = __wt_ext_config_get; - conn->extension_api.config_strget = __wt_ext_config_strget; - conn->extension_api.config_scan_begin = __wt_ext_config_scan_begin; - conn->extension_api.config_scan_end = __wt_ext_config_scan_end; - conn->extension_api.config_scan_next = __wt_ext_config_scan_next; conn->extension_api.metadata_insert = __wt_ext_metadata_insert; conn->extension_api.metadata_remove = __wt_ext_metadata_remove; conn->extension_api.metadata_search = __wt_ext_metadata_search; @@ -123,6 +120,7 @@ __conn_get_extension_api(WT_CONNECTION *wt_conn) conn->extension_api.transaction_notify = __wt_ext_transaction_notify; conn->extension_api.transaction_oldest = __wt_ext_transaction_oldest; conn->extension_api.transaction_visible = __wt_ext_transaction_visible; + conn->extension_api.version = wiredtiger_version; return (&conn->extension_api); } @@ -498,6 +496,20 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config) CONNECTION_API_CALL(conn, session, close, config, cfg); WT_UNUSED(cfg); + /* + * Rollback all running transactions. + * We do this as a separate pass because an active transaction in one + * session could cause trouble when closing a file, even if that + * session never referenced that file. 
+ */ + for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i) + if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) && + F_ISSET(&s->txn, TXN_RUNNING)) { + wt_session = &s->iface; + WT_TRET(wt_session->rollback_transaction( + wt_session, NULL)); + } + /* Close open, external sessions. */ for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i) if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL)) { @@ -1012,6 +1024,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *name; uint32_t flag; } *ft, file_types[] = { + { "checkpoint", WT_FILE_TYPE_CHECKPOINT }, { "data", WT_FILE_TYPE_DATA }, { "log", WT_FILE_TYPE_LOG }, { NULL, 0 } diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 55faeefbde0..34b6a51570b 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -464,8 +464,10 @@ __cache_pool_adjust(uint64_t highest, uint64_t bump_threshold) * it. */ grew = 0; - adjusted = (cp->chunk > entry->cache_size - reserved) ? - cp->chunk : (entry->cache_size - reserved); + if (entry->cache_size - cp->chunk > reserved) + adjusted = cp->chunk; + else + adjusted = entry->cache_size - reserved; } else if (highest > 1 && entry->cache_size < cp->size && cache->bytes_inmem >= diff --git a/src/docs/Doxyfile b/src/docs/Doxyfile index 5f7e5016892..d8f753b269b 100644 --- a/src/docs/Doxyfile +++ b/src/docs/Doxyfile @@ -1576,6 +1576,7 @@ PREDEFINED = DOXYGEN \ __wt_compressor:=WT_COMPRESSOR \ __wt_config_arg:=WT_CONFIG_ARG \ __wt_config_item:=WT_CONFIG_ITEM \ + __wt_config_parser:=WT_CONFIG_PARSER \ __wt_config_scan:=WT_CONFIG_SCAN \ __wt_connection:=WT_CONNECTION \ __wt_cursor:=WT_CURSOR \ diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index 52be68a1074..04daa4050cd 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -291,9 +291,6 @@ engine, or, if specified, for the URI on the command-line. 
<code>wt [-Vv] [-C config] [-h directory] stat [-a] [uri]</code> @subsection util_stat_options Options -The \c stat command has no command-specific options. - -@subsection util_stat_options Options The following are command-specific options for the \c stat command: @par <code>-a</code> diff --git a/src/docs/config-strings.dox b/src/docs/config-strings.dox index 542e86f620d..295316fe039 100644 --- a/src/docs/config-strings.dox +++ b/src/docs/config-strings.dox @@ -16,7 +16,7 @@ specified directly. More precisely, keys or values that match this regular expression do not require quotes: <pre> - [-_0-9A-Za-z/][^\\t\\r\\n :=,\\])}]* + [-_0-9A-Za-z./][^\\t\\r\\n :=,\\])}]* </pre> More complex keys and values can be specified by quoting them with double diff --git a/src/docs/custom_data.dox b/src/docs/custom_data.dox index b1ee8dfbd9e..22dd75dcc26 100644 --- a/src/docs/custom_data.dox +++ b/src/docs/custom_data.dox @@ -165,11 +165,6 @@ of a configuration string as follows: @snippet ex_data_source.c WT_EXTENSION config_get -The WT_DATA_SOURCE::open_cursor method might retrieve the list value -of a configuration string as follows: - -@snippet ex_data_source.c WT_EXTENSION config scan - @subsection custom_ds_config_add Creating data-specific configuration strings Applications can add their own configuration strings to WiredTiger diff --git a/src/docs/helium.dox b/src/docs/helium.dox index cd6b47fb968..35e3886a8d6 100644 --- a/src/docs/helium.dox +++ b/src/docs/helium.dox @@ -103,7 +103,7 @@ WT_SESSION *session; /* Create and truncate the access table. 
*/ ret = session->create(session, "table:dev1/access", - "key_format=S,value_format=S,type=helium,helium_open_o_truncate=1"); + "key_format=S,value_format=S,type=helium,helium_o_truncate=1"); @endcode @section helium_notes Helium notes diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 6d24c474e19..58701aaa5bf 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -313,6 +313,7 @@ statlog str strerror strftime +strget struct structs subdatabases diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox index 5481d2deae5..f36f1887a73 100644 --- a/src/docs/top/main.dox +++ b/src/docs/top/main.dox @@ -6,9 +6,9 @@ WiredTiger is an high performance, scalable, production quality, NoSQL, @section releases Releases <table> -@row{<b>WiredTiger 2.1.0</b> (current), - <a href="releases/wiredtiger-2.1.0.tar.bz2"><b>[Release package]</b></a>, - <a href="2.1.0/index.html"><b>[Documentation]</b></a>} +@row{<b>WiredTiger 2.1.1</b> (current), + <a href="releases/wiredtiger-2.1.1.tar.bz2"><b>[Release package]</b></a>, + <a href="2.1.1/index.html"><b>[Documentation]</b></a>} @row{<b>WiredTiger 1.6.6</b> (previous), <a href="releases/wiredtiger-1.6.6.tar.bz2"><b>[Release package]</b></a>, <a href="1.6.6/index.html"><b>[Documentation]</b></a>} diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index e59b031a1ff..3cb0b96b0ef 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,5 +1,20 @@ /*! @page upgrading Upgrading WiredTiger applications +@section version_211 Upgrading to Version 2.1.1 +<dl> + +<dt>WT_EXTENSION_API::config methods</dt> +<dd> +In the 2.1.1 release of WiredTiger the configuration string parsing API +has been changed and added to a new public handle. The +WT_EXTENSION_API::config_strget, WT_EXTENSION_API::config_scan_begin, +WT_EXTENSION_API::config_scan_next and WT_EXTENSION_API::config_scan_end +have been removed. 
They have been replaced by a +WT_EXTENSION_API::config_parser_open method, which can be used to parse +configuration strings. See the WT_CONFIG_PARSER documentation for +examples on how to use the updated API. +</dd> + @section version_21 Upgrading to Version 2.1 <dl> diff --git a/src/include/btmem.h b/src/include/btmem.h index 7f0bf280d5c..42d7ecfa9e2 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -197,8 +197,8 @@ struct __wt_page_modify { */ uint64_t disk_snap_min; - /* The largest transaction ID written to disk for the page. */ - uint64_t disk_txn; + /* The largest transaction ID seen on the page by reconciliation. */ + uint64_t rec_max_txn; /* The largest update transaction ID (approximate). */ uint64_t update_txn; @@ -345,22 +345,17 @@ struct __wt_page { * The read generation is a 64-bit value, if incremented frequently, a * 32-bit value could overflow. * - * The read generation is a piece of shared memory potentially accessed + * The read generation is a piece of shared memory potentially read * by many threads. We don't want to update page read generations for * in-cache workloads and suffer the cache misses, so we don't simply * increment the read generation value on every access. Instead, the - * read generation is initialized to 0, then set to a real value if the - * page is ever considered for eviction. Once set to a real value, the - * read generation is potentially incremented every time the page is - * accessed. To try and avoid incrementing the page at a fast rate in - * this case, the read generation is incremented to a future point. - * - * The read generation is not declared volatile or published: the read - * generation is set a lot, and we don't want to write it that much. + * read generation is incremented by the eviction server each time it + * becomes active. To avoid incrementing a page's read generation too + * frequently, it is set to a future point. 
*/ #define WT_READ_GEN_NOTSET 0 #define WT_READ_GEN_OLDEST 1 -#define WT_READ_GEN_STEP 1000 +#define WT_READ_GEN_STEP 100 uint64_t read_gen; uint64_t memory_footprint; /* Memory attached to the page */ diff --git a/src/include/btree.h b/src/include/btree.h index 84da462417a..b40fc8a7f80 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -112,7 +112,9 @@ struct __wt_btree { uint64_t write_gen; /* Write generation */ WT_PAGE *evict_page; /* Eviction thread's location */ - uint64_t evict_priority; /* Relative priority of cached pages. */ + uint64_t evict_priority; /* Relative priority of cached pages */ + u_int evict_walk_period; /* Skip this many LRU walks */ + u_int evict_walk_skips; /* Number of walks skipped */ volatile uint32_t lru_count; /* Count of threads in LRU eviction */ volatile int checkpointing; /* Checkpoint in progress */ diff --git a/src/include/btree.i b/src/include/btree.i index f09d05178ab..94f8133d187 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -159,6 +159,12 @@ __wt_cache_read_gen(WT_SESSION_IMPL *session) return (S2C(session)->cache->read_gen); } +static inline void +__wt_cache_read_gen_incr(WT_SESSION_IMPL *session) +{ + ++S2C(session)->cache->read_gen; +} + static inline uint64_t __wt_cache_read_gen_set(WT_SESSION_IMPL *session) { @@ -171,7 +177,7 @@ __wt_cache_read_gen_set(WT_SESSION_IMPL *session) * page. In other words, the goal is to avoid some number of updates * immediately after each update we have to make. */ - return (++S2C(session)->cache->read_gen + WT_READ_GEN_STEP); + return (__wt_cache_read_gen(session) + WT_READ_GEN_STEP); } /* @@ -563,6 +569,30 @@ __wt_eviction_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_eviction_force_txn_check -- + * Check if the current transaction permits forced eviction of a page. 
+ */ +static inline int +__wt_eviction_force_txn_check(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_TXN_STATE *txn_state; + + /* + * Only try if there is a chance of success. If the page has already + * been split and this transaction is already pinning the oldest ID so + * that the page can't be evicted, it has to complete before eviction + * can succeed. + */ + txn_state = &S2C(session)->txn_global.states[session->id]; + if (!F_ISSET_ATOMIC(page, WT_PAGE_WAS_SPLIT) || + txn_state->snap_min == WT_TXN_NONE || + TXNID_LT(page->modify->update_txn, txn_state->snap_min)) + return (1); + + return (0); +} + +/* * __wt_page_release -- * Release a reference to a page. */ @@ -582,10 +612,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page) * Try to immediately evict pages if they have the special "oldest" * read generation and we have some chance of succeeding. */ - if (!WT_TXN_ACTIVE(&session->txn) && - (page->modify == NULL || - !F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)) && - page->read_gen == WT_READ_GEN_OLDEST && + if (page->read_gen == WT_READ_GEN_OLDEST && + __wt_eviction_force_txn_check(session, page) && WT_ATOMIC_CAS(page->ref->state, WT_REF_MEM, WT_REF_LOCKED)) { if ((ret = __wt_hazard_clear(session, page)) != 0) { page->ref->state = WT_REF_MEM; @@ -678,30 +706,6 @@ __wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page) /* * __wt_eviction_force -- - * Check if the current transaction permits forced eviction of a page. - */ -static inline int -__wt_eviction_force_txn_check(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - WT_TXN_STATE *txn_state; - - /* - * Only try if there is a chance of success. If the page has already - * been split and this transaction is already pinning the oldest ID so - * that the page can't be evicted, it has to complete before eviction - * can succeed. 
- */ - txn_state = &S2C(session)->txn_global.states[session->id]; - if (!F_ISSET_ATOMIC(page, WT_PAGE_WAS_SPLIT) || - txn_state->snap_min == WT_TXN_NONE || - TXNID_LT(page->modify->update_txn, txn_state->snap_min)) - return (1); - - return (0); -} - -/* - * __wt_eviction_force -- * Forcefully evict a page, if possible. */ static inline int diff --git a/src/include/cache.h b/src/include/cache.h index 055041b7e6c..717191d19dd 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -9,16 +9,17 @@ * Tuning constants: I hesitate to call this tuning, but we want to review some * number of pages from each file's in-memory tree for each page we evict. */ -#define WT_EVICT_INT_SKEW (1<<12) /* Prefer leaf pages over internal +#define WT_EVICT_INT_SKEW (1<<20) /* Prefer leaf pages over internal pages by this many increments of the read generation. */ #define WT_EVICT_WALK_PER_FILE 10 /* Pages to visit per file */ #define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */ #define WT_EVICT_WALK_INCR 100 /* Pages added each walk */ -#define WT_EVICT_PASS_ALL 0x01 -#define WT_EVICT_PASS_DIRTY 0x02 -#define WT_EVICT_PASS_INTERNAL 0x04 +#define WT_EVICT_PASS_AGGRESSIVE 0x01 +#define WT_EVICT_PASS_ALL 0x02 +#define WT_EVICT_PASS_DIRTY 0x04 +#define WT_EVICT_PASS_INTERNAL 0x08 /* * WT_EVICT_ENTRY -- diff --git a/src/include/config.h b/src/include/config.h index d8837f0f368..c83d96c8a5e 100644 --- a/src/include/config.h +++ b/src/include/config.h @@ -33,6 +33,14 @@ struct __wt_config_entry { const WT_CONFIG_CHECK *checks; /* check array */ }; +struct __wt_config_parser_impl { + WT_CONFIG_PARSER iface; + + WT_SESSION_IMPL *session; + WT_CONFIG config; + WT_CONFIG_ITEM config_item; +}; + /* * DO NOT EDIT: automatically built by dist/api_config.py. 
* configuration section: BEGIN diff --git a/src/include/extern.h b/src/include/extern.h index d1662717345..ba026253a5b 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -126,6 +126,7 @@ extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, + int readonly, uint32_t allocsize, WT_BM **bmp); extern int __wt_block_manager_truncate( WT_SESSION_IMPL *session, @@ -138,6 +139,7 @@ extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, + int readonly, uint32_t allocsize, WT_BLOCK **blockp); extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block); @@ -529,27 +531,16 @@ extern int __wt_config_concat( WT_SESSION_IMPL *session, const char **config_ret); extern int __wt_conn_config_init(WT_SESSION_IMPL *session); extern void __wt_conn_config_discard(WT_SESSION_IMPL *session); +extern int __wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, + WT_SESSION *wt_session, + const char *config, + size_t len, + WT_CONFIG_PARSER **config_parserp); extern int __wt_ext_config_get(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key, WT_CONFIG_ITEM *cval); -extern int __wt_ext_config_strget(WT_EXTENSION_API *wt_api, - WT_SESSION *wt_session, - const char *config, - const char *key, - WT_CONFIG_ITEM *cval); -extern int __wt_ext_config_scan_begin( WT_EXTENSION_API *wt_api, - WT_SESSION *wt_session, - const char *str, - size_t len, - WT_CONFIG_SCAN **scanp); -extern int __wt_ext_config_scan_end(WT_EXTENSION_API *wt_api, - WT_CONFIG_SCAN *scan); -extern int __wt_ext_config_scan_next( WT_EXTENSION_API *wt_api, - WT_CONFIG_SCAN *scan, - WT_CONFIG_ITEM *key, - WT_CONFIG_ITEM *value); extern int __wt_collator_config( WT_SESSION_IMPL *session, const char **cfg, WT_COLLATOR **collatorp); diff --git a/src/include/flags.h b/src/include/flags.h index 31eed83e351..89f2450f3af 100644 --- a/src/include/flags.h +++ 
b/src/include/flags.h @@ -10,6 +10,7 @@ #define WT_CONN_PANIC 0x00000002 #define WT_CONN_SERVER_RUN 0x00000001 #define WT_EVICTION_SERVER_LOCKED 0x00000004 +#define WT_FILE_TYPE_CHECKPOINT 0x00000004 #define WT_FILE_TYPE_DATA 0x00000002 #define WT_FILE_TYPE_LOG 0x00000001 #define WT_LOGSCAN_FIRST 0x00000008 diff --git a/src/include/lsm.h b/src/include/lsm.h index 4d3b14c5f74..b5c3859605c 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -30,6 +30,8 @@ struct __wt_cursor_lsm { uint64_t *txnid_max; /* Maximum txn for each chunk */ size_t txnid_alloc; + u_int update_count; /* Updates performed. */ + #define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */ #define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */ #define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */ diff --git a/src/include/stat.h b/src/include/stat.h index 6717b4d081f..ea2a4068f96 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -182,6 +182,8 @@ struct __wt_connection_stats { WT_STATS log_slot_transitions; WT_STATS log_sync; WT_STATS log_writes; + WT_STATS lsm_checkpoint_throttle; + WT_STATS lsm_merge_throttle; WT_STATS lsm_rows_merged; WT_STATS memory_allocation; WT_STATS memory_free; @@ -275,9 +277,11 @@ struct __wt_dsrc_stats { WT_STATS cursor_search_near; WT_STATS cursor_update; WT_STATS cursor_update_bytes; + WT_STATS lsm_checkpoint_throttle; WT_STATS lsm_chunk_count; WT_STATS lsm_generation_max; WT_STATS lsm_lookup_no_bloom; + WT_STATS lsm_merge_throttle; WT_STATS rec_dictionary; WT_STATS rec_overflow_key_internal; WT_STATS rec_overflow_key_leaf; diff --git a/src/include/txn.h b/src/include/txn.h index e71b693928b..17e2c0c632e 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -131,6 +131,3 @@ struct __wt_txn { #define TXN_RUNNING 0x08 uint32_t flags; }; - -#define WT_TXN_ACTIVE(txn) \ - (F_ISSET((txn), TXN_RUNNING) && (txn)->mod_count > 0) diff --git a/src/include/txn.i b/src/include/txn.i index cdfe697ee51..2543d5ff21f 100644 --- a/src/include/txn.i +++ 
b/src/include/txn.i @@ -166,20 +166,42 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) /* * __wt_txn_read_skip -- - * Get the first visible update in a list (or NULL if none are visible), - * and report whether uncommitted changes were skipped. + * Get the first visible update in a list (or NULL if none are visible). + * Report the maximum transaction ID in the list and whether any updates + * were skipped to find the visible update. */ static inline WT_UPDATE * -__wt_txn_read_skip(WT_SESSION_IMPL *session, WT_UPDATE *upd, int *skipp) +__wt_txn_read_skip( + WT_SESSION_IMPL *session, WT_UPDATE *upd, uint64_t *max_txn, int *skipp) { + WT_UPDATE *first_upd; + + /* + * Track the largest transaction ID on this page. We store this in the + * page at the end of reconciliation if no updates are skipped. It is + * used to avoid evicting a clean page from memory with changes that + * are required to satisfy a snapshot read. + * + * Record whether any updates were skipped on the way to finding the + * first visible update. That determines whether a future read with no + * intervening modifications to the page could see a different value. + * If not, the page can safely be marked clean, and does not need to be + * reconciled until it is modified again. 
+ */ *skipp = 0; - while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) { - if (upd->txnid != WT_TXN_ABORTED) - *skipp = 1; - upd = upd->next; - } + for (first_upd = NULL; upd != NULL; upd = upd->next) + if (upd->txnid != WT_TXN_ABORTED) { + if (TXNID_LT(*max_txn, upd->txnid)) + *max_txn = upd->txnid; + if (first_upd == NULL) { + if (__wt_txn_visible(session, upd->txnid)) + first_upd = upd; + else + *skipp = 1; + } + } - return (upd); + return (first_upd); } /* diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 4c7ee4917e9..6334bca2061 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -71,6 +71,9 @@ extern "C" { *******************************************/ struct __wt_collator; typedef struct __wt_collator WT_COLLATOR; struct __wt_compressor; typedef struct __wt_compressor WT_COMPRESSOR; +struct __wt_config_item; typedef struct __wt_config_item WT_CONFIG_ITEM; +struct __wt_config_parser; + typedef struct __wt_config_parser WT_CONFIG_PARSER; struct __wt_connection; typedef struct __wt_connection WT_CONNECTION; struct __wt_cursor; typedef struct __wt_cursor WT_CURSOR; struct __wt_data_source; typedef struct __wt_data_source WT_DATA_SOURCE; @@ -1490,8 +1493,12 @@ struct __wt_connection { * @config{direct_io, Use \c O_DIRECT to access files. Options are given as a * list\, such as <code>"direct_io=[data]"</code>. Configuring \c direct_io * requires care\, see @ref tuning_system_buffer_cache_direct_io for important - * warnings., a list\, with values chosen from the following options: \c - * "data"\, \c "log"; default empty.} + * warnings. 
Including \c "data" will cause WiredTiger data files to use \c + * O_DIRECT\, including \c "log" will cause WiredTiger log files to use \c + * O_DIRECT\, and including \c "checkpoint" will cause WiredTiger data files + * opened at a checkpoint (i.e: read only) to use \c O_DIRECT., a list\, with + * values chosen from the following options: \c "checkpoint"\, \c "data"\, \c + * "log"; default empty.} * @config{error_prefix, prefix string for error messages., a string; default * empty.} * @config{eviction_dirty_target, continue evicting until the cache has less @@ -1890,6 +1897,153 @@ int wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp); */ int wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up); +/*! + * @name Configuration string parsing + * @{ + */ + +/*! + * The configuration information returned by the WiredTiger configuration + * parsing functions in the WT_EXTENSION_API and the public API. + */ +struct __wt_config_item { + /*! + * The value of a configuration string. + * + * Regardless of the type of the configuration string (boolean, int, + * list or string), the \c str field will reference the value of the + * configuration string. + * + * The bytes referenced by \c str are <b>not</b> nul-terminated, + * use the \c len field instead of a terminating nul byte. + */ + const char *str; + + /*! The number of bytes in the value referenced by \c str. */ + size_t len; + + /*! + * The value of a configuration boolean or integer. + * + * If the configuration string's value is "true" or "false", the + * \c val field will be set to 1 (true), or 0 (false). + * + * If the configuration string can be legally interpreted as an integer, + * using the strtoll function rules as specified in ISO/IEC 9899:1990 + * ("ISO C90"), that integer will be stored in the \c val field. + */ + int64_t val; + + /*! Permitted values of the \c type field. */ + enum { + /*! A string value with quotes stripped. */ + WT_CONFIG_ITEM_STRING, + /*! 
A boolean literal ("true" or "false"). */ + WT_CONFIG_ITEM_BOOL, + /*! An unquoted identifier: a string value without quotes. */ + WT_CONFIG_ITEM_ID, + /*! A numeric value. */ + WT_CONFIG_ITEM_NUM, + /*! A nested structure or list, including brackets. */ + WT_CONFIG_ITEM_STRUCT + } + /*! + * The type of value determined by the parser. In all cases, + * the \c str and \c len fields are set. + */ + type; +}; + +/*! + * Create a handle that can be used to parse or create configuration strings + * compatible with WiredTiger APIs. + * This API is outside the scope of a WiredTiger connection handle, since + * applications may need to generate configuration strings prior to calling + * ::wiredtiger_open. + * @param session the session handle to be used for error reporting. If NULL + * error messages will be written to stdout. + * @param config the configuration string being parsed. The string must + * remain valid for the lifetime of the parser handle. + * @param len the number of valid bytes in \c config + * @param[out] config_parserp A pointer to the newly opened handle + * @errors + */ +int wiredtiger_config_parser_open(WT_SESSION *session, + const char *config, size_t len, WT_CONFIG_PARSER **config_parserp); + +/*! + * A handle that can be used to search and traverse configuration strings + * compatible with WiredTiger APIs. + * To parse the contents of a list or nested configuration string use a new + * configuration parser handle based on the content of the ::WT_CONFIG_ITEM + * retrieved from the parent configuration string. 
+ * + * @section config_parse_examples Configuration String Parsing examples + * + * This could be used in C to create a configuration parser as follows: + * + * @snippet ex_config_parse.c Create a configuration parser + * + * Once the parser has been created the content can be queried directly: + * + * @snippet ex_config_parse.c get + * + * Or the content can be traversed linearly: + * + * @snippet ex_config_parse.c next + * + * Nested configuration values can be queried using a shorthand notation: + * + * @snippet ex_config_parse.c nested get + * + * Nested configuration values can be traversed using multiple + * ::WT_CONFIG_PARSER handles: + * + * @snippet ex_config_parse.c nested traverse + */ +struct __wt_config_parser { + + /*! + * Close the configuration scanner releasing any resources. + * + * @param config_parser the configuration parser handle + * @errors + * + */ + int __F(close)(WT_CONFIG_PARSER *config_parser); + + /*! + * Return the next key/value pair. + * + * When iteration would pass the end of the configuration string + * ::WT_NOTFOUND will be returned. + * + * If an item has no explicitly assigned value, the item will be + * returned in \c key and the \c value will be set to the boolean + * \c "true" value. + * + * @param config_parser the configuration parser handle + * @param key the returned key + * @param value the returned value + * @errors + * + */ + int __F(next)(WT_CONFIG_PARSER *config_parser, + WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value); + + /*! + * Return the value of an item in the configuration string. + * + * @param config_parser the configuration parser handle + * @param key configuration key string + * @param value the returned value + * @errors + * + */ + int __F(get)(WT_CONFIG_PARSER *config_parser, + const char *key, WT_CONFIG_ITEM *value); +}; + #endif /* !defined(SWIG) */ /*! 
* @} @@ -1985,9 +2139,7 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); /******************************************* * Forward structure declarations for the extension API *******************************************/ -struct __wt_config_arg; typedef struct __wt_config_arg WT_CONFIG_ARG; -struct __wt_config_item; typedef struct __wt_config_item WT_CONFIG_ITEM; -struct __wt_config_scan; typedef struct __wt_config_scan WT_CONFIG_SCAN; +struct __wt_config_arg; typedef struct __wt_config_arg WT_CONFIG_ARG; /*! * The interface implemented by applications to provide custom ordering of @@ -2588,42 +2740,46 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOG_SYNC 1063 /*! log: log write operations */ #define WT_STAT_CONN_LOG_WRITES 1064 +/*! sleep for LSM checkpoint throttle */ +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1065 +/*! sleep for LSM merge throttle */ +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1066 /*! rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1065 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1067 /*! memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1066 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1068 /*! memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1067 +#define WT_STAT_CONN_MEMORY_FREE 1069 /*! memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1068 +#define WT_STAT_CONN_MEMORY_GROW 1070 /*! total read I/Os */ -#define WT_STAT_CONN_READ_IO 1069 +#define WT_STAT_CONN_READ_IO 1071 /*! page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1070 +#define WT_STAT_CONN_REC_PAGES 1072 /*! page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1071 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1073 /*! reconciliation failed because an update could not be included */ -#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1072 +#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1074 /*! 
pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1073 +#define WT_STAT_CONN_RWLOCK_READ 1075 /*! pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1074 +#define WT_STAT_CONN_RWLOCK_WRITE 1076 /*! open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1075 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1077 /*! transactions */ -#define WT_STAT_CONN_TXN_BEGIN 1076 +#define WT_STAT_CONN_TXN_BEGIN 1078 /*! transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1077 +#define WT_STAT_CONN_TXN_CHECKPOINT 1079 /*! transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1078 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1080 /*! transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1079 +#define WT_STAT_CONN_TXN_COMMIT 1081 /*! transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1080 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1082 /*! transactions rolled-back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1081 +#define WT_STAT_CONN_TXN_ROLLBACK 1083 /*! total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1082 +#define WT_STAT_CONN_WRITE_IO 1084 /*! * @} @@ -2767,43 +2923,47 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CURSOR_UPDATE 2066 /*! cursor-update value bytes updated */ #define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2067 +/*! sleep for LSM checkpoint throttle */ +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2068 /*! chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2068 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2069 /*! highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2069 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2070 /*! queries that could have benefited from a Bloom filter that did not * exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2070 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2071 +/*! 
sleep for LSM merge throttle */ +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2072 /*! reconciliation dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2071 +#define WT_STAT_DSRC_REC_DICTIONARY 2073 /*! reconciliation internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2072 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2074 /*! reconciliation leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2073 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2075 /*! reconciliation overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2074 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2076 /*! reconciliation pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2075 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2077 /*! reconciliation pages merged */ -#define WT_STAT_DSRC_REC_PAGE_MERGE 2076 +#define WT_STAT_DSRC_REC_PAGE_MERGE 2078 /*! page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2077 +#define WT_STAT_DSRC_REC_PAGES 2079 /*! page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2078 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2080 /*! reconciliation failed because an update could not be included */ -#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 2079 +#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 2081 /*! reconciliation internal pages split */ -#define WT_STAT_DSRC_REC_SPLIT_INTERNAL 2080 +#define WT_STAT_DSRC_REC_SPLIT_INTERNAL 2082 /*! reconciliation leaf pages split */ -#define WT_STAT_DSRC_REC_SPLIT_LEAF 2081 +#define WT_STAT_DSRC_REC_SPLIT_LEAF 2083 /*! reconciliation maximum splits for a page */ -#define WT_STAT_DSRC_REC_SPLIT_MAX 2082 +#define WT_STAT_DSRC_REC_SPLIT_MAX 2084 /*! object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2083 +#define WT_STAT_DSRC_SESSION_COMPACT 2085 /*! open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2084 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2086 /*! 
update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2085 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2087 /*! @} */ /* * Statistics section: END diff --git a/src/include/wiredtiger_ext.h b/src/include/wiredtiger_ext.h index 88ddecdade1..acf7efad3d9 100644 --- a/src/include/wiredtiger_ext.h +++ b/src/include/wiredtiger_ext.h @@ -184,6 +184,12 @@ struct __wt_extension_api { WT_ITEM *first, WT_ITEM *second, int *cmp); /*! + * @copydoc wiredtiger_config_parser_open + */ + int (*config_parser_open)(WT_EXTENSION_API *wt_api, WT_SESSION *session, + const char *config, size_t len, WT_CONFIG_PARSER **config_parserp); + + /*! * Return the value of a configuration string. * * @param wt_api the extension handle @@ -199,69 +205,6 @@ struct __wt_extension_api { WT_CONFIG_ARG *config, const char *key, WT_CONFIG_ITEM *value); /*! - * Return the value of a configuration string. - * - * @param wt_api the extension handle - * @param session the session handle (or NULL if none available) - * @param config a configuration string - * @param key configuration key string - * @param value the returned value - * @errors - * - * @snippet ex_data_source.c WT_EXTENSION config_strget - */ - int (*config_strget)(WT_EXTENSION_API *wt_api, WT_SESSION *session, - const char *config, const char *key, WT_CONFIG_ITEM *value); - - /*! - * Return the list entries of a configuration string value. - * This method steps through the entries found in the last returned - * value from WT_EXTENSION_API::config_get. The last returned value - * should be of type "list". 
- * - * @param wt_api the extension handle - * @param session the session handle (or NULL if none available) - * @param str the configuration string to scan - * @param len the number of valid bytes in \c str - * @param[out] scanp a handle used to scan the config string - * @errors - * - * @snippet ex_data_source.c WT_EXTENSION config scan - */ - int (*config_scan_begin)(WT_EXTENSION_API *wt_api, WT_SESSION *session, - const char *str, size_t len, WT_CONFIG_SCAN **scanp); - - /*! - * Release any resources allocated by - * WT_EXTENSION_API::config_scan_begin. - * - * @param wt_api the extension handle - * @param scan the configuration scanner, invalid after this call - * @errors - * - * @snippet ex_data_source.c WT_EXTENSION config scan - */ - int (*config_scan_end)(WT_EXTENSION_API *wt_api, WT_CONFIG_SCAN *scan); - - /*! - * Return the next key/value pair from a config string scan. - * - * If the string contains a list of items with no assigned value, the - * items will be returned in \c key and the \c value will be set to the - * boolean \c "true" value. - * - * @param wt_api the extension handle - * @param scan the configuration scanner - * @param key the returned key - * @param value the returned value - * @errors - * - * @snippet ex_data_source.c WT_EXTENSION config scan - */ - int (*config_scan_next)(WT_EXTENSION_API *wt_api, - WT_CONFIG_SCAN *scan, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value); - - /*! * Insert a row into the metadata if it does not already exist. * * @param wt_api the extension handle @@ -428,75 +371,19 @@ struct __wt_extension_api { */ int (*transaction_visible)(WT_EXTENSION_API *wt_api, WT_SESSION *session, uint64_t transaction_id); -}; - -/*! - * @typedef WT_CONFIG_ARG - * - * A configuration object passed to some extension interfaces. This is an - * opaque type: configuration values can be queried using - * WT_EXTENSION_API::config_get. - */ -/*! 
- * The configuration information returned by the WiredTiger extension function - * WT_EXTENSION_API::config_get. - */ -struct __wt_config_item { /*! - * The value of a configuration string. - * - * Regardless of the type of the configuration string (boolean, int, - * list or string), the \c str field will reference the value of the - * configuration string. - * - * The bytes referenced by \c str are <b>not</b> be nul-terminated, - * use the \c len field instead of a terminating nul byte. + * @copydoc wiredtiger_version */ - const char *str; - - /*! The number of bytes in the value referenced by \c str. */ - size_t len; - - /*! - * The value of a configuration boolean or integer. - * - * If the configuration string's value is "true" or "false", the - * \c val field will be set to 1 (true), or 0 (false). - * - * If the configuration string can be legally interpreted as an integer, - * using the strtoll function rules as specified in ISO/IEC 9899:1990 - * ("ISO C90"), that integer will be stored in the \c val field. - */ - int64_t val; - - /*! Permitted values of the \c type field. */ - enum { - /*! A string value with quotes stripped. */ - WT_CONFIG_ITEM_STRING, - /*! A boolean literal ("true" or "false"). */ - WT_CONFIG_ITEM_BOOL, - /*! An unquoted identifier: a string value without quotes. */ - WT_CONFIG_ITEM_ID, - /*! A numeric value. */ - WT_CONFIG_ITEM_NUM, - /*! A nested structure or list, including brackets. */ - WT_CONFIG_ITEM_STRUCT - } - /*! - * The type of value determined by the parser. In all cases, - * the \c str and \c len fields are set. - */ - type; + const char *(*version)(int *majorp, int *minorp, int *patchp); }; /*! - * @typedef WT_CONFIG_SCAN + * @typedef WT_CONFIG_ARG * - * A handle for a scan through a configuration string. - * This is an opaque type returned by WT_EXTENSION_API::config_scan_begin. - * Configuration values can be queried using WT_EXTENSION_API::config_scan_next. 
- * Call WT_EXTENSION_API::config_scan_end when finished to release resources. + * A configuration object passed to some extension interfaces. This is an + * opaque type: configuration values can be queried using + * WT_EXTENSION_API::config_get */ /*! @} */ diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index 3ae89fc211e..19a805aa25f 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -93,6 +93,8 @@ struct __wt_config_check; typedef struct __wt_config_check WT_CONFIG_CHECK; struct __wt_config_entry; typedef struct __wt_config_entry WT_CONFIG_ENTRY; +struct __wt_config_parser_impl; + typedef struct __wt_config_parser_impl WT_CONFIG_PARSER_IMPL; struct __wt_connection_impl; typedef struct __wt_connection_impl WT_CONNECTION_IMPL; struct __wt_connection_stats; diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 618257469ee..c1824df24ac 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -17,15 +17,15 @@ /* * LSM API enter: check that the cursor is in sync with the tree. */ -#define WT_LSM_ENTER(clsm, cursor, session, n) \ +#define WT_LSM_ENTER(clsm, cursor, session, n, reset) \ clsm = (WT_CURSOR_LSM *)cursor; \ CURSOR_API_CALL(cursor, session, n, NULL); \ - WT_ERR(__clsm_enter(clsm, 0)) + WT_ERR(__clsm_enter(clsm, reset, 0)) #define WT_LSM_UPDATE_ENTER(clsm, cursor, session, n) \ clsm = (WT_CURSOR_LSM *)cursor; \ CURSOR_UPDATE_API_CALL(cursor, session, n, NULL); \ - WT_ERR(__clsm_enter(clsm, 1)) + WT_ERR(__clsm_enter(clsm, 0, 1)) #define WT_LSM_LEAVE(session) \ API_END(session); \ @@ -35,16 +35,18 @@ CURSOR_UPDATE_API_END(session, ret); \ WT_TRET(__clsm_leave(clsm)) -static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t); static int __clsm_lookup(WT_CURSOR_LSM *); +static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t); +static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *); /* * __clsm_enter -- * Start an operation on an LSM cursor, update if the tree has changed. 
*/ static inline int -__clsm_enter(WT_CURSOR_LSM *clsm, int update) +__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) { + WT_CURSOR *c; WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_SESSION_IMPL *session; @@ -57,6 +59,20 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int update) if (F_ISSET(clsm, WT_CLSM_MERGE)) return (0); + if (reset) { + c = &clsm->iface; + /* Copy out data before resetting chunk cursors. */ + if (F_ISSET(c, WT_CURSTD_KEY_INT) && + !WT_DATA_IN_ITEM(&c->key)) + WT_RET(__wt_buf_set( + session, &c->key, c->key.data, c->key.size)); + if (F_ISSET(c, WT_CURSTD_VALUE_INT) && + !WT_DATA_IN_ITEM(&c->value)) + WT_RET(__wt_buf_set( + session, &c->value, c->value.data, c->value.size)); + WT_RET(__clsm_reset_cursors(clsm, NULL)); + } + for (;;) { /* * If the cursor looks up-to-date, check if the cache is full. @@ -369,7 +385,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * generation number and retry if it has changed under us. */ if (clsm->cursors != NULL && (ngood < clsm->nchunks || - (F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) { + (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) { saved_gen = lsm_tree->dsk_gen; locked = 0; WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); @@ -607,7 +623,7 @@ __clsm_next(WT_CURSOR *cursor) u_int i; int check, cmp, deleted; - WT_LSM_ENTER(clsm, cursor, session, next); + WT_LSM_ENTER(clsm, cursor, session, next, 0); /* If we aren't positioned for a forward scan, get started. */ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) { @@ -689,7 +705,7 @@ __clsm_prev(WT_CURSOR *cursor) u_int i; int check, cmp, deleted; - WT_LSM_ENTER(clsm, cursor, session, prev); + WT_LSM_ENTER(clsm, cursor, session, prev, 0); /* If we aren't positioned for a reverse scan, get started. 
*/ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) { @@ -875,13 +891,14 @@ __clsm_lookup(WT_CURSOR_LSM *clsm) } WT_ERR(WT_NOTFOUND); -done: WT_TRET(__clsm_reset_cursors(clsm, c)); +done: err: if (ret == 0) { clsm->current = c; F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } else { - WT_TRET(__clsm_reset_cursors(clsm, NULL)); + if (c != NULL) + WT_TRET(c->reset(c)); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); } @@ -899,7 +916,7 @@ __clsm_search(WT_CURSOR *cursor) WT_DECL_RET; WT_SESSION_IMPL *session; - WT_LSM_ENTER(clsm, cursor, session, search); + WT_LSM_ENTER(clsm, cursor, session, search, 1); WT_CURSOR_NEEDKEY(cursor); ret = __clsm_lookup(clsm); @@ -925,13 +942,10 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) larger = smaller = NULL; - WT_LSM_ENTER(clsm, cursor, session, search_near); + WT_LSM_ENTER(clsm, cursor, session, search_near, 1); WT_CURSOR_NEEDKEY(cursor); F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); - /* Reset any positioned cursor(s) to release pinned resources. */ - WT_ERR(__clsm_reset_cursors(clsm, NULL)); - /* * search_near is somewhat fiddly: we can't just return a nearby key * from the in-memory chunk because there could be a closer key on @@ -1111,13 +1125,29 @@ __clsm_put(WT_SESSION_IMPL *session, } /* - * The count is in a shared structure, but it's only approximate, so - * don't worry about protecting access. + * Update the record count. It is in a shared structure, but it's only + * approximate, so don't worry about protecting access. + * + * Throttle if necessary. Every 100 update operations on each cursor, + * check if throttling is required. Don't rely only on the shared + * counter because it can race, and because for some workloads, there + * may not be enough records per chunk to get effective throttling. 
*/ - if (++clsm->primary_chunk->count % 100 == 0 && - lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) + if ((++clsm->primary_chunk->count % 100 == 0 || + ++clsm->update_count >= 100) && + lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) { + clsm->update_count = 0; + WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats, + lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle); + WT_STAT_FAST_CONN_INCRV(session, + lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle); + WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats, + lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle); + WT_STAT_FAST_CONN_INCRV(session, + lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle); __wt_sleep(0, lsm_tree->ckpt_throttle + lsm_tree->merge_throttle); + } /* * In LSM there are multiple btrees active at one time. The tree diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 792adf0773f..1f70072b41f 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -82,7 +82,7 @@ __wt_lsm_merge( aggressive = 10; merge_min = (aggressive > 5) ? 2 : lsm_tree->merge_min; max_gap = (aggressive + 4) / 5; - max_level = (id == 0 ? 0 : id - 1) + aggressive; + max_level = (lsm_tree->merge_throttle > 0) ? 0 : id + aggressive; /* * If there aren't any chunks to merge, or some of the chunks aren't @@ -101,24 +101,19 @@ __wt_lsm_merge( WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1)); /* - * Only include chunks that already have a Bloom filter and not - * involved in a merge. + * Only include chunks that already have a Bloom filter or are the + * result of a merge and not involved in a merge. */ - end_chunk = lsm_tree->nchunks - 1; - while (end_chunk > 0 && - ((chunk = lsm_tree->chunk[end_chunk]) == NULL || - !F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) || - F_ISSET(chunk, WT_LSM_CHUNK_MERGING))) { - --end_chunk; - - /* - * If we find a chunk on disk without a Bloom filter, give up. 
- * We may have waited a while to lock the tree, and new chunks - * may have been created in the meantime. - */ - if (chunk != NULL && + for (end_chunk = lsm_tree->nchunks - 1; end_chunk > 0; --end_chunk) { + chunk = lsm_tree->chunk[end_chunk]; + WT_ASSERT(session, chunk != NULL); + if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING)) + continue; + if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) || chunk->generation > 0) + break; + else if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) - end_chunk = 0; + break; } /* @@ -150,8 +145,11 @@ __wt_lsm_merge( youngest = lsm_tree->chunk[end_chunk]; nchunks = (end_chunk + 1) - start_chunk; - /* If the chunk is already involved in a merge, stop. */ - if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING)) + /* + * If the chunk is already involved in a merge or a Bloom + * filter is being built for it, stop. + */ + if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING) || chunk->bloom_busy) break; /* @@ -163,10 +161,14 @@ __wt_lsm_merge( /* * If the size of the chunks selected so far exceeds the - * configured maximum chunk size, stop. + * configured maximum chunk size, stop. Keep going if we can + * slide the window further into the tree: we don't want to + * leave small chunks in the middle. */ if ((chunk_size += chunk->size) > lsm_tree->chunk_max) - break; + if (nchunks < merge_min || + chunk_size - youngest->size > lsm_tree->chunk_max) + break; /* * If we have enough chunks for a merge and the next chunk is @@ -184,7 +186,12 @@ __wt_lsm_merge( record_count += chunk->count; --start_chunk; - if (nchunks == lsm_tree->merge_max) { + /* + * If we have a full window, or the merge would be too big, + * remove the youngest chunk. 
+ */ + if (nchunks == lsm_tree->merge_max || + chunk_size > lsm_tree->chunk_max) { WT_ASSERT(session, F_ISSET(youngest, WT_LSM_CHUNK_MERGING)); F_CLR(youngest, WT_LSM_CHUNK_MERGING); diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index 30148de9a9f..a626032fe8a 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -39,7 +39,7 @@ __lsm_stat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst) "statistics=(%s%s%s)", cst->stat_clear ? "clear," : "", cst->stat_all ? "all," : "", - cst->stat_fast ? "fast," : ""); + !cst->stat_all && cst->stat_fast ? "fast," : ""); cfg[1] = disk_cfg[1] = config; } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index a830295908f..eb81acecaf4 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -692,9 +692,9 @@ __wt_lsm_tree_throttle( else if (!decrease_only) WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle); - /* Put an upper bound of 100ms on both throttle calculations. */ - lsm_tree->ckpt_throttle = WT_MIN(100000, lsm_tree->ckpt_throttle); - lsm_tree->merge_throttle = WT_MIN(100000, lsm_tree->merge_throttle); + /* Put an upper bound of 1s on both throttle calculations. */ + lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle); + lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle); /* * Update our estimate of how long each in-memory chunk stays active. @@ -1035,7 +1035,13 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) WT_RET(__wt_seconds(session, &begin)); + /* + * Set the compacting flag and clear the current merge throttle + * setting, so that all merge threads look for merges at all levels of + * the tree. + */ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); + lsm_tree->merge_throttle = 0; /* Wake up the merge threads. 
*/ WT_RET(__wt_cond_signal(session, lsm_tree->work_cond)); diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index 8b939529b33..29f6ea31271 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -104,7 +104,7 @@ __wt_lsm_merge_worker(void *vargs) session = lsm_tree->worker_sessions[id]; __wt_free(session, args); - aggressive = stallms = 0; + aggressive = chunk_wait = stallms = 0; while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) { /* @@ -127,7 +127,7 @@ __wt_lsm_merge_worker(void *vargs) progress = 1; /* If we didn't create a Bloom filter, try to merge. */ - if (progress == 0 && + if ((id != 0 || progress == 0) && __wt_lsm_merge(session, lsm_tree, id, aggressive) == 0) progress = 1; @@ -185,7 +185,8 @@ err: __wt_err(session, ret, "LSM merge worker failed"); /* * __lsm_bloom_work -- - * Try to create a Bloom filter for the newest on-disk chunk. + * Try to create a Bloom filter for the newest on-disk chunk that doesn't + * have one. */ static int __lsm_bloom_work(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) @@ -216,10 +217,14 @@ __lsm_bloom_work(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) chunk->count == 0) continue; - /* See if we win the race to switch on the "busy" flag. */ + /* + * See if we win the race to switch on the "busy" flag and + * recheck that the chunk still needs a Bloom filter. + */ if (WT_ATOMIC_CAS(chunk->bloom_busy, 0, 1)) { - ret = __lsm_bloom_create( - session, lsm_tree, chunk, (u_int)i); + if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + ret = __lsm_bloom_create( + session, lsm_tree, chunk, (u_int)i); chunk->bloom_busy = 0; break; } diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index 6ef4caadd0a..c6938dad9fd 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -107,7 +107,8 @@ __wt_open(WT_SESSION_IMPL *session, #endif #ifdef O_NOATIME /* Avoid updating metadata for read-only workloads. 
*/ - if (dio_type == WT_FILE_TYPE_DATA) + if (dio_type == WT_FILE_TYPE_DATA || + dio_type == WT_FILE_TYPE_CHECKPOINT) f |= O_NOATIME; #endif @@ -157,7 +158,8 @@ __wt_open(WT_SESSION_IMPL *session, #if defined(HAVE_POSIX_FADVISE) /* Disable read-ahead on trees: it slows down random read workloads. */ - if (dio_type == WT_FILE_TYPE_DATA) + if (dio_type == WT_FILE_TYPE_DATA || + dio_type == WT_FILE_TYPE_CHECKPOINT) WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM)); #endif @@ -174,7 +176,8 @@ __wt_open(WT_SESSION_IMPL *session, WT_ERR(__wt_filesize(session, fh, &fh->size)); /* Configure file extension. */ - if (dio_type == WT_FILE_TYPE_DATA) + if (dio_type == WT_FILE_TYPE_DATA || + dio_type == WT_FILE_TYPE_CHECKPOINT) fh->extend_len = conn->data_extend_len; /* diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index 8d620198c09..8bd78ce0ac4 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -33,7 +33,8 @@ __wt_direct_io_size_check(WT_SESSION_IMPL *session, * if you configure direct I/O and then don't do I/O in alignments and * units of its happy place. */ - if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_DATA)) { + if (FLD_ISSET(conn->direct_io, + WT_FILE_TYPE_CHECKPOINT | WT_FILE_TYPE_DATA)) { align = (int64_t)conn->buffer_alignment; if (align != 0 && (cval.val < align || cval.val % align != 0)) WT_RET_MSG(session, EINVAL, diff --git a/src/support/scratch.c b/src/support/scratch.c index 4b342977372..1aae8649901 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -127,7 +127,8 @@ __wt_buf_set( /* Ensure the buffer is large enough. */ WT_RET(__wt_buf_initsize(session, buf, size)); - memcpy(buf->mem, data, size); + /* Copy, allowing for overlapping strings. 
*/ + memmove(buf->mem, data, size); return (0); } diff --git a/src/support/stat.c b/src/support/stat.c index 621c79220a4..c0caecbe606 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -93,11 +93,14 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats) stats->cursor_search_near.desc = "cursor search near calls"; stats->cursor_update.desc = "cursor update calls"; stats->cursor_update_bytes.desc = "cursor-update value bytes updated"; + stats->lsm_checkpoint_throttle.desc = + "sleep for LSM checkpoint throttle"; stats->lsm_chunk_count.desc = "chunks in the LSM tree"; stats->lsm_generation_max.desc = "highest merge generation in the LSM tree"; stats->lsm_lookup_no_bloom.desc = "queries that could have benefited from a Bloom filter that did not exist"; + stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle"; stats->rec_dictionary.desc = "reconciliation dictionary matches"; stats->rec_overflow_key_internal.desc = "reconciliation internal-page overflow keys"; @@ -194,9 +197,11 @@ __wt_stat_refresh_dsrc_stats(void *stats_arg) stats->cursor_search_near.v = 0; stats->cursor_update.v = 0; stats->cursor_update_bytes.v = 0; + stats->lsm_checkpoint_throttle.v = 0; stats->lsm_chunk_count.v = 0; stats->lsm_generation_max.v = 0; stats->lsm_lookup_no_bloom.v = 0; + stats->lsm_merge_throttle.v = 0; stats->rec_dictionary.v = 0; stats->rec_overflow_key_internal.v = 0; stats->rec_overflow_key_leaf.v = 0; @@ -280,9 +285,11 @@ __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent) p->cursor_search_near.v += c->cursor_search_near.v; p->cursor_update.v += c->cursor_update.v; p->cursor_update_bytes.v += c->cursor_update_bytes.v; + p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v; if (c->lsm_generation_max.v > p->lsm_generation_max.v) p->lsm_generation_max.v = c->lsm_generation_max.v; p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v; + p->lsm_merge_throttle.v += c->lsm_merge_throttle.v; p->rec_dictionary.v += c->rec_dictionary.v; 
p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v; p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v; @@ -389,6 +396,9 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "log: consolidated slot join transitions"; stats->log_sync.desc = "log: log sync operations"; stats->log_writes.desc = "log: log write operations"; + stats->lsm_checkpoint_throttle.desc = + "sleep for LSM checkpoint throttle"; + stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle"; stats->lsm_rows_merged.desc = "rows merged in an LSM tree"; stats->memory_allocation.desc = "memory allocations"; stats->memory_free.desc = "memory frees"; @@ -479,6 +489,8 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->log_slot_transitions.v = 0; stats->log_sync.v = 0; stats->log_writes.v = 0; + stats->lsm_checkpoint_throttle.v = 0; + stats->lsm_merge_throttle.v = 0; stats->lsm_rows_merged.v = 0; stats->memory_allocation.v = 0; stats->memory_free.v = 0; diff --git a/tools/wt_nvd3_util.py b/tools/wt_nvd3_util.py new file mode 100644 index 00000000000..6bf1396b0ff --- /dev/null +++ b/tools/wt_nvd3_util.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +from datetime import datetime +from nvd3 import lineChart + +# Add a multiChart type so we can overlay line graphs +class multiChart(lineChart): + def __init__(self, **kwargs): + lineChart.__init__(self, **kwargs) + + # Fix the axes + del self.axislist['yAxis'] + self.create_y_axis('yAxis1', format=kwargs.get('y_axis_format', '.02f')) + self.create_y_axis('yAxis2', format=kwargs.get('y_axis_format', '.02f')) + +TIMEFMT = "%b %d %H:%M:%S" + +thisyear = datetime.today().year +def parsetime(s): + return datetime.strptime(s, TIMEFMT).replace(year=thisyear) + diff --git a/tools/wtperf_graph.py b/tools/wtperf_graph.py index 8bd11433014..f45145cf801 100644 --- a/tools/wtperf_graph.py +++ b/tools/wtperf_graph.py @@ -33,22 +33,26 @@ from subprocess import call TIMEFMT = "%b %d %H:%M:%S" -def process_monitor(fname, ckptlist, opdict): +def process_monitor(fname, sfx, ckptlist, opdict): # Read the monitor file and figure out when a checkpoint was running. in_ckpt = 'N' ckptlist=[] + + ofname = 'monitor%s.png' % (sfx) # Monitor output format currently is: - # time,read,insert,update,ckpt,...latencies... + # time,totalsec,read,insert,update,ckpt,...latencies... ops = ('read', 'insert', 'update') - csvcol = (1, 2, 3) + csvcol = (2, 3, 4) with open(fname, 'r') as csvfile: reader = csv.reader(csvfile) for row in reader: + if row[0].lstrip().startswith('#'): + continue # Look for checkpoints and operations. 
- if row[4] != in_ckpt: + if row[5] != in_ckpt: ckptlist.append(row[0]) - in_ckpt = row[4] + in_ckpt = row[5] for op, col in zip(ops, csvcol): if row[col] != '0' and opdict[op] == 0: opdict[op] = 1 @@ -77,14 +81,14 @@ set yrange [0:]\n''' % { }) it = iter(ckptlist) for start, stop in zip(it, it): - of.write('set object rectangle from first \'' + start +\ - '\', graph 0 ' + ' to first \'' + stop +\ - '\', graph 1 fc rgb "gray" back\n') - of.write('set output "' + fname + '.png"\n') - of.write('plot "' + fname + '" using 1:($2/1000) title "Reads", "' +\ - fname + '" using 1:($3/1000) title "Inserts", "' +\ - fname + '" using 1:($4/1000) title "Updates", "' +\ - fname + '" using 1:(($2+$3+$4)/1000) title "Total"\n') + of.write("set object rectangle from first '%s',\ + graph 0 to first '%s',\ + graph 1 fc rgb \"gray\" back\n" % (start, stop)) + of.write('set output "%s"\n' % (ofname)) + of.write("""plot "{name}" using 1:($3/1000) title "Reads", \\ + "{name}" using 1:($4/1000) title "Inserts",\\ + "{name}" using 1:($5/1000) title "Updates" + """.format(name=fname)) of.close() call(["gnuplot", gcmd]) os.remove(gcmd) @@ -118,10 +122,12 @@ set yrange [1:]\n''' % { '\', graph 1 fc rgb "gray" back\n') ofname = name + sfx + '.latency1.png' of.write('set output "' + ofname + '"\n') - of.write('plot "' + fname + '" using 1:($' + repr(col_avg) +\ + of.write('plot "' +\ + fname + '" using 1:($' + repr(col_avg) +\ ') title "Average Latency", "' + fname +'" using 1:($' +\ repr(col_min) + ') title "Minimum Latency", "' +\ - fname + '" using 1:($' + repr(col_max) + ') title "Maximum Latency"\n') + fname + '" using 1:($' + repr(col_max) +\ + ') title "Maximum Latency"\n') of.close() call(["gnuplot", gcmd]) os.remove(gcmd) @@ -129,6 +135,9 @@ set yrange [1:]\n''' % { # Graph latency vs. % operations def plot_latency_percent(name, dirname, sfx, ckptlist): + lfile = os.path.join(dirname, 'latency.' + name) + if not os.path.exists(lfile): + return gcmd = "gnuplot." 
+ name + ".l2.cmd" of = open(gcmd, "w") of.write(''' @@ -147,7 +156,7 @@ set ylabel "%% operations" set yrange [0:]\n''') ofname = name + sfx + '.latency2.png' of.write('set output "' + ofname + '"\n') - of.write('plot "' + os.path.join(dirname, 'latency.' + name) + sfx +\ + of.write('plot "' + lfile + sfx +\ '" using (($2 * 100)/$4) title "' + name + '"\n') of.close() call(["gnuplot", gcmd]) @@ -156,6 +165,9 @@ set yrange [0:]\n''') # Graph latency vs. % operations (cumulative) def plot_latency_cumulative_percent(name, dirname, sfx, ckptlist): + lfile = os.path.join(dirname, 'latency.' + name) + if not os.path.exists(lfile): + return # Latency plot: cumulative operations vs. latency gcmd = "gnuplot." + name + ".l3.cmd" of = open(gcmd, "w") @@ -176,7 +188,7 @@ set yrange [0:]\n''' % { }) ofname = name + sfx + '.latency3.png' of.write('set output "' + ofname + '"\n') - of.write('plot "' + os.path.join(dirname, 'latency.' + name) + sfx +\ + of.write('plot "' + lfile + sfx +\ '" using 1:(($3 * 100)/$4) title "' + name + '"\n') of.close() call(["gnuplot", gcmd]) @@ -187,14 +199,14 @@ def process_file(fname): # NOTE: The operations below must be in this exact order to match # the operation latency output in the monitor file. opdict={'read':0, 'insert':0, 'update':0} - process_monitor(fname, ckptlist, opdict) # This assumes the monitor file has the string "monitor" # and any other (optional) characters in the filename are a suffix. 
sfx = os.path.basename(fname).replace('monitor','') dirname = os.path.dirname(fname) - column = 6 # average, minimum, maximum start in column 6 + process_monitor(fname, sfx, ckptlist, opdict) + column = 7 # average, minimum, maximum start in column 7 for k, v in opdict.items(): if v != 0: plot_latency_operation( diff --git a/tools/wtperf_stats.py b/tools/wtperf_stats.py new file mode 100644 index 00000000000..d9743055fff --- /dev/null +++ b/tools/wtperf_stats.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +# +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# + +import os, csv, operator +from time import mktime + +try: + from wt_nvd3_util import multiChart, parsetime +except ImportError: + print >>sys.stderr, "Could not import wt_nvd3_util.py, it should be\ + in the same directory as %s" % sys.argv[0] + sys.exit(-1) + +try: + from stat_data import no_scale_per_second_list +except ImportError: + print >>sys.stderr, "Could not import stat_data.py, it should be in\ + the same directory as %s" % sys.argv[0] + sys.exit(-1) + +# Fixup the names and values in a dictionary read in from a csv file. One +# field must be "#time" - which is used to calculate the interval. +# Input is a dictionary, output is a list of dictionaries with a single entry. +def munge_dict(values_dict, abstime): + sorted_values = sorted(values_dict, key=operator.itemgetter('#time')) + start_time = parsetime(sorted_values[0]['#time']) + + ret = [] + for v in sorted_values: + if abstime: + # Build the time series, milliseconds since the epoch + v['#time'] = int(mktime(parsetime(v['#time']).timetuple())) * 1000 + else: + # Build the time series as seconds since the start of the data + v['#time'] = (parsetime(v['#time']) - start_time).seconds + next_val = {} + for title, value in v.items(): + if title.find('uS') != -1: + title = title.replace('uS', 'ms') + value = float(value) / 1000 + if title == 'totalsec': + value = 0 + if title == 'checkpoints' and value == 'N': + value = 0 + elif title.find('time') != -1: + title = 'time' + elif title.find('latency') == -1 and \ + title.find('checkpoints') == -1: + title = title + ' (thousands)' + value = float(value) / 1000 + next_val[title] = value + ret.append(next_val) + + # After building the series, eliminate constants + d0 = ret[0] + for t0, v0 in d0.items(): + skip = True + for d in ret: + v = d[t0] + if v != v0: + skip = False + break + if skip: + for dicts in ret: + del dicts[t0] + + return ret + +def addPlotsToChart(chart, graph_data, wtstat_chart = False): + # Extract the times - they are the same for all 
lines. + times = [] + for v in graph_data: + times.append(v['time']) + + # Add a line to the graph for each field in the CSV file in alphabetical + # order, so the key is sorted. + for field in sorted(graph_data[0].keys()): + if field == 'time': + continue + # Split the latency and non-latency measurements onto different scales + axis = "1" + if not wtstat_chart and field.find('latency') == -1: + axis="2" + ydata = [] + for v in graph_data: + ydata.append(v[field]) + chart.add_serie(x=times, y=ydata, name=field, type="line", yaxis=axis) + +# Input parameters are a chart populated with WiredTiger statistics and +# the directory where the wtperf monitor file can be found. +def addPlotsToStatsChart(chart, dirname, abstime): + fname = os.path.join(dirname, 'monitor') + try: + with open(fname, 'rb') as csvfile: + reader = csv.DictReader(csvfile) + # Transform the data into something NVD3 can digest + graph_data = munge_dict(reader, abstime) + except IOError: + print >>sys.stderr, "Could not open wtperf monitor file." + sys.exit(-1) + addPlotsToChart(chart, graph_data, 1) + +def main(): + # Parse the command line + import argparse + + parser = argparse.ArgumentParser(description='Create graphs from WiredTiger statistics.') + parser.add_argument('--abstime', action='store_true', + help='use absolute time on the x axis') + parser.add_argument('--output', '-o', metavar='file', + default='wtperf_stats.html', help='HTML output file') + parser.add_argument('files', metavar='file', nargs='+', + help='input monitor file generated by WiredTiger wtperf application') + args = parser.parse_args() + + output_file = open(args.output, 'w') + + if len(args.files) != 1: + print 'Script currently only supports a single monitor file' + exit (1) + + chart_extra = {} + # Add in the x axis if the user wants time. 
+ if args.abstime: + chart_extra['x_axis_format'] = '%H:%M:%S' + + for f in args.files: + with open(f, 'rb') as csvfile: + reader = csv.DictReader(csvfile) + # Transform the data into something NVD3 can digest + graph_data = munge_dict(reader, args.abstime) + + chart = multiChart(name='wtperf', + height=450 + 10*len(graph_data[0].keys()), + resize=True, + x_is_date=args.abstime, + assets_directory='http://source.wiredtiger.com/graphs/', + **chart_extra) + + addPlotsToChart(chart, graph_data) + + chart.buildhtml() + output_file.write(chart.htmlcontent) + output_file.close() + +if __name__ == '__main__': + main() + diff --git a/tools/wtstats.py b/tools/wtstats.py index 766c49989b6..37b28e6c13a 100644 --- a/tools/wtstats.py +++ b/tools/wtstats.py @@ -28,38 +28,35 @@ import fileinput, os, re, shutil, sys, textwrap from collections import defaultdict -from datetime import datetime from time import mktime from subprocess import call try: from stat_data import no_scale_per_second_list except ImportError: - print >>sys.stderr, "Could not import stat_data.py, it should be in the same directory as %s" % sys.argv[0] + print >>sys.stderr, "Could not import stat_data.py, it should be\ + in the same directory as %s" % sys.argv[0] sys.exit(-1) try: - from nvd3 import lineChart, lineWithFocusChart + from wtperf_stats import addPlotsToStatsChart except ImportError: - print >>sys.stderr, "Could not import nvd3. Please install it *from source* (other versions may be missing features that we rely on). 
Run these commands: git clone https://github.com/areski/python-nvd3.git ; cd python-nvd3 ; sudo python setup.py install" + print >>sys.stderr, "Could not import wtperf_stats.py, it should be\ + in the same directory as %s" % sys.argv[0] sys.exit(-1) +try: + from wt_nvd3_util import multiChart, parsetime +except ImportError: + print >>sys.stderr, "Could not import wt_nvd3_util.py, it should be\ + in the same directory as %s" % sys.argv[0] + sys.exit(-1) -# Add a multiChart type so we can overlay line graphs -class multiChart(lineChart): - def __init__(self, **kwargs): - lineChart.__init__(self, **kwargs) - - # Fix the axes - del self.axislist['yAxis'] - self.create_y_axis('yAxis1', format=kwargs.get('y_axis_format', '.02f')) - self.create_y_axis('yAxis2', format=kwargs.get('y_axis_format', '.02f')) - -TIMEFMT = "%b %d %H:%M:%S" - -thisyear = datetime.today().year -def parsetime(s): - return datetime.strptime(s, TIMEFMT).replace(year=thisyear) +try: + from nvd3 import lineChart, lineWithFocusChart +except ImportError: + print >>sys.stderr, "Could not import nvd3. Please install it *from source* (other versions may be missing features that we rely on). Run these commands: git clone https://github.com/areski/python-nvd3.git ; cd python-nvd3 ; sudo python setup.py install" + sys.exit(-1) # Plot a set of entries for a title. 
def munge(title, values): @@ -93,7 +90,7 @@ def munge(title, values): # Parse the command line import argparse -parser = argparse.ArgumentParser(description='Create graphs from WiredTIger statistics.') +parser = argparse.ArgumentParser(description='Create graphs from WiredTiger statistics.') parser.add_argument('--abstime', action='store_true', help='use absolute time on the x axis') parser.add_argument('--focus', action='store_true', @@ -108,6 +105,8 @@ parser.add_argument('--output', '-o', metavar='file', default='wtstats.html', parser.add_argument('--right', '-R', metavar='regexp', type=re.compile, action='append', help='use the right axis for series with titles matching the specifed regexp') +parser.add_argument('--wtperf', '-w', action='store_true', + help='Plot wtperf statistics on the same graph') parser.add_argument('files', metavar='file', nargs='+', help='input files generated by WiredTiger statistics logging') args = parser.parse_args() @@ -211,6 +210,9 @@ for title, yaxis, ydata in results: chart.add_serie(x=xdata, y=(ydata.get(x, 0) for x in xdata), name=title, type="line", yaxis="2" if yaxis else "1") +if args.wtperf: + addPlotsToStatsChart(chart, os.path.dirname(args.files[0]), args.abstime) + chart.buildhtml() output_file.write(chart.htmlcontent) |